Commit edacaf70 authored by Dmitry I. Lyakh's avatar Dmitry I. Lyakh
Browse files

Finished CuQuantum executor backend integration, except final allreduce.

parent 037ce2d1
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ ISSUES:
  That is, the order of tensor operations across all participating
  processes must be consistent such that every encountered global
  tensor operation will receive the same tensor operand irrespective
  of the difference in the locally generated tensor name. Special
  of the difference in the locally generated tensor names. Special
  care needs to be taken in iterating over associative tensor containers,
 to ensure that the keys are consistent across all participating
  processes. For example, automatically generated tensor names
@@ -21,9 +21,7 @@ ISSUES:

BUGS:

- 32-bit integer MPI message chunking issue in the backend.

- Fix the bug(s) in the tensor order reduction mechanism in the TalshExecutor backend.
- Fix the bug(s) in the tensor order reduction mechanism in the TalshNodeExecutor backend.


FEATURES:
@@ -39,11 +37,13 @@ FEATURES:
  Contract replaced tensors, then replace the contracted
  tensor with a new tensor (sub)network.

- Implement the Renormalization procedure.

- Implement SAVE/LOAD API for TensorExpansion.

- Implement TensorNetwork slice computing Generator.

- Implement b-D procedure.
- Implement bl-D procedure.

- Implement conjugate gradient optimization procedure.

+9 −3
Original line number Diff line number Diff line
/** ExaTN::Numerics: General client header (free function API)
REVISION: 2021/10/30
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

/** Rationale:
 1. Vector space and subspace registration [spaces.hpp, space_register.hpp]:
@@ -1086,6 +1086,12 @@ inline std::shared_ptr<exatn::TensorNetwork> makeTensorNetwork(const std::string
// INTERNAL CONTROL API //
//////////////////////////

 /** Switches the computational backend used for tensor network execution:
     {"default","cuquantum"}. Only applies to tensor network execution.
     Delegates to NumServer::switchComputationalBackend, which aborts
     on an unknown backend name. **/
inline void switchComputationalBackend(const std::string & backend_name)
 {return numericalServer->switchComputationalBackend(backend_name);}


/** Resets the tensor contraction sequence optimizer that is invoked
    when evaluating tensor networks: {dummy,heuro,greed,metis}. **/
inline void resetContrSeqOptimizer(const std::string & optimizer_name)
+64 −7
Original line number Diff line number Diff line
/** ExaTN::Numerics: Numerical server
REVISION: 2021/12/10
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

#include "num_server.hpp"
#include "tensor_range.hpp"
@@ -89,7 +89,8 @@ NumServer::NumServer(const MPICommProxy & communicator,
                     const ParamConf & parameters,
                     const std::string & graph_executor_name,
                     const std::string & node_executor_name):
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), intra_comm_(communicator), validation_tracing_(false)
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), comp_backend_("default"),
 intra_comm_(communicator), validation_tracing_(false)
{
 int mpi_error = MPI_Comm_size(*(communicator.get<MPI_Comm>()),&num_processes_); assert(mpi_error == MPI_SUCCESS);
 mpi_error = MPI_Comm_rank(*(communicator.get<MPI_Comm>()),&process_rank_); assert(mpi_error == MPI_SUCCESS);
@@ -117,7 +118,8 @@ NumServer::NumServer(const MPICommProxy & communicator,
NumServer::NumServer(const ParamConf & parameters,
                     const std::string & graph_executor_name,
                     const std::string & node_executor_name):
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), validation_tracing_(false)
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), comp_backend_("default"),
 validation_tracing_(false)
{
 num_processes_ = 1; process_rank_ = 0; global_process_rank_ = 0;
 process_world_ = std::make_shared<ProcessGroup>(intra_comm_,num_processes_); //intra-communicator is empty here
@@ -194,6 +196,22 @@ void NumServer::reconfigureTensorRuntime(const ParamConf & parameters,
}
#endif

void NumServer::switchComputationalBackend(const std::string & backend_name)
{
 bool success = sync(); assert(success);
 if(backend_name == "default"){
  comp_backend_ = backend_name;
#ifdef CUQUANTUM
 }else if(backend_name == "cuquantum"){
  comp_backend_ = backend_name;
#endif
 }else{
  std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl;
  std::abort();
 }
 return;
}

void NumServer::resetContrSeqOptimizer(const std::string & optimizer_name, bool caching)
{
 contr_seq_optimizer_ = optimizer_name;
@@ -612,7 +630,7 @@ bool NumServer::submit(const ProcessGroup & process_group,
 //Determine parallel execution configuration:
 unsigned int local_rank; //local process rank within the process group
 if(!process_group.rankIsIn(process_rank_,&local_rank)) return true; //process is not in the group: Do nothing
 assert(network.isValid()); //debug
 //assert(network.isValid()); //debug
 unsigned int num_procs = process_group.getSize(); //number of executing processes
 assert(local_rank < num_procs);
 if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
@@ -883,6 +901,30 @@ bool NumServer::submit(const ProcessGroup & process_group,
/** Submits a whole tensor network for execution. When the "cuquantum"
    backend is active (and compiled in), the network is dispatched to the
    cuQuantum executor as a single unit and its execution handle is recorded
    for later synchronization; otherwise the call forwards to the regular
    per-operation submission path. Returns TRUE on successful submission.
    BUGFIX: the null-network guard is now checked before the cuQuantum
    branch, which previously dereferenced a null pointer. **/
bool NumServer::submit(const ProcessGroup & process_group,
                       std::shared_ptr<TensorNetwork> network)
{
 if(!network) return false; //guard: the branches below dereference the network pointer
#ifdef CUQUANTUM
 //Try execution via an alternative computational backend:
 if(comp_backend_ == "cuquantum"){
  //Determine parallel execution configuration:
  unsigned int local_rank; //local process rank within the process group
  if(!process_group.rankIsIn(process_rank_,&local_rank)) return true; //process is not in the group: Do nothing
  //assert(network->isValid()); //debug
  unsigned int num_procs = process_group.getSize(); //number of executing processes
  assert(local_rank < num_procs);
  if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
                            << "]: Submitting tensor network <" << network->getName() << "> (" << network->getTensor(0)->getName()
                            << ") for execution via cuQuantum by " << num_procs << " processes with memory limit "
                            << process_group.getMemoryLimitPerProcess() << " bytes" << std::endl << std::flush;
  if(logging_ > 0) network->printItFile(logfile_);
  //Submit the entire network to the cuQuantum-capable runtime:
  const auto exec_handle = tensor_rt_->submit(network,process_group.getMPICommProxy(),num_procs,local_rank);
  bool success = (exec_handle != 0);
  if(success){
   //Remember the execution handle keyed by the output-tensor hash (consumed by sync):
   auto res = tn_exec_handles_.emplace(std::make_pair(network->getTensor(0)->getTensorHash(),exec_handle));
   success = res.second;
   if(success && logging_ > 0) logfile_ << "Number of submitted networks via cuQuantum = 1" << std::endl << std::flush;
  }
  return success;
 }
#endif
 return submit(process_group,*network); //default backend: per-operation submission
}
@@ -1030,6 +1072,14 @@ bool NumServer::sync(const ProcessGroup & process_group, const Tensor & tensor,
{
 bool success = true;
 if(!process_group.rankIsIn(process_rank_)) return success; //process is not in the group: Do nothing
#ifdef CUQUANTUM
 if(comp_backend_ == "cuquantum"){
  auto iter = tn_exec_handles_.find(tensor.getTensorHash());
  bool synced = (iter == tn_exec_handles_.end());
  if(!synced) synced = tensor_rt_->syncNetwork(iter->second,wait);
  return synced;
 }
#endif
 auto iter = tensors_.find(tensor.getName());
 if(iter != tensors_.end()){
  if(iter->second->isComposite()){
@@ -1081,7 +1131,11 @@ bool NumServer::sync(const ProcessGroup & process_group, TensorNetwork & network

/** Synchronizes all outstanding tensor operations for the current process group.
    With the cuQuantum backend, a successful full sync also discards the
    recorded tensor network execution handles (they are no longer needed). **/
bool NumServer::sync(bool wait)
{
 return sync(getCurrentProcessGroup(),wait); //NOTE(review): pre-change line left over from the diff rendering; it makes the lines below unreachable — confirm against the committed file
 bool success = sync(getCurrentProcessGroup(),wait);
#ifdef CUQUANTUM
 //Completed cuQuantum network execution handles can be forgotten after a successful sync:
 if(comp_backend_ == "cuquantum" && success) tn_exec_handles_.clear();
#endif
 return success;
}

bool NumServer::sync(const ProcessGroup & process_group, bool wait)
@@ -1092,6 +1146,9 @@ bool NumServer::sync(const ProcessGroup & process_group, bool wait)
 if(success){
  if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
   << "]: Locally synchronized all operations" << std::endl << std::flush;
#ifdef CUQUANTUM
  if(comp_backend_ == "cuquantum") tn_exec_handles_.clear();
#endif
#ifdef MPI_ENABLED
  if(wait){
   auto errc = MPI_Barrier(process_group.getMPICommProxy().getRef<MPI_Comm>());
+19 −3
Original line number Diff line number Diff line
/** ExaTN::Numerics: Numerical server
REVISION: 2021/12/22
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

/** Rationale:
 (a) Numerical server provides basic tensor network processing functionality:
@@ -270,6 +270,9 @@ public:
                               const std::string & node_executor_name);
#endif

 /** Switches the computational backend. **/
 void switchComputationalBackend(const std::string & backend_name);

 /** Resets the tensor contraction sequence optimizer that is
     invoked when evaluating tensor networks. **/
 void resetContrSeqOptimizer(const std::string & optimizer_name, //in: tensor contraction sequence optimizer name
@@ -1032,25 +1035,38 @@ protected:

private:

 //Spaces:
 std::shared_ptr<numerics::SpaceRegister> space_register_; //register of vector spaces and their named subspaces
 std::unordered_map<std::string,SpaceId> subname2id_; //maps a subspace name to its parental vector space id

 //Tensors:
 std::unordered_map<std::string,std::shared_ptr<Tensor>> tensors_; //registered tensors (by CREATE operation)
 std::map<std::string,std::shared_ptr<Tensor>> implicit_tensors_; //tensors created implicitly by the runtime (for garbage collection)
 std::unordered_map<std::string,ProcessGroup> tensor_comms_; //process group associated with each tensor

#ifdef CUQUANTUM
 //Tensor network execution handles:
 std::unordered_map<numerics::TensorHashType,runtime::TensorOpExecHandle> tn_exec_handles_;
#endif

 //Contraction path optimizer:
 std::string contr_seq_optimizer_; //tensor contraction sequence optimizer invoked when evaluating tensor networks
 bool contr_seq_caching_; //regulates whether or not to cache pseudo-optimal tensor contraction orders for later reuse

 //Registered external methods and data:
 std::map<std::string,std::shared_ptr<TensorMethod>> ext_methods_; //external tensor methods
 std::map<std::string,std::shared_ptr<BytePacket>> ext_data_; //external data

 //Program scopes:
 std::stack<std::pair<std::string,ScopeId>> scopes_; //TAProL scope stack: {Scope name, Scope Id}

 //Tensor operation factory:
 TensorOpFactory * tensor_op_factory_; //tensor operation factory (non-owning pointer)

 //Configuration:
 int logging_; //logging level
 std::ofstream logfile_; //log file
 std::string comp_backend_; //current computational backend
 int num_processes_; //total number of parallel processes in the dedicated MPI communicator
 int process_rank_; //rank of the current parallel process in the dedicated MPI communicator
 int global_process_rank_; //rank of the current parallel process in MPI_COMM_WORLD
+4 −1
Original line number Diff line number Diff line
@@ -3807,6 +3807,8 @@ TEST(NumServerTester, CuTensorNet) {
 success = exatn::initTensorRnd("C"); assert(success);
 success = exatn::initTensor("D",0.0); assert(success);

 exatn::switchComputationalBackend("default");

 //Contract tensor network:
 int num_repeats = NUM_REPEATS;
 while(--num_repeats >= 0){
@@ -3814,8 +3816,9 @@ TEST(NumServerTester, CuTensorNet) {
  std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): ";
  auto flops = exatn::getTotalFlopCount();
  auto time_start = exatn::Timer::timeInSecHR();
  success = exatn::evaluateTensorNetworkSync("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)");
  success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)");
  assert(success);
  success = exatn::sync("D"); assert(success);
  auto duration = exatn::Timer::timeInSecHR(time_start);
  flops = exatn::getTotalFlopCount() - flops;
  std::cout << "Performance = " << (flops / (1e9 * duration)) << " Gflop/s" << std::endl;
Loading