Commit ed6dc07f authored by Dmitry I. Lyakh

Finished CuQuantum executor for one GPU per process, needs testing ...

parent 6516d89e
+66 −20
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -91,6 +91,9 @@ struct TensorNetworkReq {
  cudaEventDestroy(data_finish);
  cudaEventDestroy(data_start);
  cudaStreamDestroy(stream);
+  cutensornetDestroyContractionPlan(comp_plan);
+  cutensornetDestroyContractionOptimizerConfig(opt_config);
+  cutensornetDestroyContractionOptimizerInfo(opt_info);
  cutensornetDestroyNetworkDescriptor(net_descriptor);
  if(modes_out != nullptr) delete [] modes_out;
  if(strides_out != nullptr) delete [] strides_out;
@@ -105,8 +108,8 @@ struct TensorNetworkReq {
};


-CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
- tensor_data_access_func_(std::move(tensor_data_access_func))
+CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
+ tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
{
 static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");

@@ -118,6 +121,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
 for(int i = 0; i < num_gpus; ++i){
  if(talshDeviceState(i,DEV_NVIDIA_GPU) >= DEV_ON){
   gpu_attr_.emplace_back(std::make_pair(i,DeviceAttr{}));
+   gpu_attr_.back().second.pipe_level = 0;
   gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
   assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
   gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
@@ -257,6 +261,19 @@ cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_typ
}


+void CuQuantumExecutor::acquireWorkspace(unsigned int dev,
+                                         void ** workspace_ptr,
+                                         uint64_t * workspace_size)
+{
+ assert(dev < gpu_attr_.size());
+ auto & dev_attr = gpu_attr_[dev].second;
+ *workspace_size = dev_attr.workspace_size / pipe_depth_;
+ *workspace_ptr = (void*)((char*)(dev_attr.workspace_ptr) + ((*workspace_size) * dev_attr.pipe_level));
+ dev_attr.pipe_level = (dev_attr.pipe_level + 1) % pipe_depth_; //advance the round-robin pipeline level
+ return;
+}
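
The function above implements a round-robin partitioning of the device scratch buffer: the buffer is split into pipe_depth_ equal windows and successive acquisitions cycle through them, so up to pipe_depth_ in-flight contractions receive disjoint workspace. A self-contained sketch of the same arithmetic (stand-in types and hypothetical names, not the ExaTN API):

#include <cassert>
#include <cstdint>

//Hypothetical stand-in for the per-GPU attributes used above:
struct DeviceAttr {
 void * workspace_ptr = nullptr; //base of the device scratch buffer
 uint64_t workspace_size = 0;    //total scratch bytes
 unsigned int pipe_level = 0;    //next pipeline slot to hand out
};

//Slot i receives bytes [i*chunk, (i+1)*chunk), cycling modulo pipe_depth:
void acquire_workspace(DeviceAttr & dev, unsigned int pipe_depth,
                       void ** ws_ptr, uint64_t * ws_size)
{
 assert(pipe_depth > 0);
 const uint64_t chunk = dev.workspace_size / pipe_depth;
 *ws_size = chunk;
 *ws_ptr = static_cast<char*>(dev.workspace_ptr) + chunk * dev.pipe_level;
 dev.pipe_level = (dev.pipe_level + 1) % pipe_depth;
}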


void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
{
 const auto & net = *(tn_req->network);
@@ -266,6 +283,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
 tn_req->strides_in = new int64_t*[num_input_tensors];
 tn_req->modes_in = new int32_t*[num_input_tensors];
 tn_req->alignments_in = new uint32_t[num_input_tensors];
+ tn_req->data_in = new void*[num_input_tensors];

 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;
@@ -370,7 +388,21 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
   tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
+   auto & net = *(tn_req->network);
+   int32_t tens_num = 0;
+   for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
+    const auto tens_id = iter->first;
+    const auto & tens = iter->second;
+    const auto tens_hash = tens.getTensor()->getTensorHash();
+    auto descr = tn_req->tensor_descriptors.find(tens_hash);
+    void * dev_ptr = descr->second.dst_ptr.back();
+    if(tens_id == 0){
+     tn_req->data_out = dev_ptr;
+    }else{
+     tn_req->data_in[tens_num++] = dev_ptr;
+    }
+   }
  }else{ //not enough memory currently
   //Restore previous memory front:
   mem_pool_[gpu].restorePreviousFront(prev_front);
   return;
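
The loop added above routes the freshly staged device pointers into the arrays cuTensorNet consumes: by the convention in this code, tensor id 0 is the output tensor (data_out) and all remaining tensors are inputs (data_in), in network traversal order. A condensed sketch of that routing (simplified container types, hypothetical names); note the input order must agree with the mode arrays passed when the network descriptor was created:

#include <map>
#include <vector>

//tensor_id -> staged device pointer; an ordered map keeps iteration
//deterministic, matching the descriptor's input order:
void route_pointers(const std::map<unsigned int, void*> & staged,
                    void ** data_out, std::vector<void*> & data_in)
{
 for(const auto & kv : staged){
  if(kv.first == 0){
   *data_out = kv.second;        //id 0 = output tensor of the network
  }else{
   data_in.push_back(kv.second); //all other ids are input tensors
  }
 }
}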
@@ -385,16 +417,22 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
{
 //Configure tensor network contraction on one or all GPUs:
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerConfig(gpu_attr_[gpu].second.cutn_handle,&(tn_req->opt_config)));
  HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerInfo(gpu_attr_[gpu].second.cutn_handle,tn_req->net_descriptor,&(tn_req->opt_info)));
-  tn_req->worksize = gpu_attr_[gpu].second.workspace_size / PIPELINE_DEPTH;
-  tn_req->workspace = gpu_attr_[gpu].second.workspace_ptr; //`Need moving window (pipelining)
+  acquireWorkspace(gpu,&(tn_req->workspace),&(tn_req->worksize));
  HANDLE_CTN_ERROR(cutensornetContractionOptimize(gpu_attr_[gpu].second.cutn_handle,
                                                  tn_req->net_descriptor,tn_req->opt_config,
                                                  tn_req->worksize,tn_req->opt_info));
  HANDLE_CTN_ERROR(cutensornetCreateContractionPlan(gpu_attr_[gpu].second.cutn_handle,
                                                    tn_req->net_descriptor,tn_req->opt_info,
                                                    tn_req->worksize,&(tn_req->comp_plan)));
+  double flops = 0.0;
+  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
+                                                                   tn_req->opt_info,
+                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
+                                                                   &flops,sizeof(flops)));
 }
 tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
 return;
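
The FLOP-count read-out uses cuTensorNet's generic attribute interface on the optimizer info object; the same pattern with a different enum retrieves the slice count that contractTensorNetwork queries below. Both calls appear verbatim in this commit; cutn_handle and opt_info stand for the handle and optimizer info used above:

double flops = 0.0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(cutn_handle,opt_info,
                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                 &flops,sizeof(flops)));           //estimated FP operation count of the chosen path
int64_t num_slices = 0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(cutn_handle,opt_info,
                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                 &num_slices,sizeof(num_slices))); //number of independent slices to contract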
@@ -406,13 +444,13 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
 //Execute the contraction plan on one or all GPUs:
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
+  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  int64_t num_slices = 0;
  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
                                                                   tn_req->opt_info,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                   &num_slices,sizeof(num_slices)));
  assert(num_slices > 0);
-  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
  for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
   HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
@@ -431,13 +469,21 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
 //Test work completion on one or all GPUs:
+ bool all_completed = true;
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
  if(cuda_error == cudaSuccess){
-   //`Move MemPool back forward using tn_req->memory_window_ptr
-   tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
+   if(tn_req->memory_window_ptr[gpu] != nullptr){
+    mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
+    tn_req->memory_window_ptr[gpu] = nullptr;
+   }
+  }else{
+   all_completed = false;
+  }
 }
+ if(all_completed) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
 return;
}
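
cudaEventQuery is the non-blocking counterpart of cudaEventSynchronize: it returns cudaSuccess once all work recorded before the event has finished and cudaErrorNotReady while it is still in flight, which lets the executor thread keep servicing other networks. A minimal illustration of the pattern (hypothetical function name); note the loop above folds any non-success status into "not completed", whereas distinguishing cudaErrorNotReady from genuine failures, as sketched here, surfaces errors earlier:

#include <cuda_runtime.h>

bool work_finished(cudaEvent_t done_event)
{
 const cudaError_t err = cudaEventQuery(done_event);
 if(err == cudaSuccess) return true;        //all work preceding the event is done
 if(err == cudaErrorNotReady) return false; //still executing: poll again later
 return false;                              //any other status is a real error and should be reported
}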

+11 −4
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -40,7 +40,8 @@ class CuQuantumExecutor {

public:

- CuQuantumExecutor(TensorImplFunc tensor_data_access_func);
+ CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
+                   unsigned int pipeline_depth);

 CuQuantumExecutor(const CuQuantumExecutor &) = delete;
 CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;
@@ -65,10 +66,13 @@ public:

protected:

- static constexpr float WORKSPACE_FRACTION = 0.2;
- static constexpr unsigned int PIPELINE_DEPTH = 1;
+ static constexpr float WORKSPACE_FRACTION = 0.6;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+ void acquireWorkspace(unsigned int dev,
+                       void ** workspace_ptr,
+                       uint64_t * workspace_size);

 void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
 void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
 void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
@@ -80,6 +84,7 @@ protected:
  std::size_t buffer_size = 0;
  void * workspace_ptr = nullptr;
  std::size_t workspace_size = 0;
+  unsigned int pipe_level = 0;
  void * cutn_handle; //cutensornetHandle_t = void*
 };

@@ -91,6 +96,8 @@ protected:
 std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
+ /** Pipeline depth **/
+ const unsigned int pipe_depth_;
};

} //namespace runtime
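
Taken together, the header changes define the workspace budget: the scratch region is presumably WORKSPACE_FRACTION of the TAL-SH device buffer (the fraction was raised from 0.2 to 0.6), and acquireWorkspace hands out workspace_size / pipe_depth_ per pipeline slot. Illustrative arithmetic only, assuming a hypothetical 8 GiB device buffer and the pipeline depth of 2 set in LazyGraphExecutor below:

 workspace_size = 0.6 * 8 GiB = 4.8 GiB  //per-GPU scratch region
 per-slot size  = 4.8 GiB / 2 = 2.4 GiB  //workspace each in-flight contraction receives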
+10 −4
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/30
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

Rationale:
 - ExaTN graph executor may accept whole tensor networks for execution
@@ -144,7 +144,7 @@ public:
  return current_network_;
 }

- /** Returns the current iterator to the beginning of the queue. **/
+ /** Resets the current iterator to the beginning of the queue. **/
 void reset() {
  lock();
  current_network_ = networks_.begin();
@@ -173,6 +173,12 @@ public:
  return not_over;
 }

+ /** Returns the distance of the current iterator
+     from the beginning of the queue. **/
+ auto getCurrentPos() {
+  return std::distance(networks_.begin(),current_network_);
+ }

 /** Locks. **/
 inline void lock(){queue_lock_.lock();}
 inline void unlock(){queue_lock_.unlock();}
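
getCurrentPos lets a client bound how deep into the queue it looks; std::distance is linear if networks_ is a node-based container such as std::list, which stays cheap here because the scan is capped by the small pipeline depth. A hedged usage sketch built only from the queue interface shown above (depth is a hypothetical constant):

tensor_network_queue.reset();
bool not_over = !tensor_network_queue.isOver();
while(not_over){
 if(tensor_network_queue.getCurrentPos() >= depth) break; //pipeline window is full
 const auto current = tensor_network_queue.getCurrent();  //inspect or dispatch *current here
 not_over = tensor_network_queue.next();
}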
+30 −20
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2022/01/03
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -35,7 +35,8 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
      [this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
        void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
        return data_ptr;
-      }
+      },
+      CUQUANTUM_PIPELINE_DEPTH
    );
  }
#endif
@@ -277,7 +278,7 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {
void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
#ifdef CUQUANTUM
  std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
-            << tensor_network_queue.getSize() << " elements detected" << std::endl;
+            << tensor_network_queue.getSize() << " networks detected" << std::endl;
  assert(node_executor_);
  //Synchronize the node executor:
  bool synced = node_executor_->sync(); assert(synced);
@@ -287,11 +288,16 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
    tensor_network_queue.reset();
    bool not_over = !tensor_network_queue.isOver();
    while(not_over){
-      int error_code = 0;
+      const auto current_pos = tensor_network_queue.getCurrentPos();
+      if(current_pos < CUQUANTUM_PIPELINE_DEPTH){
        const auto current = tensor_network_queue.getCurrent();
        const auto exec_handle = current->second;
-      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+        int error_code = 0;
+        auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
+        if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
+          exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+          assert(error_code == 0);
+        }
        if(exec_stat == TensorNetworkQueue::ExecStat::None){
          exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
          if(exec_stat != TensorNetworkQueue::ExecStat::None){
@@ -301,6 +307,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
          not_over = tensor_network_queue.next();
        }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
          auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+          assert(current_pos == 0);
          tensor_network_queue.remove();
          std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
          not_over = !tensor_network_queue.isOver();
@@ -308,6 +315,9 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
          auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
          not_over = tensor_network_queue.next();
        }
+      }else{
+        not_over = false;
+      }
    }
  }
  cuquantum_executor_->sync();
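
The reworked loop is a depth-limited software pipeline: only the first CUQUANTUM_PIPELINE_DEPTH queue entries are serviced, the head entry is always progressed via sync, deeper entries are only submitted or status-checked, and completed networks are removed strictly from the head (FIFO). A toy restatement of that control flow (stand-in types, hypothetical names; progress() models cuquantum_executor_->sync(), submit() models execute()):

#include <cassert>
#include <cstddef>
#include <deque>

enum class Stat { None, Idle, Submitted, Completed };

template<typename Progress, typename Submit>
void pump(std::deque<Stat> & queue, unsigned int depth,
          Progress progress, Submit submit)
{
 for(std::size_t pos = 0; pos < queue.size() && pos < depth; ){
  Stat st = queue[pos];
  if(st == Stat::Idle || pos == 0) st = progress(pos); //progress the head or idle entries
  if(st == Stat::None) st = submit(pos);               //first-time submission attempt
  if(st == Stat::Completed){
   assert(pos == 0);   //mirrors assert(current_pos == 0): completion only at the head
   queue.pop_front();  //frees a pipeline slot for deeper entries
  }else{
   queue[pos] = st;    //record the updated status
   ++pos;              //move on to the next network in the window
  }
 }
}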
+6 −3
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/22
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh, Alex McCaskey
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

Rationale:

@@ -26,6 +26,9 @@ public:

  static constexpr const unsigned int DEFAULT_PIPELINE_DEPTH = 16;
  static constexpr const unsigned int DEFAULT_PREFETCH_DEPTH = 4;
+#ifdef CUQUANTUM
+  static constexpr const unsigned int CUQUANTUM_PIPELINE_DEPTH = 2;
+#endif

  LazyGraphExecutor(): pipeline_depth_(DEFAULT_PIPELINE_DEPTH),
                       prefetch_depth_(DEFAULT_PREFETCH_DEPTH)