Commit 3ed2fe51 authored by Dmitry I. Lyakh

Fully designed the execution workflow for CuQuantumExecutor

parent 7c970df3
cuquantum_executor.cu  +79 −19
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -22,8 +22,6 @@ Rationale:

#include "talshxx.hpp"

#include "linear_memory.hpp"

#include "cuquantum_executor.hpp"


@@ -44,7 +42,6 @@ namespace exatn {
namespace runtime {

struct TensorDescriptor {
- std::vector<int32_t> modes;   //indices associated with tensor dimensions
 std::vector<int64_t> extents; //tensor dimension extents
 std::vector<int64_t> strides; //tensor dimension strides (optional)
 void * body_ptr = nullptr;    //pointer to the tensor body image
@@ -53,17 +50,18 @@ struct TensorDescriptor {
};

struct TensorNetworkReq {
- std::shared_ptr<numerics::TensorNetwork> network;
- std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors;
- std::unordered_map<int32_t,int64_t> index_extents;
+ TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status
+ std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
+ std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
+ std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key is the original tensor id)
+ std::unordered_map<int32_t,int64_t> index_extents; //extent of each registered tensor mode
+ std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors
 cutensornetNetworkDescriptor_t net_descriptor;
 cutensornetContractionOptimizerConfig_t opt_config;
 cutensornetContractionOptimizerInfo_t opt_info;
 cutensornetContractionPlan_t comp_plan;
 cudaStream_t stream;
 cutensornetComputeType_t compute_type;
- void * memory_window_ptr = nullptr;
- TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::Idle;
};
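
Each TensorNetworkReq now owns per-request cuTensorNet and CUDA resources (network descriptor, optimizer config and info, contraction plan, CUDA stream) that must be released once the network completes. A minimal cleanup sketch, assuming a dedicated stream per request; the helper itself is hypothetical and not part of this commit:

//Hypothetical helper (not in this commit): releases the per-request resources
void destroyTensorNetworkReq(TensorNetworkReq & tn_req)
{
 HANDLE_CTN_ERROR(cutensornetDestroyContractionPlan(tn_req.comp_plan));
 HANDLE_CTN_ERROR(cutensornetDestroyContractionOptimizerInfo(tn_req.opt_info));
 HANDLE_CTN_ERROR(cutensornetDestroyContractionOptimizerConfig(tn_req.opt_config));
 HANDLE_CTN_ERROR(cutensornetDestroyNetworkDescriptor(tn_req.net_descriptor));
 HANDLE_CUDA_ERROR(cudaStreamDestroy(tn_req.stream));
}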


@@ -83,11 +81,14 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
   gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
   assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
   gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
-   std::size_t wrk_size = static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION;
+   std::size_t wrk_size = (std::size_t)(static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION);
   wrk_size -= wrk_size % MEM_ALIGNMENT;
   gpu_attr_.back().second.workspace_size = wrk_size;
   gpu_attr_.back().second.buffer_size -= wrk_size;
+   gpu_attr_.back().second.buffer_size -= gpu_attr_.back().second.buffer_size % MEM_ALIGNMENT;
   gpu_attr_.back().second.buffer_ptr = (void*)(((char*)(gpu_attr_.back().second.workspace_ptr)) + wrk_size);
+   mem_pool_.emplace_back(LinearMemoryPool(gpu_attr_.back().second.buffer_ptr,
+                                           gpu_attr_.back().second.buffer_size,MEM_ALIGNMENT));
  }
 }
 std::cout << "#DEBUG(exatn::runtime::CuQuantumExecutor): Number of available GPUs = " << gpu_attr_.size() << std::endl;
@@ -111,7 +112,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):

CuQuantumExecutor::~CuQuantumExecutor()
{
- bool success = sync(); assert(success);
+ sync();
 for(const auto & gpu: gpu_attr_){
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu.first));
  HANDLE_CTN_ERROR(cutensornetDestroy((cutensornetHandle_t)(gpu.second.cutn_handle)));
@@ -130,8 +131,16 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics
 if(res.second){
  auto tn_req = res.first->second;
  tn_req->network = network;
-  tn_req->exec_status = TensorNetworkQueue::ExecStat::Idle;
+  parseTensorNetwork(tn_req); //still Idle
+  loadTensors(tn_req); //Idle --> Loading
+  if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading){
+   planExecution(tn_req); //Loading --> Planning (while loading data)
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning){
+    contractTensorNetwork(tn_req); //Planning --> Executing
+   }
+  }
+  exec_stat = tn_req->exec_status;
+  //`Finish
 }else{
  std::cout << "#WARNING(exatn::runtime::CuQuantumExecutor): execute: Repeated tensor network submission detected!\n";
 }
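
execute() is non-blocking: it registers the network, eagerly advances it through the new pipeline (parse, load, plan, contract) as far as the asynchronous calls allow, and returns the status reached; sync() later resumes the progression. A sketch of the resulting driver protocol, mirroring LazyGraphExecutor::execute() further below (the helper is illustrative, not part of this commit):

//Illustrative driver (hypothetical): submit once, then poll to completion
TensorNetworkQueue::ExecStat driveToCompletion(CuQuantumExecutor & executor,
                                               std::shared_ptr<numerics::TensorNetwork> network,
                                               const TensorOpExecHandle exec_handle)
{
 int error_code = 0;
 auto exec_stat = executor.sync(exec_handle,&error_code); //also progresses execution
 assert(error_code == 0);
 if(exec_stat == TensorNetworkQueue::ExecStat::None)  //unknown handle: not yet submitted
  exec_stat = executor.execute(network,exec_handle);  //Idle --> Loading --> Planning --> Executing
 while(exec_stat != TensorNetworkQueue::ExecStat::Completed){ //poll until the GPU work finishes
  exec_stat = executor.sync(exec_handle,&error_code);
  assert(error_code == 0);
 }
 return exec_stat;
}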
@@ -140,26 +149,77 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics


TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle,
-                                                     int * error_code,
-                                                     bool wait)
+                                                     int * error_code)
{
 *error_code = 0;
 TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None;
 auto iter = active_networks_.find(exec_handle);
 if(iter != active_networks_.end()){
  auto tn_req = iter->second;
+  if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Executing){
+   testCompletion(tn_req); //Executing --> Completed
+  }else{
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Idle)
+    loadTensors(tn_req); //Idle --> Loading
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading)
+    planExecution(tn_req); //Loading --> Planning (while loading data)
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning)
+    contractTensorNetwork(tn_req); //Planning --> Executing
+  }
  exec_stat = tn_req->exec_status;
  //`Finish
  tn_req.reset();
  if(exec_stat == TensorNetworkQueue::ExecStat::Completed)
   active_networks_.erase(iter);
 }
 return exec_stat;
}


-bool CuQuantumExecutor::sync()
+void CuQuantumExecutor::sync()
{
+ while(!active_networks_.empty()){
+  for(auto iter = active_networks_.begin(); iter != active_networks_.end(); ++iter){
+   int error_code = 0;
+   const auto exec_stat = sync(iter->first,&error_code); assert(error_code == 0);
+   if(exec_stat == TensorNetworkQueue::ExecStat::Completed) break;
+  }
+ }
 return;
}
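
sync(exec_handle,...) erases a completed network from active_networks_, which invalidates the loop iterator; hence the break, which leaves the for loop so the while loop can rescan from active_networks_.begin() with fresh iterators. A self-contained demonstration of this erase-and-break pattern on std::unordered_map:

#include <unordered_map>
#include <iostream>

int main(){
 std::unordered_map<int,int> active = {{1,3},{2,1},{3,2}}; //handle --> remaining polls
 while(!active.empty()){
  for(auto iter = active.begin(); iter != active.end(); ++iter){
   if(--(iter->second) == 0){ //this entry "completed"
    std::cout << "completed " << iter->first << std::endl;
    active.erase(iter); //invalidates iter ...
    break;              //... so leave the for loop and rescan
   }
  }
 }
 return 0;
}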


+void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
{
- bool synced = true;
- //`Finish
- return synced;
+
+ return;
}


+void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
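
All five pipeline stages are stubs in this revision. As one plausible filling of the simplest stub, testCompletion could poll the request's CUDA stream without blocking; the sketch below assumes all work of the request is enqueued in tn_req->stream. The planning and contraction stubs would analogously wrap the cuTensorNet calls cutensornetCreateContractionOptimizerConfig/Info, cutensornetContractionOptimize, cutensornetCreateContractionPlan and cutensornetContraction.

//Hypothetical sketch (not in this commit): non-blocking completion test
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
 const cudaError_t cuda_error = cudaStreamQuery(tn_req->stream);
 if(cuda_error == cudaSuccess){ //all work enqueued in the stream has finished
  tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
 } //cudaErrorNotReady: still Executing; any other code is an error
 return;
}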

} //namespace runtime
cuquantum_executor.hpp  +12 −4
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -20,6 +20,7 @@ Rationale:
#include <vector>
#include <functional>

#include "linear_memory.hpp"
#include "tensor_network_queue.hpp"

namespace talsh{
@@ -57,17 +58,22 @@ public:
-    If wait = TRUE, waits until completion, otherwise just tests the progress.
     Returns the current status of the tensor network execution. **/
 TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle,
-                                   int * error_code,
-                                   bool wait = true);
+                                   int * error_code);

 /** Synchronizes execution of all submitted tensor networks to completion. **/
- bool sync();
+ void sync();

protected:

 static constexpr float WORKSPACE_FRACTION = 0.2;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+ void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
+ void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
+ void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
+ void contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
+ void testCompletion(std::shared_ptr<TensorNetworkReq> tn_req);

 struct DeviceAttr{
  void * buffer_ptr = nullptr;
  std::size_t buffer_size = 0;
@@ -80,6 +86,8 @@ protected:
 std::unordered_map<TensorOpExecHandle,std::shared_ptr<TensorNetworkReq>> active_networks_;
 /** Attributes of all GPUs available to the current process **/
 std::vector<std::pair<int,DeviceAttr>> gpu_attr_; //{gpu_id, gpu_attributes}
+ /** Moving-window linear memory pool (in GPU RAM) **/
+ std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
};
linear_memory.hpp  +4 −2
/** ExaTN: Tensor Runtime: Tensor network executor: Linear memory allocator
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)

Rationale:

- Linear memory moving window:
+ Linear memory moving window (----->):

 (a) front >= back:
 ____________________________________
@@ -29,6 +29,8 @@ Rationale:
#ifndef EXATN_RUNTIME_LINEAR_MEMORY_HPP_
#define EXATN_RUNTIME_LINEAR_MEMORY_HPP_

#include "errors.hpp"

class LinearMemoryPool {

public:
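
The pool dispenses memory from a linear buffer through a window whose front advances on acquisition and whose back advances on release, wrapping around at the buffer end (the -----> direction above). A self-contained sketch consistent with that diagram; the class layout and method names here are assumptions for illustration, not this commit's API:

#include <cstddef>
#include <deque>
#include <utility>

//Illustrative moving-window pool (names are assumptions, not from this commit):
class LinearMemoryPoolSketch {
public:
 LinearMemoryPoolSketch(void * base, std::size_t size, std::size_t alignment):
  base_(static_cast<char*>(base)), size_(size), align_(alignment) {}

 //Acquires a block at the front of the window; returns nullptr if it does not fit:
 void * acquireMemory(std::size_t size){
  if(size == 0) return nullptr;
  size += (align_ - size % align_) % align_;   //round the block up to the alignment
  if(size > size_) return nullptr;
  std::size_t start;
  if(blocks_.empty()){
   start = 0;                                  //empty pool: restart at the base
  }else if(back_ < front_){                    //case (a): busy region is [back_,front_)
   if(front_ + size <= size_) start = front_;  //append at the front
   else if(size <= back_) start = 0;           //or wrap around to the base
   else return nullptr;
  }else{                                       //case (b): busy region wraps around
   if(front_ + size <= back_) start = front_; else return nullptr;
  }
  blocks_.emplace_back(start,size);
  front_ = start + size;
  back_ = blocks_.front().first;
  return base_ + start;
 }

 //Releases the oldest block (the back of the window moves forward):
 void releaseMemory(){
  if(blocks_.empty()) return;
  blocks_.pop_front();
  if(blocks_.empty()){ front_ = 0; back_ = 0; }
  else back_ = blocks_.front().first;
 }

private:
 char * base_; std::size_t size_, align_;
 std::size_t front_ = 0, back_ = 0;                      //window boundaries
 std::deque<std::pair<std::size_t,std::size_t>> blocks_; //FIFO of {offset,size}
};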
tensor_network_queue.hpp  +3 −2
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/27
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -36,7 +36,8 @@ public:
 enum class ExecStat {
  None,      //no execution status
  Idle,      //submitted but execution has not yet started
-  Preparing, //preparation for execution has started (loading data, planning)
+  Loading,   //started loading data
+  Planning,  //preparation for execution has started (planning)
  Executing, //actual execution (numerical computation) has started
  Completed  //execution completed
 };
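
Splitting the old Preparing state lets data loading overlap contraction-path planning, matching the CuQuantumExecutor pipeline (loadTensors --> planExecution --> contractTensorNetwork --> testCompletion). A small helper mapping each state to a readable name, with the pipeline step noted per state (illustrative, not part of this commit):

//Illustrative helper: readable names for the execution states
inline const char * execStatName(const TensorNetworkQueue::ExecStat stat){
 switch(stat){
  case TensorNetworkQueue::ExecStat::None:      return "None";      //no execution status
  case TensorNetworkQueue::ExecStat::Idle:      return "Idle";      //submitted, not yet started
  case TensorNetworkQueue::ExecStat::Loading:   return "Loading";   //loadTensors() started data transfers
  case TensorNetworkQueue::ExecStat::Planning:  return "Planning";  //planExecution() runs while data loads
  case TensorNetworkQueue::ExecStat::Executing: return "Executing"; //contractTensorNetwork() enqueued kernels
  case TensorNetworkQueue::ExecStat::Completed: return "Completed"; //testCompletion() observed completion
 }
 return "Unknown";
}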
graph_executor_lazy.cpp  +3 −3
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -290,7 +290,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
      int error_code = 0;
      const auto current = tensor_network_queue.getCurrent();
      const auto exec_handle = current->second;
-      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,false); //this call will progress tensor network execution
+      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
      assert(error_code == 0);
      if(exec_stat == TensorNetworkQueue::ExecStat::None){
        exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
@@ -310,7 +310,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
      }
    }
  }
-  synced = cuquantum_executor_->sync(); assert(synced);
+  cuquantum_executor_->sync();
#else
  assert(tensor_network_queue.isEmpty());
#endif