Commit e276dd5b authored by Dmitry I. Lyakh's avatar Dmitry I. Lyakh
Browse files

The cuQuantum executor test passes correctness, but fails upon repeats ...

parent f1c8ee3d
Loading
Loading
Loading
Loading
+26 −4
Original line number Diff line number Diff line
@@ -3789,9 +3789,9 @@ TEST(NumServerTester, CuTensorNet) {

 const auto TENS_ELEM_TYPE = TensorElementType::REAL32;

 const int NUM_REPEATS = 3;
 const int NUM_REPEATS = 10;

 //exatn::resetLoggingLevel(2,2); //debug
 exatn::resetLoggingLevel(2,2); //debug

 bool success = true;

@@ -3808,7 +3808,6 @@ TEST(NumServerTester, CuTensorNet) {
 success = exatn::initTensor("D",0.0); assert(success);

 success = exatn::sync(); assert(success);
 exatn::switchComputationalBackend("default"); //{default|cuquantum}

 //Contract tensor network:
 int num_repeats = NUM_REPEATS;
@@ -3822,9 +3821,32 @@ TEST(NumServerTester, CuTensorNet) {
  flops = exatn::getTotalFlopCount() - flops;
  std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9
            << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n";
  double norm = 0.0;
  success = exatn::computeNorm1Sync("D",norm); assert(success);
  std::cout << "1-norm of tensor D = " << norm << std::endl;
 }

 //std::this_thread::sleep_for(std::chrono::microseconds(1000000));
#ifdef CUQUANTUM
 success = exatn::sync(); assert(success);
 exatn::switchComputationalBackend("cuquantum"); //{default|cuquantum}

 //Contract tensor network:
 num_repeats = NUM_REPEATS;
 while(--num_repeats >= 0){
  std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): ";
  auto flops = exatn::getTotalFlopCount();
  auto time_start = exatn::Timer::timeInSecHR();
  success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success);
  success = exatn::sync("D",true); assert(success);
  auto duration = exatn::Timer::timeInSecHR(time_start);
  flops = exatn::getTotalFlopCount() - flops;
  std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9
            << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n";
  double norm = 0.0;
  success = exatn::computeNorm1Sync("D",norm); assert(success);
  std::cout << "1-norm of tensor D = " << norm << std::endl;
 }
#endif

 //Destroy tensors:
 success = exatn::sync(); assert(success);
+37 −16
Original line number Diff line number Diff line
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/08
REVISION: 2022/01/10

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -21,6 +21,7 @@ Rationale:
#include <iostream>

#include "talshxx.hpp"
#include "timers.hpp"

#include "cuquantum_executor.hpp"

@@ -55,6 +56,7 @@ struct TensorNetworkReq {
 TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status
 int num_procs = 0; //total number of executing processes
 int proc_id = -1; //id of the current executing process
 int64_t num_slices = 0;
 std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
 std::unordered_map<numerics::TensorHashType, TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
 std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key = original tensor id)
@@ -81,17 +83,21 @@ struct TensorNetworkReq {
 cudaDataType_t data_type;
 cutensornetComputeType_t compute_type;
 cudaStream_t stream;
 cudaEvent_t data_start;
 cudaEvent_t data_finish;
 cudaEvent_t data_in_start;
 cudaEvent_t data_in_finish;
 cudaEvent_t compute_start;
 cudaEvent_t compute_finish;
 cudaEvent_t data_out_finish;
 double prepare_start;
 double prepare_finish;

 ~TensorNetworkReq() {
  cudaStreamSynchronize(stream);
  cudaEventDestroy(data_out_finish);
  cudaEventDestroy(compute_finish);
  cudaEventDestroy(compute_start);
  cudaEventDestroy(data_finish);
  cudaEventDestroy(data_start);
  cudaEventDestroy(data_in_finish);
  cudaEventDestroy(data_in_start);
  cudaStreamDestroy(stream);
  cutensornetDestroyContractionPlan(comp_plan);
  cutensornetDestroyContractionOptimizerConfig(opt_config);
@@ -205,7 +211,9 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics


TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle,
                                                     int * error_code)
                                                     int * error_code,
                                                     int64_t * num_slices,
                                                     ExecutionTimings * timings)
{
 *error_code = 0;
 TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None;
@@ -223,6 +231,15 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle ex
    contractTensorNetwork(tn_req); //Planning --> Executing
  }
  exec_stat = tn_req->exec_status;
  if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
   if(num_slices != nullptr) *num_slices = tn_req->num_slices;
   if(timings != nullptr){
    timings->prepare = (tn_req->prepare_finish - tn_req->prepare_start) * 1000.0;
    HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_in),tn_req->data_in_start,tn_req->data_in_finish));
    HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_out),tn_req->compute_finish,tn_req->data_out_finish));
    HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->compute),tn_req->compute_start,tn_req->compute_finish));
   }
  }
  tn_req.reset();
  if(exec_stat == TensorNetworkQueue::ExecStat::Completed) active_networks_.erase(iter);
 }
@@ -368,10 +385,11 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
                   tn_req->num_modes_out,tn_req->extents_out,tn_req->strides_out,tn_req->modes_out,tn_req->alignment_out,
                   tn_req->data_type,tn_req->compute_type,&(tn_req->net_descriptor)));
  HANDLE_CUDA_ERROR(cudaStreamCreate(&(tn_req->stream)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_start)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_finish)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_start)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_finish)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_start)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_finish)));
  HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_out_finish)));
 }
 return;
}
@@ -383,7 +401,6 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_start,tn_req->stream));
  void * prev_front = mem_pool_[gpu].getFront();
  bool success = true;
  //Acquire device memory:
@@ -394,11 +411,12 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
  }
  if(success){
   //Initiate data transfers:
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_start,tn_req->stream));
   for(auto & descr: tn_req->tensor_descriptors){
    HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.second.dst_ptr.back(),descr.second.src_ptr,
                                      descr.second.size,cudaMemcpyDefault,tn_req->stream));
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_finish,tn_req->stream));
   tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
   auto & net = *(tn_req->network);
   int32_t tens_num = 0;
@@ -428,6 +446,7 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
{
 //Configure tensor network contraction on one or all GPUs:
 tn_req->prepare_start = Timer::timeInSecHR();
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
@@ -447,6 +466,7 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
                                                                   &flops,sizeof(flops)));
  flops_ += flops;
 }
 tn_req->prepare_finish = Timer::timeInSecHR();
 tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
 return;
}
@@ -458,27 +478,28 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  int64_t num_slices = 0;
  tn_req->num_slices = 0;
  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
                                                                   tn_req->opt_info,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                   &num_slices,sizeof(num_slices)));
  assert(num_slices > 0);
                                                                   &(tn_req->num_slices),sizeof(tn_req->num_slices)));
  assert(tn_req->num_slices > 0);
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
  for(int64_t slice_id = tn_req->proc_id; slice_id < num_slices; slice_id += tn_req->num_procs){
  for(int64_t slice_id = tn_req->proc_id; slice_id < tn_req->num_slices; slice_id += tn_req->num_procs){
   HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
                                           tn_req->comp_plan,
                                           tn_req->data_in,tn_req->data_out,
                                           tn_req->workspace,tn_req->worksize,
                                           slice_id,tn_req->stream));
  }
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream));
  const auto output_hash = tn_req->network->getTensor(0)->getTensorHash();
  auto iter = tn_req->tensor_descriptors.find(output_hash);
  assert(iter != tn_req->tensor_descriptors.cend());
  const auto & descr = iter->second;
  HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.src_ptr,descr.dst_ptr[gpu],
                                    descr.size,cudaMemcpyDefault,tn_req->stream));
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream));
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_out_finish,tn_req->stream));
 }
 tn_req->exec_status = TensorNetworkQueue::ExecStat::Executing;
 return;
@@ -492,7 +513,7 @@ void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
  cudaError_t cuda_error = cudaEventQuery(tn_req->data_out_finish);
  if(cuda_error == cudaSuccess){
   if(tn_req->memory_window_ptr[gpu] != nullptr){
    mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
+12 −3
Original line number Diff line number Diff line
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/08
REVISION: 2022/01/10

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -35,6 +35,13 @@ using TensorImplTalshFunc = std::function<std::shared_ptr<talsh::Tensor>(const n

struct TensorNetworkReq;

struct ExecutionTimings {
 float prepare = 0.0;
 float data_in = 0.0;
 float data_out = 0.0;
 float compute = 0.0;
};


class CuQuantumExecutor {

@@ -63,12 +70,14 @@ public:
     If wait = TRUE, waits until completion, otherwise just tests the progress.
     Returns the current status of the tensor network execution. **/
 TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle, //in: tensor network execution handle
                                   int * error_code); //out: error code (0:success)
                                   int * error_code, //out: error code (0:success)
                                   int64_t * num_slices = nullptr, //out: number of tensor network slices
                                   ExecutionTimings * timings = nullptr); //out: execution timings (ms)

 /** Synchronizes execution of all submitted tensor networks to completion. **/
 void sync();

 /** Returns total executed flop count. **/
 /** Returns the total executed flop count. **/
 double getTotalFlopCount() const {return flops_;}

protected:
+8 −3
Original line number Diff line number Diff line
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2022/01/08
REVISION: 2022/01/10

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -298,9 +298,11 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
        const auto current = tensor_network_queue.getCurrent();
        const auto exec_handle = current->second;
        int error_code = 0;
        int64_t num_slices = 0;
        ExecutionTimings timings;
        auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
        if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
          exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
          exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,&num_slices,&timings); //this call will progress tensor network execution
          assert(error_code == 0);
        }
        if(exec_stat == TensorNetworkQueue::ExecStat::None){
@@ -329,7 +331,10 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
        }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
          if(logging_.load() != 0){
            logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
                     << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle << std::endl;
                     << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle
                     << ": NumSlices = " << num_slices << "; Time (ms): In{" << timings.data_in
                     << "}, Prep{" << timings.prepare << "}, Comp{" << timings.compute
                     << "}, Out{" << timings.data_out << "}" << std::endl;
#ifdef DEBUG
            logfile_.flush();
#endif