Loading src/exatn/tests/NumServerTester.cpp +26 −4 Original line number Diff line number Diff line Loading @@ -3789,9 +3789,9 @@ TEST(NumServerTester, CuTensorNet) { const auto TENS_ELEM_TYPE = TensorElementType::REAL32; const int NUM_REPEATS = 3; const int NUM_REPEATS = 10; //exatn::resetLoggingLevel(2,2); //debug exatn::resetLoggingLevel(2,2); //debug bool success = true; Loading @@ -3808,7 +3808,6 @@ TEST(NumServerTester, CuTensorNet) { success = exatn::initTensor("D",0.0); assert(success); success = exatn::sync(); assert(success); exatn::switchComputationalBackend("default"); //{default|cuquantum} //Contract tensor network: int num_repeats = NUM_REPEATS; Loading @@ -3822,9 +3821,32 @@ TEST(NumServerTester, CuTensorNet) { flops = exatn::getTotalFlopCount() - flops; std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9 << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; double norm = 0.0; success = exatn::computeNorm1Sync("D",norm); assert(success); std::cout << "1-norm of tensor D = " << norm << std::endl; } //std::this_thread::sleep_for(std::chrono::microseconds(1000000)); #ifdef CUQUANTUM success = exatn::sync(); assert(success); exatn::switchComputationalBackend("cuquantum"); //{default|cuquantum} //Contract tensor network: num_repeats = NUM_REPEATS; while(--num_repeats >= 0){ std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): "; auto flops = exatn::getTotalFlopCount(); auto time_start = exatn::Timer::timeInSecHR(); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D",true); assert(success); auto duration = exatn::Timer::timeInSecHR(time_start); flops = exatn::getTotalFlopCount() - flops; std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9 << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; double norm = 0.0; success = exatn::computeNorm1Sync("D",norm); assert(success); std::cout << "1-norm of tensor D = " << norm << std::endl; } #endif //Destroy tensors: success = exatn::sync(); assert(success); Loading src/runtime/executor/cuquantum/cuquantum_executor.cu +37 −16 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading @@ -21,6 +21,7 @@ Rationale: #include <iostream> #include "talshxx.hpp" #include "timers.hpp" #include "cuquantum_executor.hpp" Loading Loading @@ -55,6 +56,7 @@ struct TensorNetworkReq { TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status int num_procs = 0; //total number of executing processes int proc_id = -1; //id of the current executing process int64_t num_slices = 0; std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification std::unordered_map<numerics::TensorHashType, TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body) std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key = original tensor id) Loading @@ -81,17 +83,21 @@ struct TensorNetworkReq { cudaDataType_t data_type; cutensornetComputeType_t compute_type; cudaStream_t stream; cudaEvent_t data_start; cudaEvent_t data_finish; cudaEvent_t data_in_start; cudaEvent_t data_in_finish; cudaEvent_t compute_start; cudaEvent_t compute_finish; cudaEvent_t data_out_finish; double prepare_start; double prepare_finish; ~TensorNetworkReq() { cudaStreamSynchronize(stream); cudaEventDestroy(data_out_finish); cudaEventDestroy(compute_finish); cudaEventDestroy(compute_start); cudaEventDestroy(data_finish); cudaEventDestroy(data_start); cudaEventDestroy(data_in_finish); cudaEventDestroy(data_in_start); cudaStreamDestroy(stream); cutensornetDestroyContractionPlan(comp_plan); cutensornetDestroyContractionOptimizerConfig(opt_config); Loading Loading @@ -205,7 +211,9 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle, int * error_code) int * error_code, int64_t * num_slices, ExecutionTimings * timings) { *error_code = 0; TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None; Loading @@ -223,6 +231,15 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle ex contractTensorNetwork(tn_req); //Planning --> Executing } exec_stat = tn_req->exec_status; if(exec_stat == TensorNetworkQueue::ExecStat::Completed){ if(num_slices != nullptr) *num_slices = tn_req->num_slices; if(timings != nullptr){ timings->prepare = (tn_req->prepare_finish - tn_req->prepare_start) * 1000.0; HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_in),tn_req->data_in_start,tn_req->data_in_finish)); HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_out),tn_req->compute_finish,tn_req->data_out_finish)); HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->compute),tn_req->compute_start,tn_req->compute_finish)); } } tn_req.reset(); if(exec_stat == TensorNetworkQueue::ExecStat::Completed) active_networks_.erase(iter); } Loading Loading @@ -368,10 +385,11 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_ tn_req->num_modes_out,tn_req->extents_out,tn_req->strides_out,tn_req->modes_out,tn_req->alignment_out, tn_req->data_type,tn_req->compute_type,&(tn_req->net_descriptor))); HANDLE_CUDA_ERROR(cudaStreamCreate(&(tn_req->stream))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_out_finish))); } return; } Loading @@ -383,7 +401,6 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_start,tn_req->stream)); void * prev_front = mem_pool_[gpu].getFront(); bool success = true; //Acquire device memory: Loading @@ -394,11 +411,12 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) } if(success){ //Initiate data transfers: HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_start,tn_req->stream)); for(auto & descr: tn_req->tensor_descriptors){ HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.second.dst_ptr.back(),descr.second.src_ptr, descr.second.size,cudaMemcpyDefault,tn_req->stream)); } HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_finish,tn_req->stream)); tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront()); auto & net = *(tn_req->network); int32_t tens_num = 0; Loading Loading @@ -428,6 +446,7 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) { //Configure tensor network contraction on one or all GPUs: tn_req->prepare_start = Timer::timeInSecHR(); for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); Loading @@ -447,6 +466,7 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) &flops,sizeof(flops))); flops_ += flops; } tn_req->prepare_finish = Timer::timeInSecHR(); tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning; return; } Loading @@ -458,27 +478,28 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq> for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); int64_t num_slices = 0; tn_req->num_slices = 0; HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle, tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES, &num_slices,sizeof(num_slices))); assert(num_slices > 0); &(tn_req->num_slices),sizeof(tn_req->num_slices))); assert(tn_req->num_slices > 0); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream)); for(int64_t slice_id = tn_req->proc_id; slice_id < num_slices; slice_id += tn_req->num_procs){ for(int64_t slice_id = tn_req->proc_id; slice_id < tn_req->num_slices; slice_id += tn_req->num_procs){ HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle, tn_req->comp_plan, tn_req->data_in,tn_req->data_out, tn_req->workspace,tn_req->worksize, slice_id,tn_req->stream)); } HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream)); const auto output_hash = tn_req->network->getTensor(0)->getTensorHash(); auto iter = tn_req->tensor_descriptors.find(output_hash); assert(iter != tn_req->tensor_descriptors.cend()); const auto & descr = iter->second; HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.src_ptr,descr.dst_ptr[gpu], descr.size,cudaMemcpyDefault,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_out_finish,tn_req->stream)); } tn_req->exec_status = TensorNetworkQueue::ExecStat::Executing; return; Loading @@ -492,7 +513,7 @@ void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req) for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish); cudaError_t cuda_error = cudaEventQuery(tn_req->data_out_finish); if(cuda_error == cudaSuccess){ if(tn_req->memory_window_ptr[gpu] != nullptr){ mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]); Loading src/runtime/executor/cuquantum/cuquantum_executor.hpp +12 −3 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -35,6 +35,13 @@ using TensorImplTalshFunc = std::function<std::shared_ptr<talsh::Tensor>(const n struct TensorNetworkReq; struct ExecutionTimings { float prepare = 0.0; float data_in = 0.0; float data_out = 0.0; float compute = 0.0; }; class CuQuantumExecutor { Loading Loading @@ -63,12 +70,14 @@ public: If wait = TRUE, waits until completion, otherwise just tests the progress. Returns the current status of the tensor network execution. **/ TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle, //in: tensor network execution handle int * error_code); //out: error code (0:success) int * error_code, //out: error code (0:success) int64_t * num_slices = nullptr, //out: number of tensor network slices ExecutionTimings * timings = nullptr); //out: execution timings (ms) /** Synchronizes execution of all submitted tensor networks to completion. **/ void sync(); /** Returns total executed flop count. **/ /** Returns the total executed flop count. **/ double getTotalFlopCount() const {return flops_;} protected: Loading src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp +8 −3 Original line number Diff line number Diff line /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -298,9 +298,11 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) { const auto current = tensor_network_queue.getCurrent(); const auto exec_handle = current->second; int error_code = 0; int64_t num_slices = 0; ExecutionTimings timings; auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle); if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){ exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,&num_slices,&timings); //this call will progress tensor network execution assert(error_code == 0); } if(exec_stat == TensorNetworkQueue::ExecStat::None){ Loading Loading @@ -329,7 +331,10 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) { }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){ if(logging_.load() != 0){ logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle << std::endl; << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle << ": NumSlices = " << num_slices << "; Time (ms): In{" << timings.data_in << "}, Prep{" << timings.prepare << "}, Comp{" << timings.compute << "}, Out{" << timings.data_out << "}" << std::endl; #ifdef DEBUG logfile_.flush(); #endif Loading Loading
src/exatn/tests/NumServerTester.cpp +26 −4 Original line number Diff line number Diff line Loading @@ -3789,9 +3789,9 @@ TEST(NumServerTester, CuTensorNet) { const auto TENS_ELEM_TYPE = TensorElementType::REAL32; const int NUM_REPEATS = 3; const int NUM_REPEATS = 10; //exatn::resetLoggingLevel(2,2); //debug exatn::resetLoggingLevel(2,2); //debug bool success = true; Loading @@ -3808,7 +3808,6 @@ TEST(NumServerTester, CuTensorNet) { success = exatn::initTensor("D",0.0); assert(success); success = exatn::sync(); assert(success); exatn::switchComputationalBackend("default"); //{default|cuquantum} //Contract tensor network: int num_repeats = NUM_REPEATS; Loading @@ -3822,9 +3821,32 @@ TEST(NumServerTester, CuTensorNet) { flops = exatn::getTotalFlopCount() - flops; std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9 << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; double norm = 0.0; success = exatn::computeNorm1Sync("D",norm); assert(success); std::cout << "1-norm of tensor D = " << norm << std::endl; } //std::this_thread::sleep_for(std::chrono::microseconds(1000000)); #ifdef CUQUANTUM success = exatn::sync(); assert(success); exatn::switchComputationalBackend("cuquantum"); //{default|cuquantum} //Contract tensor network: num_repeats = NUM_REPEATS; while(--num_repeats >= 0){ std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): "; auto flops = exatn::getTotalFlopCount(); auto time_start = exatn::Timer::timeInSecHR(); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D",true); assert(success); auto duration = exatn::Timer::timeInSecHR(time_start); flops = exatn::getTotalFlopCount() - flops; std::cout << "Duration = " << duration << " s; GFlop count = " << flops/1e9 << "; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; double norm = 0.0; success = exatn::computeNorm1Sync("D",norm); assert(success); std::cout << "1-norm of tensor D = " << norm << std::endl; } #endif //Destroy tensors: success = exatn::sync(); assert(success); Loading
src/runtime/executor/cuquantum/cuquantum_executor.cu +37 −16 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading @@ -21,6 +21,7 @@ Rationale: #include <iostream> #include "talshxx.hpp" #include "timers.hpp" #include "cuquantum_executor.hpp" Loading Loading @@ -55,6 +56,7 @@ struct TensorNetworkReq { TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status int num_procs = 0; //total number of executing processes int proc_id = -1; //id of the current executing process int64_t num_slices = 0; std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification std::unordered_map<numerics::TensorHashType, TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body) std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key = original tensor id) Loading @@ -81,17 +83,21 @@ struct TensorNetworkReq { cudaDataType_t data_type; cutensornetComputeType_t compute_type; cudaStream_t stream; cudaEvent_t data_start; cudaEvent_t data_finish; cudaEvent_t data_in_start; cudaEvent_t data_in_finish; cudaEvent_t compute_start; cudaEvent_t compute_finish; cudaEvent_t data_out_finish; double prepare_start; double prepare_finish; ~TensorNetworkReq() { cudaStreamSynchronize(stream); cudaEventDestroy(data_out_finish); cudaEventDestroy(compute_finish); cudaEventDestroy(compute_start); cudaEventDestroy(data_finish); cudaEventDestroy(data_start); cudaEventDestroy(data_in_finish); cudaEventDestroy(data_in_start); cudaStreamDestroy(stream); cutensornetDestroyContractionPlan(comp_plan); cutensornetDestroyContractionOptimizerConfig(opt_config); Loading Loading @@ -205,7 +211,9 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle, int * error_code) int * error_code, int64_t * num_slices, ExecutionTimings * timings) { *error_code = 0; TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None; Loading @@ -223,6 +231,15 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle ex contractTensorNetwork(tn_req); //Planning --> Executing } exec_stat = tn_req->exec_status; if(exec_stat == TensorNetworkQueue::ExecStat::Completed){ if(num_slices != nullptr) *num_slices = tn_req->num_slices; if(timings != nullptr){ timings->prepare = (tn_req->prepare_finish - tn_req->prepare_start) * 1000.0; HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_in),tn_req->data_in_start,tn_req->data_in_finish)); HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->data_out),tn_req->compute_finish,tn_req->data_out_finish)); HANDLE_CUDA_ERROR(cudaEventElapsedTime(&(timings->compute),tn_req->compute_start,tn_req->compute_finish)); } } tn_req.reset(); if(exec_stat == TensorNetworkQueue::ExecStat::Completed) active_networks_.erase(iter); } Loading Loading @@ -368,10 +385,11 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_ tn_req->num_modes_out,tn_req->extents_out,tn_req->strides_out,tn_req->modes_out,tn_req->alignment_out, tn_req->data_type,tn_req->compute_type,&(tn_req->net_descriptor))); HANDLE_CUDA_ERROR(cudaStreamCreate(&(tn_req->stream))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_in_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_start))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->compute_finish))); HANDLE_CUDA_ERROR(cudaEventCreate(&(tn_req->data_out_finish))); } return; } Loading @@ -383,7 +401,6 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_start,tn_req->stream)); void * prev_front = mem_pool_[gpu].getFront(); bool success = true; //Acquire device memory: Loading @@ -394,11 +411,12 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) } if(success){ //Initiate data transfers: HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_start,tn_req->stream)); for(auto & descr: tn_req->tensor_descriptors){ HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.second.dst_ptr.back(),descr.second.src_ptr, descr.second.size,cudaMemcpyDefault,tn_req->stream)); } HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_in_finish,tn_req->stream)); tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront()); auto & net = *(tn_req->network); int32_t tens_num = 0; Loading Loading @@ -428,6 +446,7 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req) void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) { //Configure tensor network contraction on one or all GPUs: tn_req->prepare_start = Timer::timeInSecHR(); for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); Loading @@ -447,6 +466,7 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) &flops,sizeof(flops))); flops_ += flops; } tn_req->prepare_finish = Timer::timeInSecHR(); tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning; return; } Loading @@ -458,27 +478,28 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq> for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); int64_t num_slices = 0; tn_req->num_slices = 0; HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle, tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES, &num_slices,sizeof(num_slices))); assert(num_slices > 0); &(tn_req->num_slices),sizeof(tn_req->num_slices))); assert(tn_req->num_slices > 0); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream)); for(int64_t slice_id = tn_req->proc_id; slice_id < num_slices; slice_id += tn_req->num_procs){ for(int64_t slice_id = tn_req->proc_id; slice_id < tn_req->num_slices; slice_id += tn_req->num_procs){ HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle, tn_req->comp_plan, tn_req->data_in,tn_req->data_out, tn_req->workspace,tn_req->worksize, slice_id,tn_req->stream)); } HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream)); const auto output_hash = tn_req->network->getTensor(0)->getTensorHash(); auto iter = tn_req->tensor_descriptors.find(output_hash); assert(iter != tn_req->tensor_descriptors.cend()); const auto & descr = iter->second; HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.src_ptr,descr.dst_ptr[gpu], descr.size,cudaMemcpyDefault,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream)); HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_out_finish,tn_req->stream)); } tn_req->exec_status = TensorNetworkQueue::ExecStat::Executing; return; Loading @@ -492,7 +513,7 @@ void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req) for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now const auto gpu_id = gpu_attr_[gpu].first; HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id)); cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish); cudaError_t cuda_error = cudaEventQuery(tn_req->data_out_finish); if(cuda_error == cudaSuccess){ if(tn_req->memory_window_ptr[gpu] != nullptr){ mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]); Loading
src/runtime/executor/cuquantum/cuquantum_executor.hpp +12 −3 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -35,6 +35,13 @@ using TensorImplTalshFunc = std::function<std::shared_ptr<talsh::Tensor>(const n struct TensorNetworkReq; struct ExecutionTimings { float prepare = 0.0; float data_in = 0.0; float data_out = 0.0; float compute = 0.0; }; class CuQuantumExecutor { Loading Loading @@ -63,12 +70,14 @@ public: If wait = TRUE, waits until completion, otherwise just tests the progress. Returns the current status of the tensor network execution. **/ TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle, //in: tensor network execution handle int * error_code); //out: error code (0:success) int * error_code, //out: error code (0:success) int64_t * num_slices = nullptr, //out: number of tensor network slices ExecutionTimings * timings = nullptr); //out: execution timings (ms) /** Synchronizes execution of all submitted tensor networks to completion. **/ void sync(); /** Returns total executed flop count. **/ /** Returns the total executed flop count. **/ double getTotalFlopCount() const {return flops_;} protected: Loading
src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp +8 −3 Original line number Diff line number Diff line /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy REVISION: 2022/01/08 REVISION: 2022/01/10 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -298,9 +298,11 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) { const auto current = tensor_network_queue.getCurrent(); const auto exec_handle = current->second; int error_code = 0; int64_t num_slices = 0; ExecutionTimings timings; auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle); if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){ exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,&num_slices,&timings); //this call will progress tensor network execution assert(error_code == 0); } if(exec_stat == TensorNetworkQueue::ExecStat::None){ Loading Loading @@ -329,7 +331,10 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) { }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){ if(logging_.load() != 0){ logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle << std::endl; << "](LazyGraphExecutor)[EXEC_THREAD]: Completed via cuQuantum tensor network " << exec_handle << ": NumSlices = " << num_slices << "; Time (ms): In{" << timings.data_in << "}, Prep{" << timings.prepare << "}, Comp{" << timings.compute << "}, Out{" << timings.data_out << "}" << std::endl; #ifdef DEBUG logfile_.flush(); #endif Loading