src/runtime/executor/cuquantum/cuquantum_executor.cu  (+66 −20)

 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

@@ -91,6 +91,9 @@ struct TensorNetworkReq {
   cudaEventDestroy(data_finish);
   cudaEventDestroy(data_start);
   cudaStreamDestroy(stream);
+  cutensornetDestroyContractionPlan(comp_plan);
+  cutensornetDestroyContractionOptimizerConfig(opt_config);
+  cutensornetDestroyContractionOptimizerInfo(opt_info);
   cutensornetDestroyNetworkDescriptor(net_descriptor);
   if(modes_out != nullptr) delete [] modes_out;
   if(strides_out != nullptr) delete [] strides_out;

@@ -105,8 +108,8 @@ struct TensorNetworkReq {
 };

-CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
- tensor_data_access_func_(std::move(tensor_data_access_func))
+CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
+ tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
 {
  static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");

@@ -118,6 +121,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
  for(int i = 0; i < num_gpus; ++i){
   if(talshDeviceState(i,DEV_NVIDIA_GPU) >= DEV_ON){
    gpu_attr_.emplace_back(std::make_pair(i,DeviceAttr{}));
+   gpu_attr_.back().second.pipe_level = 0;
    gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
    assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
    gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);

@@ -257,6 +261,19 @@ cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_typ
 }

+void CuQuantumExecutor::acquireWorkspace(unsigned int dev, void ** workspace_ptr, uint64_t * workspace_size)
+{
+ assert(dev < gpu_attr_.size());
+ auto & dev_attr = gpu_attr_[dev].second;
+ *workspace_size = dev_attr.workspace_size / pipe_depth_;
+ *workspace_ptr = (void*)((char*)(dev_attr.workspace_ptr) + ((*workspace_size) * dev_attr.pipe_level));
+ dev_attr.pipe_level = (++(dev_attr.pipe_level)) % pipe_depth_;
+ return;
+}
+
 void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  const auto & net = *(tn_req->network);

@@ -266,6 +283,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
  tn_req->strides_in = new int64_t*[num_input_tensors];
  tn_req->modes_in = new int32_t*[num_input_tensors];
  tn_req->alignments_in = new uint32_t[num_input_tensors];
+ tn_req->data_in = new void*[num_input_tensors];
  for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
  for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;

@@ -370,7 +388,21 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
   tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
+  auto & net = *(tn_req->network);
+  int32_t tens_num = 0;
+  for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
+   const auto tens_id = iter->first;
+   const auto & tens = iter->second;
+   const auto tens_hash = tens.getTensor()->getTensorHash();
+   auto descr = tn_req->tensor_descriptors.find(tens_hash);
+   void * dev_ptr = descr->second.dst_ptr.back();
+   if(tens_id == 0){
+    tn_req->data_out = dev_ptr;
+   }else{
+    tn_req->data_in[tens_num++] = dev_ptr;
+   }
+  }
  }else{ //no enough memory currently
   //Restore previous memory front:
   mem_pool_[gpu].restorePreviousFront(prev_front);
   return;

@@ -385,16 +417,22 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  //Configure tensor network contraction on one or all GPUs:
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
   HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerConfig(gpu_attr_[gpu].second.cutn_handle,&(tn_req->opt_config)));
   HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerInfo(gpu_attr_[gpu].second.cutn_handle,tn_req->net_descriptor,&(tn_req->opt_info)));
-  tn_req->worksize = gpu_attr_[gpu].second.workspace_size / PIPELINE_DEPTH;
-  tn_req->workspace = gpu_attr_[gpu].second.workspace_ptr; //`Need moving window (pipelining)
+  acquireWorkspace(gpu,&(tn_req->workspace),&(tn_req->worksize));
   HANDLE_CTN_ERROR(cutensornetContractionOptimize(gpu_attr_[gpu].second.cutn_handle, tn_req->net_descriptor,tn_req->opt_config, tn_req->worksize,tn_req->opt_info));
   HANDLE_CTN_ERROR(cutensornetCreateContractionPlan(gpu_attr_[gpu].second.cutn_handle, tn_req->net_descriptor,tn_req->opt_info, tn_req->worksize,&(tn_req->comp_plan)));
+  double flops = 0.0;
+  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle, tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT, &flops,sizeof(flops)));
  }
  tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
  return;

@@ -406,13 +444,13 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
  //Execute the contraction plan on one or all GPUs:
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
+  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   int64_t num_slices = 0;
   HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle, tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES, &num_slices,sizeof(num_slices)));
   assert(num_slices > 0);
-  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
   for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
    HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,

@@ -431,13 +469,21 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
 void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  //Test work completion on one or all GPUs:
+ bool all_completed = true;
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
   HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
   if(cuda_error == cudaSuccess){
-   //`Move MemPool back forward using tn_req->memory_window_ptr
-   tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
+   if(tn_req->memory_window_ptr[gpu] != nullptr){
+    mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
+    tn_req->memory_window_ptr[gpu] = nullptr;
+   }
+  }else{
+   all_completed = false;
   }
  }
+ if(all_completed) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
  return;
 }
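The new acquireWorkspace() replaces the fixed workspace_size / PIPELINE_DEPTH assignment in planExecution(): each GPU's cuTensorNet workspace is cut into pipe_depth_ equal windows and handed out round-robin via the per-device pipe_level counter, so networks planned back to back receive disjoint scratch regions. Below is a minimal standalone sketch of that windowing policy; the WorkspaceWindow struct and acquire() function are illustrative stand-ins, not the actual DeviceAttr/acquireWorkspace code.

#include <cassert>
#include <cstdint>

// Illustrative per-device bookkeeping, mirroring the pipe_level counter added to DeviceAttr.
struct WorkspaceWindow {
  void *   base  = nullptr; // base pointer of the device workspace buffer
  uint64_t size  = 0;       // total workspace bytes on this device
  unsigned level = 0;       // next pipeline slot to hand out (0 .. depth-1)
  unsigned depth = 1;       // pipeline depth (pipe_depth_)
};

// Round-robin acquisition: window i starts at base + i * (size / depth).
inline void acquire(WorkspaceWindow & w, void ** ptr, uint64_t * bytes)
{
  assert(w.depth > 0);
  *bytes = w.size / w.depth;                               // equal split per pipeline stage
  *ptr = static_cast<char*>(w.base) + (*bytes) * w.level;  // disjoint window per stage
  w.level = (w.level + 1) % w.depth;                       // advance to the next window
}

With the depth of 2 wired in from LazyGraphExecutor, two in-flight networks alternate between the two halves of the workspace, which is presumably what allows one network to be planned and loaded while another is still contracting.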
src/runtime/executor/cuquantum/cuquantum_executor.hpp  (+11 −4)

 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

@@ -40,7 +40,8 @@ class CuQuantumExecutor {
 public:

-CuQuantumExecutor(TensorImplFunc tensor_data_access_func);
+CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth);

 CuQuantumExecutor(const CuQuantumExecutor &) = delete;
 CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;

@@ -65,10 +66,13 @@ public:
 protected:

-static constexpr float WORKSPACE_FRACTION = 0.2;
-static constexpr unsigned int PIPELINE_DEPTH = 1;
+static constexpr float WORKSPACE_FRACTION = 0.6;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+void acquireWorkspace(unsigned int dev, void ** workspace_ptr, uint64_t * workspace_size);
+
 void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
 void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
 void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);

@@ -80,6 +84,7 @@ protected:
  std::size_t buffer_size = 0;
  void * workspace_ptr = nullptr;
  std::size_t workspace_size = 0;
+ unsigned int pipe_level = 0;
  void * cutn_handle; //cutensornetHandle_t = void*
 };

@@ -91,6 +96,8 @@ protected:
 std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
+/** Pipeline depth **/
+const unsigned int pipe_depth_;
 };

 } //namespace runtime
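A quick sizing check on the constants changed here, assuming workspace_size is derived as buffer_size * WORKSPACE_FRACTION (that derivation is not shown in this diff): the fraction grows from 0.2 to 0.6 while the workspace is now split across pipeline stages, so the scratch available to a single network actually increases. Hypothetical numbers:

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t buffer_size = 16ULL << 30; // hypothetical 16 GiB TAL-SH device buffer
  // Before: WORKSPACE_FRACTION = 0.2, PIPELINE_DEPTH = 1  -> 0.2 of the buffer per network
  // After:  WORKSPACE_FRACTION = 0.6, pipeline depth = 2  -> 0.3 of the buffer per network
  const std::size_t old_per_network = static_cast<std::size_t>(buffer_size * 0.2) / 1;
  const std::size_t new_per_network = static_cast<std::size_t>(buffer_size * 0.6) / 2;
  std::printf("per-network workspace: old %zu bytes, new %zu bytes\n",
              old_per_network, new_per_network);
  return 0;
}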
src/runtime/executor/cuquantum/tensor_network_queue.hpp  (+10 −4)

 /** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/30
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:
 - ExaTN graph executor may accept whole tensor networks for execution

@@ -144,7 +144,7 @@ public:
  return current_network_;
 }

-/** Returns the current iterator to the beginning of the queue. **/
+/** Resets the current iterator to the beginning of the queue. **/
 void reset() {
  lock();
  current_network_ = networks_.begin();

@@ -173,6 +173,12 @@ public:
  return not_over;
 }

+/** Returns the distance from the current iterator to the beginning of the queue. **/
+auto getCurrentPos() {
+ return std::distance(networks_.begin(),current_network_);
+}
+
 /** Locks. **/
 inline void lock(){queue_lock_.lock();}
 inline void unlock(){queue_lock_.unlock();}
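getCurrentPos() is simply the std::distance of the current iterator from the queue head, so 0 means the head of the queue; the lazy executor below uses it both to cap how many networks are in flight and to assert that only the head is ever removed. A tiny sketch of the semantics (std::list is a stand-in; the queue's real container type is not shown in this diff):

#include <iostream>
#include <iterator>
#include <list>

int main()
{
  std::list<int> networks = {101, 102, 103}; // stand-in for the queued tensor networks
  auto current = networks.begin();
  std::advance(current, 2);                  // pretend two networks are already ahead of us
  // This is what getCurrentPos() reports: the head of the queue yields 0.
  std::cout << std::distance(networks.begin(), current) << '\n'; // prints 2
  return 0;
}

For a node-based container this distance is computed by walking from the head, which stays cheap here because the executor only ever inspects the first CUQUANTUM_PIPELINE_DEPTH entries.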
src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp  (+30 −20)

 /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2022/01/03
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

@@ -35,7 +35,8 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
   [this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
    void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
    return data_ptr;
-  }
+  },
+  CUQUANTUM_PIPELINE_DEPTH
  );
 }
 #endif

@@ -277,7 +278,7 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {
 void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
 #ifdef CUQUANTUM
  std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
-           << tensor_network_queue.getSize() << " elements detected" << std::endl;
+           << tensor_network_queue.getSize() << " networks detected" << std::endl;
  assert(node_executor_);
  //Synchronize the node executor:
  bool synced = node_executor_->sync(); assert(synced);

@@ -287,11 +288,16 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
  tensor_network_queue.reset();
  bool not_over = !tensor_network_queue.isOver();
  while(not_over){
-  int error_code = 0;
+  const auto current_pos = tensor_network_queue.getCurrentPos();
+  if(current_pos < CUQUANTUM_PIPELINE_DEPTH){
    const auto current = tensor_network_queue.getCurrent();
    const auto exec_handle = current->second;
-   auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+   int error_code = 0;
+   auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
+   if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
+    exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+    assert(error_code == 0);
+   }
    if(exec_stat == TensorNetworkQueue::ExecStat::None){
     exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
     if(exec_stat != TensorNetworkQueue::ExecStat::None){

@@ -301,6 +307,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
     not_over = tensor_network_queue.next();
    }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
     auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+    assert(current_pos == 0);
     tensor_network_queue.remove();
     std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
     not_over = !tensor_network_queue.isOver();

@@ -308,6 +315,9 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
    }else{
     auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
     not_over = tensor_network_queue.next();
    }
+  }else{
+   not_over = false;
+  }
  }
 }
 cuquantum_executor_->sync();
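The reworked loop caps the number of networks handed to cuQuantum at CUQUANTUM_PIPELINE_DEPTH and only calls the progressing sync() for the queue head or for entries whose recorded status is Idle; anything past the pipeline window ends the pass. Below is a simplified, self-contained model of that gating policy; the ExecStat values and the submit/progress stand-ins are illustrative, not the real CuQuantumExecutor/TensorNetworkQueue API.

#include <cstddef>
#include <deque>
#include <iostream>

enum class ExecStat { None, Idle, Planning, Completed }; // simplified subset of the real statuses

struct Net { int id; ExecStat stat = ExecStat::None; };

constexpr std::size_t kPipelineDepth = 2; // mirrors CUQUANTUM_PIPELINE_DEPTH

// One pass over the queue, mimicking the gating logic of LazyGraphExecutor::execute.
void drive(std::deque<Net> & queue)
{
  std::size_t pos = 0;
  while (pos < queue.size()) {
    if (pos >= kPipelineDepth) break;        // never look past the pipeline window
    Net & net = queue[pos];
    if (net.stat == ExecStat::Idle || pos == 0) {
      // Progress execution (stand-in for cuquantum_executor_->sync(handle,&error_code)).
      if (net.stat == ExecStat::Planning) net.stat = ExecStat::Completed;
    }
    if (net.stat == ExecStat::None) {
      net.stat = ExecStat::Planning;         // stand-in for cuquantum_executor_->execute(...)
      ++pos;
    } else if (net.stat == ExecStat::Completed && pos == 0) {
      queue.pop_front();                     // only the head of the queue is ever removed
    } else {
      ++pos;
    }
  }
}

int main()
{
  std::deque<Net> queue = {{1}, {2}, {3}};
  while (!queue.empty()) drive(queue);       // repeated passes drain the queue in order
  std::cout << "all networks drained\n";
  return 0;
}

Completed networks are retired only from the head of the queue (hence the new assert(current_pos == 0) before remove()), which keeps the pipeline window aligned with the front of the queue.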
src/runtime/executor/graph_executors/lazy/graph_executor_lazy.hpp  (+6 −3)

 /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/22
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh, Alex McCaskey
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:

@@ -26,6 +26,9 @@ public:
 static constexpr const unsigned int DEFAULT_PIPELINE_DEPTH = 16;
 static constexpr const unsigned int DEFAULT_PREFETCH_DEPTH = 4;
+#ifdef CUQUANTUM
+ static constexpr const unsigned int CUQUANTUM_PIPELINE_DEPTH = 2;
+#endif

 LazyGraphExecutor():
  pipeline_depth_(DEFAULT_PIPELINE_DEPTH),
  prefetch_depth_(DEFAULT_PREFETCH_DEPTH)