Commit ed6dc07f authored by Dmitry I. Lyakh

Finished CuQuantum executor for one GPU per process, needs testing ...


Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>
parent 6516d89e
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/04
REVISION: 2022/01/05
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -91,6 +91,9 @@ struct TensorNetworkReq {
cudaEventDestroy(data_finish);
cudaEventDestroy(data_start);
cudaStreamDestroy(stream);
cutensornetDestroyContractionPlan(comp_plan);
cutensornetDestroyContractionOptimizerConfig(opt_config);
cutensornetDestroyContractionOptimizerInfo(opt_info);
cutensornetDestroyNetworkDescriptor(net_descriptor);
if(modes_out != nullptr) delete [] modes_out;
if(strides_out != nullptr) delete [] strides_out;
......@@ -105,8 +108,8 @@ struct TensorNetworkReq {
};
CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
tensor_data_access_func_(std::move(tensor_data_access_func))
CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
{
static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");
......@@ -118,6 +121,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
for(int i = 0; i < num_gpus; ++i){
if(talshDeviceState(i,DEV_NVIDIA_GPU) >= DEV_ON){
gpu_attr_.emplace_back(std::make_pair(i,DeviceAttr{}));
gpu_attr_.back().second.pipe_level = 0;
gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
......@@ -257,6 +261,19 @@ cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_typ
}
void CuQuantumExecutor::acquireWorkspace(unsigned int dev,
void ** workspace_ptr,
uint64_t * workspace_size)
{
assert(dev < gpu_attr_.size());
auto & dev_attr = gpu_attr_[dev].second;
 //Each in-flight pipeline level gets an equal slice of the device workspace buffer:
 *workspace_size = dev_attr.workspace_size / pipe_depth_;
 *workspace_ptr = (void*)((char*)(dev_attr.workspace_ptr) + ((*workspace_size) * dev_attr.pipe_level));
 //Advance the round-robin pipeline level:
 dev_attr.pipe_level = (dev_attr.pipe_level + 1) % pipe_depth_;
return;
}
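
acquireWorkspace splits each GPU's scratch buffer into pipe_depth_ equal slices and hands them out round-robin, so up to pipe_depth_ tensor networks can hold device workspace at the same time. A minimal self-contained sketch of the same slicing scheme; ScratchPool, its members, and the buffer sizes are illustrative assumptions, not ExaTN types:

#include <cstdint>
#include <cstdio>

//Hypothetical stand-in for the per-GPU attributes used above (illustration only):
struct ScratchPool {
 char * base = nullptr;       //base of the device scratch buffer
 std::uint64_t total = 0;     //total scratch bytes on this GPU
 unsigned int level = 0;      //next pipeline slot to hand out
 unsigned int depth = 2;      //pipeline depth (e.g. CUQUANTUM_PIPELINE_DEPTH)

 //Round-robin slice allocator: slot i occupies [i*slice, (i+1)*slice):
 void acquire(void ** ptr, std::uint64_t * size) {
  *size = total / depth;
  *ptr = static_cast<void*>(base + (*size) * level);
  level = (level + 1) % depth;
 }
};

int main() {
 static char buffer[1024];            //host memory standing in for the GPU buffer
 ScratchPool pool{buffer, sizeof(buffer), 0, 2};
 for(int i = 0; i < 4; ++i){          //two pipeline slots, reused cyclically
  void * ws = nullptr; std::uint64_t sz = 0;
  pool.acquire(&ws, &sz);
  std::printf("request %d: offset = %td, size = %llu\n", i,
              static_cast<char*>(ws) - buffer,
              static_cast<unsigned long long>(sz));
 }
 return 0;
}
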
void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
{
const auto & net = *(tn_req->network);
......@@ -266,6 +283,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
tn_req->strides_in = new int64_t*[num_input_tensors];
tn_req->modes_in = new int32_t*[num_input_tensors];
tn_req->alignments_in = new uint32_t[num_input_tensors];
tn_req->data_in = new void*[num_input_tensors];
for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;
......@@ -370,7 +388,21 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
}
HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
}else{
auto & net = *(tn_req->network);
int32_t tens_num = 0;
for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
const auto tens_id = iter->first;
const auto & tens = iter->second;
const auto tens_hash = tens.getTensor()->getTensorHash();
auto descr = tn_req->tensor_descriptors.find(tens_hash);
void * dev_ptr = descr->second.dst_ptr.back();
if(tens_id == 0){
tn_req->data_out = dev_ptr;
}else{
tn_req->data_in[tens_num++] = dev_ptr;
}
}
}else{ //not enough memory currently
//Restore previous memory front:
mem_pool_[gpu].restorePreviousFront(prev_front);
return;
......@@ -385,16 +417,22 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
{
//Configure tensor network contraction on one or all GPUs:
for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
const auto gpu_id = gpu_attr_[gpu].first;
HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerConfig(gpu_attr_[gpu].second.cutn_handle,&(tn_req->opt_config)));
HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerInfo(gpu_attr_[gpu].second.cutn_handle,tn_req->net_descriptor,&(tn_req->opt_info)));
tn_req->worksize = gpu_attr_[gpu].second.workspace_size / PIPELINE_DEPTH;
tn_req->workspace = gpu_attr_[gpu].second.workspace_ptr; //`Need moving window (pipelining)
acquireWorkspace(gpu,&(tn_req->workspace),&(tn_req->worksize));
HANDLE_CTN_ERROR(cutensornetContractionOptimize(gpu_attr_[gpu].second.cutn_handle,
tn_req->net_descriptor,tn_req->opt_config,
tn_req->worksize,tn_req->opt_info));
tn_req->net_descriptor,tn_req->opt_config,
tn_req->worksize,tn_req->opt_info));
HANDLE_CTN_ERROR(cutensornetCreateContractionPlan(gpu_attr_[gpu].second.cutn_handle,
tn_req->net_descriptor,tn_req->opt_info,
tn_req->worksize,&(tn_req->comp_plan)));
tn_req->net_descriptor,tn_req->opt_info,
tn_req->worksize,&(tn_req->comp_plan)));
double flops = 0.0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
tn_req->opt_info,
CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
&flops,sizeof(flops)));
}
tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
return;
......@@ -406,20 +444,20 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
//Execute the contraction plan on one or all GPUs:
for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
const auto gpu_id = gpu_attr_[gpu].first;
HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
int64_t num_slices = 0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
tn_req->opt_info,
CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
&num_slices,sizeof(num_slices)));
tn_req->opt_info,
CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
&num_slices,sizeof(num_slices)));
assert(num_slices > 0);
HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
tn_req->comp_plan,
tn_req->data_in,tn_req->data_out,
tn_req->workspace,tn_req->worksize,
slice_id,tn_req->stream));
tn_req->comp_plan,
tn_req->data_in,tn_req->data_out,
tn_req->workspace,tn_req->worksize,
slice_id,tn_req->stream));
}
HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream));
}
......@@ -431,13 +469,21 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
//Test work completion on one or all GPUs:
bool all_completed = true;
for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
const auto gpu_id = gpu_attr_[gpu].first;
HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
if(cuda_error == cudaSuccess){
//`Move MemPool back forward using tn_req->memory_window_ptr
tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
if(tn_req->memory_window_ptr[gpu] != nullptr){
mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
tn_req->memory_window_ptr[gpu] = nullptr;
}
}else{
all_completed = false;
}
}
if(all_completed) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
return;
}
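
testCompletion is built on the non-blocking semantics of cudaEventQuery, which the loop above folds into a done/not-done check. For reference, a small hedged helper spelling out the full three-way contract of that call (the helper name is an assumption; only the standard CUDA runtime API is used):

#include <cuda_runtime.h>

//Hypothetical helper: non-blocking completion test for a recorded CUDA event.
//Returns true once all work preceding the event has finished, false while it
//is still pending; any other query result is a genuine error left in 'error'.
inline bool eventCompleted(cudaEvent_t event, cudaError_t & error)
{
 error = cudaEventQuery(event);
 if(error == cudaSuccess) return true;   //work preceding the event has completed
 if(error == cudaErrorNotReady){         //still in flight: not a failure
  error = cudaSuccess;
  return false;
 }
 return false;                           //real CUDA error reported via 'error'
}
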
......
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/04
REVISION: 2022/01/05
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -40,7 +40,8 @@ class CuQuantumExecutor {
public:
CuQuantumExecutor(TensorImplFunc tensor_data_access_func);
CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
unsigned int pipeline_depth);
CuQuantumExecutor(const CuQuantumExecutor &) = delete;
CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;
......@@ -65,10 +66,13 @@ public:
protected:
static constexpr float WORKSPACE_FRACTION = 0.2;
static constexpr unsigned int PIPELINE_DEPTH = 1;
static constexpr float WORKSPACE_FRACTION = 0.6;
static constexpr std::size_t MEM_ALIGNMENT = 256;
void acquireWorkspace(unsigned int dev,
void ** workspace_ptr,
uint64_t * workspace_size);
void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
......@@ -80,6 +84,7 @@ protected:
std::size_t buffer_size = 0;
void * workspace_ptr = nullptr;
std::size_t workspace_size = 0;
unsigned int pipe_level = 0;
void * cutn_handle; //cutensornetHandle_t = void*
};
......@@ -91,6 +96,8 @@ protected:
std::vector<LinearMemoryPool> mem_pool_;
/** Tensor data access function **/
TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
/** Pipeline depth **/
const unsigned int pipe_depth_;
};
} //namespace runtime
......
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
REVISION: 2021/12/30
REVISION: 2022/01/05
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
Rationale:
- ExaTN graph executor may accept whole tensor networks for execution
......@@ -144,7 +144,7 @@ public:
return current_network_;
}
/** Returns the current iterator to the beginning of the queue. **/
/** Resets the current iterator to the beginning of the queue. **/
void reset() {
lock();
current_network_ = networks_.begin();
......@@ -173,6 +173,12 @@ public:
return not_over;
}
/** Returns the distance from the current iterator
to the beginning of the queue. **/
auto getCurrentPos() {
return std::distance(networks_.begin(),current_network_);
}
/** Locks. **/
inline void lock(){queue_lock_.lock();}
inline void unlock(){queue_lock_.unlock();}
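
getCurrentPos reports the zero-based distance of the queue cursor from the head, which is what bounds the number of networks in flight in the executor change further down. A hedged fragment of the intended usage (queue variable and depth constant as in that change; not a complete program):

 //Hedged usage fragment: stop issuing new work once the cursor leaves the pipeline window.
 tensor_network_queue.reset();
 while(!tensor_network_queue.isOver()){
  if(tensor_network_queue.getCurrentPos() >= CUQUANTUM_PIPELINE_DEPTH) break; //window is full
  const auto current = tensor_network_queue.getCurrent(); //{network, execution handle} pair
  //... inspect or submit current->first under handle current->second ...
  if(!tensor_network_queue.next()) break;                 //advance the cursor
 }
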
......
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2022/01/03
REVISION: 2022/01/05
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -35,7 +35,8 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
[this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
return data_ptr;
}
},
CUQUANTUM_PIPELINE_DEPTH
);
}
#endif
......@@ -277,7 +278,7 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {
void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
#ifdef CUQUANTUM
std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
<< tensor_network_queue.getSize() << " elements detected" << std::endl;
<< tensor_network_queue.getSize() << " networks detected" << std::endl;
assert(node_executor_);
//Synchronize the node executor:
bool synced = node_executor_->sync(); assert(synced);
......@@ -287,26 +288,35 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
tensor_network_queue.reset();
bool not_over = !tensor_network_queue.isOver();
while(not_over){
int error_code = 0;
const auto current = tensor_network_queue.getCurrent();
const auto exec_handle = current->second;
auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
assert(error_code == 0);
if(exec_stat == TensorNetworkQueue::ExecStat::None){
exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
if(exec_stat != TensorNetworkQueue::ExecStat::None){
const auto current_pos = tensor_network_queue.getCurrentPos();
if(current_pos < CUQUANTUM_PIPELINE_DEPTH){
const auto current = tensor_network_queue.getCurrent();
const auto exec_handle = current->second;
int error_code = 0;
auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
assert(error_code == 0);
}
if(exec_stat == TensorNetworkQueue::ExecStat::None){
exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
if(exec_stat != TensorNetworkQueue::ExecStat::None){
auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Submitted tensor network to cuQuantum\n";
}
not_over = tensor_network_queue.next();
}else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
assert(current_pos == 0);
tensor_network_queue.remove();
std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
not_over = !tensor_network_queue.isOver();
}else{
auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Submitted tensor network to cuQuantum\n";
not_over = tensor_network_queue.next();
}
not_over = tensor_network_queue.next();
}else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
tensor_network_queue.remove();
std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
not_over = !tensor_network_queue.isOver();
}else{
auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
not_over = tensor_network_queue.next();
not_over = false;
}
}
}
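
Since the hunk above interleaves the old and new control flow, the intent of the reworked loop is easier to see in isolation: keep at most CUQUANTUM_PIPELINE_DEPTH networks in flight, poll only the head of the window, and retire completed networks in order. A simplified, self-contained model of that policy (all types and names below are illustrative stand-ins, not ExaTN classes):

#include <algorithm>
#include <cstddef>
#include <deque>

enum class Stat { None, Submitted, Completed };
struct Job { Stat stat = Stat::None; };

//Drain a queue while keeping at most 'depth' jobs in flight; only the head of
//the window is polled, and jobs are retired strictly in submission order.
template <typename Submit, typename Poll>
void drain(std::deque<Job> & queue, std::size_t depth, Submit submit, Poll poll)
{
 while(!queue.empty()){
  const std::size_t window = std::min(depth,queue.size());
  for(std::size_t pos = 0; pos < window; ++pos){
   Job & job = queue[pos];
   if(job.stat == Stat::None){                      //not yet submitted: submit it
    if(submit(job)) job.stat = Stat::Submitted;
   }else if(pos == 0 && poll(job)){                 //poll only the head of the window
    job.stat = Stat::Completed;
   }
  }
  if(queue.front().stat == Stat::Completed) queue.pop_front(); //retire in order
 }
}

int main()
{
 std::deque<Job> queue(3);  //three pending "networks", pipeline depth of two
 int ticks = 0;
 drain(queue,2,
       [](Job &){return true;},                      //submission always succeeds here
       [&ticks](Job &){return (++ticks % 3) == 0;}); //head completes every third poll
 return queue.empty() ? 0 : 1;
}
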
......
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2021/12/22
REVISION: 2022/01/05
Copyright (C) 2018-2021 Dmitry Lyakh, Alex McCaskey
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
Rationale:
......@@ -26,6 +26,9 @@ public:
static constexpr const unsigned int DEFAULT_PIPELINE_DEPTH = 16;
static constexpr const unsigned int DEFAULT_PREFETCH_DEPTH = 4;
#ifdef CUQUANTUM
static constexpr const unsigned int CUQUANTUM_PIPELINE_DEPTH = 2;
#endif
LazyGraphExecutor(): pipeline_depth_(DEFAULT_PIPELINE_DEPTH),
prefetch_depth_(DEFAULT_PREFETCH_DEPTH)
......