Commit 3ed2fe51 authored by Dmitry I. Lyakh

Fully designed the execution workflow for CuQuantumExecutor


Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>
parent 7c970df3
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2021/12/29
REVISION: 2021/12/30
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
......@@ -22,8 +22,6 @@ Rationale:
#include "talshxx.hpp"
#include "linear_memory.hpp"
#include "cuquantum_executor.hpp"
......@@ -44,7 +42,6 @@ namespace exatn {
namespace runtime {
struct TensorDescriptor {
std::vector<int32_t> modes; //indices associated with tensor dimensions
std::vector<int64_t> extents; //tensor dimension extents
std::vector<int64_t> strides; //tensor dimension strides (optional)
void * body_ptr = nullptr; //pointer to the tensor body image
......@@ -53,17 +50,18 @@ struct TensorDescriptor {
};
struct TensorNetworkReq {
std::shared_ptr<numerics::TensorNetwork> network;
std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors;
std::unordered_map<int32_t,int64_t> index_extents;
TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status
std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key is the original tensor id)
std::unordered_map<int32_t,int64_t> index_extents; //extent of each registered tensor mode
std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors
cutensornetNetworkDescriptor_t net_descriptor;
cutensornetContractionOptimizerConfig_t opt_config;
cutensornetContractionOptimizerInfo_t opt_info;
cutensornetContractionPlan_t comp_plan;
cudaStream_t stream;
cutensornetComputeType_t compute_type;
void * memory_window_ptr = nullptr;
TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::Idle;
};
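//Illustrative sketch (not part of this commit): how the new tensor_modes and index_extents
//maps are expected to be filled when the tensor network is parsed. The Leg record and the
//helper name are hypothetical stand-ins, not exatn API; shared (contracted) indices reuse
//the same global index id across tensors. Assumes the includes already present in this file.
struct Leg {unsigned int tensor_id; int32_t index_id; int64_t extent;};
inline void registerModes(const std::vector<Leg> & legs,
                          std::unordered_map<unsigned int, std::vector<int32_t>> & tensor_modes,
                          std::unordered_map<int32_t,int64_t> & index_extents)
{
 for(const auto & leg: legs){
  tensor_modes[leg.tensor_id].push_back(leg.index_id); //mode order follows the dimension (leg) order
  index_extents[leg.index_id] = leg.extent; //re-recording the same extent for a shared index is harmless
 }
}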
......@@ -83,11 +81,14 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
std::size_t wrk_size = static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION;
std::size_t wrk_size = (std::size_t)(static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION);
wrk_size -= wrk_size % MEM_ALIGNMENT;
gpu_attr_.back().second.workspace_size = wrk_size;
gpu_attr_.back().second.buffer_size -= wrk_size;
gpu_attr_.back().second.buffer_size -= gpu_attr_.back().second.buffer_size % MEM_ALIGNMENT;
gpu_attr_.back().second.buffer_ptr = (void*)(((char*)(gpu_attr_.back().second.workspace_ptr)) + wrk_size);
mem_pool_.emplace_back(LinearMemoryPool(gpu_attr_.back().second.buffer_ptr,
gpu_attr_.back().second.buffer_size,MEM_ALIGNMENT));
}
}
std::cout << "#DEBUG(exatn::runtime::CuQuantumExecutor): Number of available GPUs = " << gpu_attr_.size() << std::endl;
......@@ -111,7 +112,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
CuQuantumExecutor::~CuQuantumExecutor()
{
bool success = sync(); assert(success);
sync();
for(const auto & gpu: gpu_attr_){
HANDLE_CUDA_ERROR(cudaSetDevice(gpu.first));
HANDLE_CTN_ERROR(cutensornetDestroy((cutensornetHandle_t)(gpu.second.cutn_handle)));
......@@ -130,8 +131,16 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics
if(res.second){
auto tn_req = res.first->second;
tn_req->network = network;
tn_req->exec_status = TensorNetworkQueue::ExecStat::Idle;
parseTensorNetwork(tn_req); //still Idle
loadTensors(tn_req); //Idle --> Loading
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading){
planExecution(tn_req); //Loading --> Planning (while loading data)
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning){
contractTensorNetwork(tn_req); //Planning --> Executing
}
}
exec_stat = tn_req->exec_status;
//`Finish
}else{
std::cout << "#WARNING(exatn::runtime::CuQuantumExecutor): execute: Repeated tensor network submission detected!\n";
}
......@@ -140,26 +149,77 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics
TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle,
int * error_code,
bool wait)
int * error_code)
{
*error_code = 0;
TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None;
auto iter = active_networks_.find(exec_handle);
if(iter != active_networks_.end()){
auto tn_req = iter->second;
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Executing){
testCompletion(tn_req); //Executing --> Completed
}else{
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Idle)
loadTensors(tn_req); //Idle --> Loading
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading)
planExecution(tn_req); //Loading --> Planning (while loading data)
if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning)
contractTensorNetwork(tn_req); //Planning --> Executing
}
exec_stat = tn_req->exec_status;
//`Finish
tn_req.reset();
if(exec_stat == TensorNetworkQueue::ExecStat::Completed)
active_networks_.erase(iter);
}
return exec_stat;
}
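//Usage sketch (illustrative, not part of this commit): with the blocking `wait` flag removed,
//callers drive execution by polling; each sync(exec_handle,...) call advances the request by
//at most one stage and reports the resulting status. The helper name below is hypothetical;
//a real caller (see LazyGraphExecutor) interleaves other work between polls.
inline void pollUntilDone(CuQuantumExecutor & executor, const TensorOpExecHandle exec_handle)
{
 int error_code = 0;
 TensorNetworkQueue::ExecStat stat;
 do{
  stat = executor.sync(exec_handle,&error_code); //non-blocking progress step
  assert(error_code == 0);
 }while(stat != TensorNetworkQueue::ExecStat::Completed &&
        stat != TensorNetworkQueue::ExecStat::None); //None: handle unknown or already retired
}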
bool CuQuantumExecutor::sync()
void CuQuantumExecutor::sync()
{
while(!active_networks_.empty()){
for(auto iter = active_networks_.begin(); iter != active_networks_.end(); ++iter){
int error_code = 0;
const auto exec_stat = sync(iter->first,&error_code); assert(error_code == 0);
if(exec_stat == TensorNetworkQueue::ExecStat::Completed) break;
}
}
return;
}
void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
{
return;
}
void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
{
return;
}
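//Illustrative sketch of the data-loading step (not this commit's implementation): loadTensors()
//is expected to stage each tensor body into GPU memory drawn from the moving-window pool and
//then mark the request as Loading. Only the asynchronous host-to-device copy is shown here;
//pool acquisition and descriptor bookkeeping are omitted because the LinearMemoryPool interface
//is not part of this hunk. cudaMemcpyAsync is the standard CUDA runtime call.
inline cudaError_t stageTensorBody(void * gpu_ptr, const void * host_ptr,
                                   std::size_t bytes, cudaStream_t stream)
{
 return cudaMemcpyAsync(gpu_ptr,host_ptr,bytes,cudaMemcpyHostToDevice,stream); //copy can overlap with planning
}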
void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
{
return;
}
void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
{
return;
}
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
bool synced = true;
//`Finish
return synced;
return;
}
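//Illustrative sketch (not part of this commit) of how testCompletion() might eventually detect
//that the contraction has drained: cudaStreamQuery() is the standard CUDA runtime call; the
//surrounding status handling is an assumption about the intended design.
inline bool streamDrained(const cudaStream_t stream)
{
 const auto stat = cudaStreamQuery(stream);
 if(stat == cudaSuccess) return true; //all GPU work queued on the stream has finished
 if(stat != cudaErrorNotReady) HANDLE_CUDA_ERROR(stat); //any other code is a genuine error
 return false; //still executing
}
//testCompletion() could then do:
// if(streamDrained(tn_req->stream)) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;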
} //namespace runtime
......
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2021/12/29
REVISION: 2021/12/30
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
......@@ -20,6 +20,7 @@ Rationale:
#include <vector>
#include <functional>
#include "linear_memory.hpp"
#include "tensor_network_queue.hpp"
namespace talsh{
......@@ -57,17 +58,22 @@ public:
Tests the progress of the tensor network execution without blocking.
Returns the current status of the tensor network execution. **/
TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle,
int * error_code,
bool wait = true);
int * error_code);
/** Synchronizes execution of all submitted tensor networks to completion. **/
bool sync();
void sync();
protected:
static constexpr float WORKSPACE_FRACTION = 0.2;
static constexpr std::size_t MEM_ALIGNMENT = 256;
void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
void contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
void testCompletion(std::shared_ptr<TensorNetworkReq> tn_req);
struct DeviceAttr{
void * buffer_ptr = nullptr;
std::size_t buffer_size = 0;
......@@ -80,6 +86,8 @@ protected:
std::unordered_map<TensorOpExecHandle,std::shared_ptr<TensorNetworkReq>> active_networks_;
/** Attributes of all GPUs available to the current process **/
std::vector<std::pair<int,DeviceAttr>> gpu_attr_; //{gpu_id, gpu_attributes}
/** Moving-window linear memory pool (in GPU RAM) **/
std::vector<LinearMemoryPool> mem_pool_;
/** Tensor data access function **/
TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
};
......
/** ExaTN: Tensor Runtime: Tensor network executor: Linear memory allocator
REVISION: 2021/12/29
REVISION: 2021/12/30
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
Rationale:
Linear memory moving window:
Linear memory moving window (----->):
(a) front >= back:
____________________________________
......@@ -29,6 +29,8 @@ Rationale:
#ifndef EXATN_RUNTIME_LINEAR_MEMORY_HPP_
#define EXATN_RUNTIME_LINEAR_MEMORY_HPP_
#include "errors.hpp"
class LinearMemoryPool {
public:
......
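//Minimal sketch of the moving-window idea behind LinearMemoryPool (illustrative only; the
//actual class interface is not shown in this diff). Memory is acquired at the front and
//released from the back in FIFO order, so the occupied region [back, front) slides forward
//through the buffer. The real pool also handles the wrapped case (front < back), which this
//simplified version omits. Assumes <cstddef> and <deque> are available.
class MovingWindowSketch {
public:
 MovingWindowSketch(void * base, std::size_t size, std::size_t alignment):
  base_(static_cast<char*>(base)), size_(size), align_(alignment) {}

 void * acquire(std::size_t bytes) {            //take memory at the front of the window
  bytes += (align_ - bytes % align_) % align_;  //round the request up to the alignment
  if(front_ + bytes > size_) return nullptr;    //no room ahead of the front (no wrap in this sketch)
  void * ptr = base_ + front_;
  sizes_.push_back(bytes);
  front_ += bytes;
  return ptr;
 }

 void releaseOldest() {                         //return memory at the back of the window (FIFO)
  if(sizes_.empty()) return;
  back_ += sizes_.front();
  sizes_.pop_front();
  if(sizes_.empty()) front_ = back_ = 0;        //window empty: restart from the buffer origin
 }

private:
 char * base_; std::size_t size_, align_;
 std::size_t front_ = 0, back_ = 0;             //occupied region is [back_, front_)
 std::deque<std::size_t> sizes_;                //sizes of live chunks, oldest first
};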
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
REVISION: 2021/12/27
REVISION: 2021/12/30
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
......@@ -36,7 +36,8 @@ public:
enum class ExecStat {
None, //no execution status
Idle, //submitted but execution has not yet started
Preparing, //preparation for execution has started (loading data, planning)
Loading, //started loading data
Planning, //preparation for execution has started (planning)
Executing, //actual execution (numerical computation) has started
Completed //execution completed
};
......
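//Illustrative helper (not part of the commit): printable names for the refined states, e.g. for
//debug logging of the workflow Idle -> Loading (loadTensors) -> Planning (planExecution) ->
//Executing (contractTensorNetwork) -> Completed (testCompletion).
inline const char * toString(const TensorNetworkQueue::ExecStat stat) {
 switch(stat){
  case TensorNetworkQueue::ExecStat::None:      return "None";
  case TensorNetworkQueue::ExecStat::Idle:      return "Idle";
  case TensorNetworkQueue::ExecStat::Loading:   return "Loading";
  case TensorNetworkQueue::ExecStat::Planning:  return "Planning";
  case TensorNetworkQueue::ExecStat::Executing: return "Executing";
  case TensorNetworkQueue::ExecStat::Completed: return "Completed";
 }
 return "Unknown";
}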
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2021/12/29
REVISION: 2021/12/30
Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
......@@ -290,7 +290,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
int error_code = 0;
const auto current = tensor_network_queue.getCurrent();
const auto exec_handle = current->second;
auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,false); //this call will progress tensor network execution
auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
assert(error_code == 0);
if(exec_stat == TensorNetworkQueue::ExecStat::None){
exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
......@@ -310,7 +310,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
}
}
}
synced = cuquantum_executor_->sync(); assert(synced);
cuquantum_executor_->sync();
#else
assert(tensor_network_queue.isEmpty());
#endif
......