Commit ad725f47 authored by Dmitry I. Lyakh
Browse files

Enabled distributed execution in CuQuantum executor, still needs allreduce and testing ...


Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>
parent ed6dc07f
......@@ -18,14 +18,15 @@
#include "errors.hpp"
//Test activation:
/*#define EXATN_TEST0
/*
#define EXATN_TEST0
#define EXATN_TEST1
#define EXATN_TEST2
#define EXATN_TEST3
#define EXATN_TEST4
#define EXATN_TEST5*/
#define EXATN_TEST5
#define EXATN_TEST6
/*#define EXATN_TEST7
#define EXATN_TEST7
#define EXATN_TEST8
#define EXATN_TEST9
#define EXATN_TEST10
......@@ -44,13 +45,15 @@
#define EXATN_TEST23
#define EXATN_TEST24
#define EXATN_TEST25
#define EXATN_TEST26*/
#define EXATN_TEST26
//#define EXATN_TEST27 //requires input file from source
//#define EXATN_TEST28 //requires input file from source
//#define EXATN_TEST29
//#define EXATN_TEST30
#define EXATN_TEST29
#define EXATN_TEST30
//#define EXATN_TEST31 //requires input file from source
//#define EXATN_TEST32
*/
#define EXATN_TEST32
//#define EXATN_TEST33
#ifdef EXATN_TEST0
......@@ -3774,6 +3777,65 @@ TEST(NumServerTester, ExcitedMCVQE) {
#endif
#ifdef EXATN_TEST32
TEST(NumServerTester, CuTensorNet) {
 //Timing test: creates four REAL32 tensors, evaluates the same
 //three-operand tensor network NUM_REPEATS times and prints the
 //measured Gflop/s rate of every repetition.
 using exatn::TensorElementType;
 using exatn::TensorShape;

 const auto TENS_ELEM_TYPE = TensorElementType::REAL32;
 const int NUM_REPEATS = 3;

 //exatn::resetLoggingLevel(1,2); //debug

 bool ok = true;

 //Allocate the operands (A,B,C) and the output (D):
 ok = exatn::createTensor("A",TENS_ELEM_TYPE,TensorShape{96,64,64,96});
 assert(ok);
 ok = exatn::createTensor("B",TENS_ELEM_TYPE,TensorShape{96,64,64});
 assert(ok);
 ok = exatn::createTensor("C",TENS_ELEM_TYPE,TensorShape{64,96,64});
 assert(ok);
 ok = exatn::createTensor("D",TENS_ELEM_TYPE,TensorShape{96,64,96,64});
 assert(ok);

 //Random operands, zeroed output (the contraction accumulates into D):
 ok = exatn::initTensorRnd("A");
 assert(ok);
 ok = exatn::initTensorRnd("B");
 assert(ok);
 ok = exatn::initTensorRnd("C");
 assert(ok);
 ok = exatn::initTensor("D",0.0);
 assert(ok);

 //Evaluate the tensor network repeatedly, timing each pass:
 for(int pass = 0; pass < NUM_REPEATS; ++pass){
  //Drain all outstanding work so that only this pass is measured:
  ok = exatn::sync();
  assert(ok);
  std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): ";
  const auto flops_before = exatn::getTotalFlopCount();
  auto time_begin = exatn::Timer::timeInSecHR();
  ok = exatn::evaluateTensorNetworkSync("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)");
  assert(ok);
  auto elapsed = exatn::Timer::timeInSecHR(time_begin);
  //Flop count attributed to this pass only:
  const auto flops_done = exatn::getTotalFlopCount() - flops_before;
  std::cout << "Performance = " << (flops_done / (1e9 * elapsed)) << " Gflop/s" << std::endl;
 }

 //Release the tensors in reverse order of creation:
 ok = exatn::sync();
 assert(ok);
 ok = exatn::destroyTensor("D");
 assert(ok);
 ok = exatn::destroyTensor("C");
 assert(ok);
 ok = exatn::destroyTensor("B");
 assert(ok);
 ok = exatn::destroyTensor("A");
 assert(ok);

 //Final synchronization, then switch logging back to level (0,0):
 ok = exatn::sync();
 assert(ok);
 exatn::resetLoggingLevel(0,0);
}
#endif
#ifdef EXATN_TEST33
TEST(NumServerTester, TensorComposite) {
using exatn::TensorShape;
using exatn::TensorSignature;
......
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/05
REVISION: 2022/01/06
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -108,8 +108,11 @@ struct TensorNetworkReq {
};
CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
unsigned int pipeline_depth,
unsigned int process_rank, unsigned int num_processes):
tensor_data_access_func_(std::move(tensor_data_access_func)),
pipe_depth_(pipeline_depth), process_rank_(process_rank), num_processes_(num_processes)
{
static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");
......@@ -452,7 +455,7 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
&num_slices,sizeof(num_slices)));
assert(num_slices > 0);
HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
for(int64_t slice_id = process_rank_; slice_id < num_slices; slice_id += num_processes_){
HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
tn_req->comp_plan,
tn_req->data_in,tn_req->data_out,
......
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
REVISION: 2022/01/05
REVISION: 2022/01/06
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -41,7 +41,9 @@ class CuQuantumExecutor {
public:
CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
unsigned int pipeline_depth);
unsigned int pipeline_depth,
unsigned int process_rank,
unsigned int num_processes);
CuQuantumExecutor(const CuQuantumExecutor &) = delete;
CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;
......@@ -98,6 +100,10 @@ protected:
TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
/** Pipeline depth **/
const unsigned int pipe_depth_;
/** Process rank **/
const unsigned int process_rank_;
/** Total number of parallel processes **/
const unsigned int num_processes_;
};
} //namespace runtime
......
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2022/01/05
REVISION: 2022/01/06
Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -25,10 +25,11 @@ namespace runtime {
void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> node_executor,
const ParamConf & parameters,
unsigned int num_processes,
unsigned int process_rank,
unsigned int global_process_rank)
{
TensorGraphExecutor::resetNodeExecutor(node_executor,parameters,process_rank,global_process_rank);
TensorGraphExecutor::resetNodeExecutor(node_executor,parameters,num_processes,process_rank,global_process_rank);
#ifdef CUQUANTUM
if(node_executor){
cuquantum_executor_ = std::make_shared<CuQuantumExecutor>(
......@@ -36,7 +37,9 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
return data_ptr;
},
CUQUANTUM_PIPELINE_DEPTH
CUQUANTUM_PIPELINE_DEPTH,
process_rank,
num_processes
);
}
#endif
......
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
REVISION: 2022/01/05
REVISION: 2022/01/06
Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
......@@ -45,6 +45,7 @@ public:
/** Sets/resets the DAG node executor (tensor operation executor). **/
virtual void resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> node_executor,
const ParamConf & parameters,
unsigned int num_processes,
unsigned int process_rank,
unsigned int global_process_rank) override;
......
/** ExaTN:: Tensor Runtime: Tensor graph executor
REVISION: 2021/12/22
REVISION: 2022/01/06
Copyright (C) 2018-2021 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
Copyright (C) 2018-2022 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
Rationale:
(a) Tensor graph executor traverses the tensor graph (DAG) and
......@@ -44,7 +44,8 @@ class TensorGraphExecutor : public Identifiable, public Cloneable<TensorGraphExe
public:
TensorGraphExecutor():
node_executor_(nullptr), num_ops_issued_(0), process_rank_(-1), global_process_rank_(-1),
node_executor_(nullptr), num_ops_issued_(0),
num_processes_(0), process_rank_(-1), global_process_rank_(-1),
logging_(0), stopping_(false), active_(false), serialize_(false), validation_tracing_(false),
time_start_(exatn::Timer::timeInSecHR())
{
......@@ -62,8 +63,10 @@ public:
/** Sets/resets the DAG node executor (tensor operation executor). **/
virtual void resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> node_executor,
const ParamConf & parameters,
unsigned int num_processes,
unsigned int process_rank,
unsigned int global_process_rank) {
num_processes_.store(num_processes);
process_rank_.store(process_rank);
global_process_rank_.store(global_process_rank);
node_executor_ = node_executor;
......@@ -166,6 +169,7 @@ protected:
std::shared_ptr<TensorNodeExecutor> node_executor_; //intr-node tensor operation executor
std::atomic<std::size_t> num_ops_issued_; //total number of issued tensor operations
std::atomic<int> num_processes_; //number of parallel processes
std::atomic<int> process_rank_; //current process rank
std::atomic<int> global_process_rank_; //current global process rank (in MPI_COMM_WORLD)
std::atomic<int> logging_; //logging level (0:none)
......
/** ExaTN:: Tensor Runtime: Task-based execution layer for tensor operations
REVISION: 2021/12/27
REVISION: 2022/01/06
Copyright (C) 2018-2021 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
Copyright (C) 2018-2022 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
**/
#include "tensor_runtime.hpp"
......@@ -98,7 +98,7 @@ void TensorRuntime::launchExecutionThread()
void TensorRuntime::executionThreadWorkflow()
{
graph_executor_->resetNodeExecutor(exatn::getService<TensorNodeExecutor>(node_executor_name_),
parameters_,process_rank_,global_process_rank_);
parameters_,num_processes_,process_rank_,global_process_rank_);
//std::cout << "#DEBUG(exatn::runtime::TensorRuntime)[EXEC_THREAD]: DAG node executor set to "
//<< node_executor_name_ << std::endl << std::flush;
while(alive_.load()){ //alive_ is set by the main thread
......@@ -114,7 +114,8 @@ void TensorRuntime::executionThreadWorkflow()
}
processTensorDataRequests(); //process all outstanding client requests for tensor data (synchronous)
}
graph_executor_->resetNodeExecutor(std::shared_ptr<TensorNodeExecutor>(nullptr),parameters_,process_rank_,global_process_rank_);
graph_executor_->resetNodeExecutor(std::shared_ptr<TensorNodeExecutor>(nullptr),
parameters_,num_processes_,process_rank_,global_process_rank_);
//std::cout << "#DEBUG(exatn::runtime::TensorRuntime)[EXEC_THREAD]: DAG node executor reset. End of life."
//<< std::endl << std::flush;
return; //end of execution thread life
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment