Commit ed6dc07f authored by Dmitry I. Lyakh

Finished CuQuantum executor for one GPU per process, needs testing ...

parent 6516d89e
+66 −20
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -91,6 +91,9 @@ struct TensorNetworkReq {
  cudaEventDestroy(data_finish);
  cudaEventDestroy(data_start);
  cudaStreamDestroy(stream);
+  cutensornetDestroyContractionPlan(comp_plan);
+  cutensornetDestroyContractionOptimizerConfig(opt_config);
+  cutensornetDestroyContractionOptimizerInfo(opt_info);
  cutensornetDestroyNetworkDescriptor(net_descriptor);
  if(modes_out != nullptr) delete [] modes_out;
  if(strides_out != nullptr) delete [] strides_out;
@@ -105,8 +108,8 @@ struct TensorNetworkReq {
};


-CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
- tensor_data_access_func_(std::move(tensor_data_access_func))
+CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
+ tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
{
 static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");

@@ -118,6 +121,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
 for(int i = 0; i < num_gpus; ++i){
  if(talshDeviceState(i,DEV_NVIDIA_GPU) >= DEV_ON){
   gpu_attr_.emplace_back(std::make_pair(i,DeviceAttr{}));
+   gpu_attr_.back().second.pipe_level = 0;
   gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
   assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
   gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
@@ -257,6 +261,19 @@ cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_typ
}


+void CuQuantumExecutor::acquireWorkspace(unsigned int dev,
+                                         void ** workspace_ptr,
+                                         uint64_t * workspace_size)
+{
+ assert(dev < gpu_attr_.size());
+ auto & dev_attr = gpu_attr_[dev].second;
+ *workspace_size = dev_attr.workspace_size / pipe_depth_;
+ *workspace_ptr = (void*)((char*)(dev_attr.workspace_ptr) + ((*workspace_size) * dev_attr.pipe_level));
+ dev_attr.pipe_level = (dev_attr.pipe_level + 1) % pipe_depth_; //advance the round-robin pipeline level
+ return;
+}
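
The function above implements a round-robin partitioning of the device scratch buffer: the buffer is split into pipe_depth_ equal windows and successive acquisitions cycle through them, so up to pipe_depth_ in-flight contractions receive disjoint workspace. A self-contained sketch of the same arithmetic (stand-in types and hypothetical names, not the ExaTN API):

#include <cassert>
#include <cstdint>

//Hypothetical stand-in for the per-GPU attributes used above:
struct DeviceAttr {
 void * workspace_ptr = nullptr; //base of the device scratch buffer
 uint64_t workspace_size = 0;    //total scratch bytes
 unsigned int pipe_level = 0;    //next pipeline slot to hand out
};

//Slot i receives bytes [i*chunk, (i+1)*chunk), cycling modulo pipe_depth:
void acquire_workspace(DeviceAttr & dev, unsigned int pipe_depth,
                       void ** ws_ptr, uint64_t * ws_size)
{
 assert(pipe_depth > 0);
 const uint64_t chunk = dev.workspace_size / pipe_depth;
 *ws_size = chunk;
 *ws_ptr = static_cast<char*>(dev.workspace_ptr) + chunk * dev.pipe_level;
 dev.pipe_level = (dev.pipe_level + 1) % pipe_depth;
}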


void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
{
 const auto & net = *(tn_req->network);
@@ -266,6 +283,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
 tn_req->strides_in = new int64_t*[num_input_tensors];
 tn_req->modes_in = new int32_t*[num_input_tensors];
 tn_req->alignments_in = new uint32_t[num_input_tensors];
+ tn_req->data_in = new void*[num_input_tensors];

 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;
@@ -370,7 +388,21 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
   tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
+   auto & net = *(tn_req->network);
+   int32_t tens_num = 0;
+   for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
+    const auto tens_id = iter->first;
+    const auto & tens = iter->second;
+    const auto tens_hash = tens.getTensor()->getTensorHash();
+    auto descr = tn_req->tensor_descriptors.find(tens_hash);
+    void * dev_ptr = descr->second.dst_ptr.back();
+    if(tens_id == 0){
+     tn_req->data_out = dev_ptr;
+    }else{
+     tn_req->data_in[tens_num++] = dev_ptr;
+    }
+   }
  }else{ //not enough memory currently
   //Restore previous memory front:
   mem_pool_[gpu].restorePreviousFront(prev_front);
   return;
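
The loop added above routes the freshly staged device pointers into the arrays cuTensorNet consumes: by the convention in this code, tensor id 0 is the output tensor (data_out) and all remaining tensors are inputs (data_in), in network traversal order. A condensed sketch of that routing (simplified container types, hypothetical names); note the input order must agree with the mode arrays passed when the network descriptor was created:

#include <map>
#include <vector>

//tensor_id -> staged device pointer; an ordered map keeps iteration
//deterministic, matching the descriptor's input order:
void route_pointers(const std::map<unsigned int, void*> & staged,
                    void ** data_out, std::vector<void*> & data_in)
{
 for(const auto & kv : staged){
  if(kv.first == 0){
   *data_out = kv.second;        //id 0 = output tensor of the network
  }else{
   data_in.push_back(kv.second); //all other ids are input tensors
  }
 }
}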
@@ -385,16 +417,22 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
{
 //Configure tensor network contraction on one or all GPUs:
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerConfig(gpu_attr_[gpu].second.cutn_handle,&(tn_req->opt_config)));
  HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerInfo(gpu_attr_[gpu].second.cutn_handle,tn_req->net_descriptor,&(tn_req->opt_info)));
-  tn_req->worksize = gpu_attr_[gpu].second.workspace_size / PIPELINE_DEPTH;
-  tn_req->workspace = gpu_attr_[gpu].second.workspace_ptr; //`Need moving window (pipelining)
+  acquireWorkspace(gpu,&(tn_req->workspace),&(tn_req->worksize));
  HANDLE_CTN_ERROR(cutensornetContractionOptimize(gpu_attr_[gpu].second.cutn_handle,
                                                  tn_req->net_descriptor,tn_req->opt_config,
                                                  tn_req->worksize,tn_req->opt_info));
  HANDLE_CTN_ERROR(cutensornetCreateContractionPlan(gpu_attr_[gpu].second.cutn_handle,
                                                    tn_req->net_descriptor,tn_req->opt_info,
                                                    tn_req->worksize,&(tn_req->comp_plan)));
+  double flops = 0.0;
+  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
+                                                                   tn_req->opt_info,
+                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
+                                                                   &flops,sizeof(flops)));
 }
 tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
 return;
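
The FLOP-count read-out uses cuTensorNet's generic attribute interface on the optimizer info object; the same pattern with a different enum retrieves the slice count that contractTensorNetwork queries below. Both calls appear verbatim in this commit; cutn_handle and opt_info stand for the handle and optimizer info used above:

double flops = 0.0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(cutn_handle,opt_info,
                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                 &flops,sizeof(flops)));           //estimated FP operation count of the chosen path
int64_t num_slices = 0;
HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(cutn_handle,opt_info,
                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                 &num_slices,sizeof(num_slices))); //number of independent slices to contract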
@@ -406,13 +444,13 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
 //Execute the contraction plan on one or all GPUs:
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
+  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  int64_t num_slices = 0;
  HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
                                                                   tn_req->opt_info,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                   &num_slices,sizeof(num_slices)));
  assert(num_slices > 0);
-  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
  for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
   HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
@@ -431,13 +469,21 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
 //Test work completion on one or all GPUs:
+ bool all_completed = true;
 for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
  const auto gpu_id = gpu_attr_[gpu].first;
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
  cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
  if(cuda_error == cudaSuccess){
-   //`Move MemPool back forward using tn_req->memory_window_ptr
-   tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
+   if(tn_req->memory_window_ptr[gpu] != nullptr){
+    mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
+    tn_req->memory_window_ptr[gpu] = nullptr;
+   }
+  }else{
+   all_completed = false;
+  }
 }
+ if(all_completed) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
 return;
}
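
cudaEventQuery is the non-blocking counterpart of cudaEventSynchronize: it returns cudaSuccess once all work recorded before the event has finished and cudaErrorNotReady while it is still in flight, which lets the executor thread keep servicing other networks. A minimal illustration of the pattern (hypothetical function name); note the loop above folds any non-success status into "not completed", whereas distinguishing cudaErrorNotReady from genuine failures, as sketched here, surfaces errors earlier:

#include <cuda_runtime.h>

bool work_finished(cudaEvent_t done_event)
{
 const cudaError_t err = cudaEventQuery(done_event);
 if(err == cudaSuccess) return true;        //all work preceding the event is done
 if(err == cudaErrorNotReady) return false; //still executing: poll again later
 return false;                              //any other status is a real error and should be reported
}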

+11 −4
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -40,7 +40,8 @@ class CuQuantumExecutor {

public:

- CuQuantumExecutor(TensorImplFunc tensor_data_access_func);
+ CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
+                   unsigned int pipeline_depth);

 CuQuantumExecutor(const CuQuantumExecutor &) = delete;
 CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;
@@ -65,10 +66,13 @@ public:

protected:

- static constexpr float WORKSPACE_FRACTION = 0.2;
- static constexpr unsigned int PIPELINE_DEPTH = 1;
+ static constexpr float WORKSPACE_FRACTION = 0.6;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+ void acquireWorkspace(unsigned int dev,
+                       void ** workspace_ptr,
+                       uint64_t * workspace_size);

 void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
 void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
 void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
@@ -80,6 +84,7 @@ protected:
  std::size_t buffer_size = 0;
  void * workspace_ptr = nullptr;
  std::size_t workspace_size = 0;
+  unsigned int pipe_level = 0;
  void * cutn_handle; //cutensornetHandle_t = void*
 };

@@ -91,6 +96,8 @@ protected:
 std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
+ /** Pipeline depth **/
+ const unsigned int pipe_depth_;
};

} //namespace runtime
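
Taken together, the header changes define the workspace budget: the scratch region is presumably WORKSPACE_FRACTION of the TAL-SH device buffer (the fraction was raised from 0.2 to 0.6), and acquireWorkspace hands out workspace_size / pipe_depth_ per pipeline slot. Illustrative arithmetic only, assuming a hypothetical 8 GiB device buffer and the pipeline depth of 2 set in LazyGraphExecutor below:

 workspace_size = 0.6 * 8 GiB = 4.8 GiB  //per-GPU scratch region
 per-slot size  = 4.8 GiB / 2 = 2.4 GiB  //workspace each in-flight contraction receives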
+10 −4
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/30
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

Rationale:
 - ExaTN graph executor may accept whole tensor networks for execution
@@ -144,7 +144,7 @@ public:
  return current_network_;
 }

- /** Returns the current iterator to the beginning of the queue. **/
+ /** Resets the current iterator to the beginning of the queue. **/
 void reset() {
  lock();
  current_network_ = networks_.begin();
@@ -173,6 +173,12 @@ public:
  return not_over;
 }

+ /** Returns the distance of the current iterator
+     from the beginning of the queue. **/
+ auto getCurrentPos() {
+  return std::distance(networks_.begin(),current_network_);
+ }

 /** Locks. **/
 inline void lock(){queue_lock_.lock();}
 inline void unlock(){queue_lock_.unlock();}
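
getCurrentPos lets a client bound how deep into the queue it looks; std::distance is linear if networks_ is a node-based container such as std::list, which stays cheap here because the scan is capped by the small pipeline depth. A hedged usage sketch built only from the queue interface shown above (depth is a hypothetical constant):

tensor_network_queue.reset();
bool not_over = !tensor_network_queue.isOver();
while(not_over){
 if(tensor_network_queue.getCurrentPos() >= depth) break; //pipeline window is full
 const auto current = tensor_network_queue.getCurrent();  //inspect or dispatch *current here
 not_over = tensor_network_queue.next();
}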
+30 −20
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2022/01/03
+REVISION: 2022/01/05

Copyright (C) 2018-2022 Dmitry Lyakh
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -35,7 +35,8 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
      [this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
        void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
        return data_ptr;
-      }
+      },
+      CUQUANTUM_PIPELINE_DEPTH
    );
  }
#endif
@@ -277,7 +278,7 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {
void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
#ifdef CUQUANTUM
  std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
-            << tensor_network_queue.getSize() << " elements detected" << std::endl;
+            << tensor_network_queue.getSize() << " networks detected" << std::endl;
  assert(node_executor_);
  //Synchronize the node executor:
  bool synced = node_executor_->sync(); assert(synced);
@@ -287,11 +288,16 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
    tensor_network_queue.reset();
    bool not_over = !tensor_network_queue.isOver();
    while(not_over){
-      int error_code = 0;
+      const auto current_pos = tensor_network_queue.getCurrentPos();
+      if(current_pos < CUQUANTUM_PIPELINE_DEPTH){
        const auto current = tensor_network_queue.getCurrent();
        const auto exec_handle = current->second;
-      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+        int error_code = 0;
+        auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
+        if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
+          exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+          assert(error_code == 0);
+        }
        if(exec_stat == TensorNetworkQueue::ExecStat::None){
          exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
          if(exec_stat != TensorNetworkQueue::ExecStat::None){
@@ -301,6 +307,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
          not_over = tensor_network_queue.next();
        }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
          auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+          assert(current_pos == 0);
          tensor_network_queue.remove();
          std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
          not_over = !tensor_network_queue.isOver();
@@ -308,6 +315,9 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
          auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
          not_over = tensor_network_queue.next();
        }
+      }else{
+        not_over = false;
+      }
    }
  }
  cuquantum_executor_->sync();
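
The reworked loop is a depth-limited software pipeline: only the first CUQUANTUM_PIPELINE_DEPTH queue entries are serviced, the head entry is always progressed via sync, deeper entries are only submitted or status-checked, and completed networks are removed strictly from the head (FIFO). A toy restatement of that control flow (stand-in types, hypothetical names; progress() models cuquantum_executor_->sync(), submit() models execute()):

#include <cassert>
#include <cstddef>
#include <deque>

enum class Stat { None, Idle, Submitted, Completed };

template<typename Progress, typename Submit>
void pump(std::deque<Stat> & queue, unsigned int depth,
          Progress progress, Submit submit)
{
 for(std::size_t pos = 0; pos < queue.size() && pos < depth; ){
  Stat st = queue[pos];
  if(st == Stat::Idle || pos == 0) st = progress(pos); //progress the head or idle entries
  if(st == Stat::None) st = submit(pos);               //first-time submission attempt
  if(st == Stat::Completed){
   assert(pos == 0);   //mirrors assert(current_pos == 0): completion only at the head
   queue.pop_front();  //frees a pipeline slot for deeper entries
  }else{
   queue[pos] = st;    //record the updated status
   ++pos;              //move on to the next network in the window
  }
 }
}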
+6 −3
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/22
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh, Alex McCaskey
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

Rationale:

@@ -26,6 +26,9 @@ public:

  static constexpr const unsigned int DEFAULT_PIPELINE_DEPTH = 16;
  static constexpr const unsigned int DEFAULT_PREFETCH_DEPTH = 4;
+#ifdef CUQUANTUM
+  static constexpr const unsigned int CUQUANTUM_PIPELINE_DEPTH = 2;
+#endif

  LazyGraphExecutor(): pipeline_depth_(DEFAULT_PIPELINE_DEPTH),
                       prefetch_depth_(DEFAULT_PREFETCH_DEPTH)