Commit 3ed2fe51 authored by Dmitry I. Lyakh

Fully designed the execution workflow for CuQuantumExecutor

parent 7c970df3
cuquantum_executor.cu  +79 −19
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -22,8 +22,6 @@ Rationale:

#include "talshxx.hpp"

#include "linear_memory.hpp"

#include "cuquantum_executor.hpp"


@@ -44,7 +42,6 @@ namespace exatn {
namespace runtime {

struct TensorDescriptor {
- std::vector<int32_t> modes;   //indices associated with tensor dimensions
 std::vector<int64_t> extents; //tensor dimension extents
 std::vector<int64_t> strides; //tensor dimension strides (optional)
 void * body_ptr = nullptr;    //pointer to the tensor body image
@@ -53,17 +50,18 @@ struct TensorDescriptor {
};

struct TensorNetworkReq {
- std::shared_ptr<numerics::TensorNetwork> network;
- std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors;
- std::unordered_map<int32_t,int64_t> index_extents;
+ TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status
+ std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
+ std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
+ std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key is the original tensor id)
+ std::unordered_map<int32_t,int64_t> index_extents; //extent of each registered tensor mode
+ std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors
 cutensornetNetworkDescriptor_t net_descriptor;
 cutensornetContractionOptimizerConfig_t opt_config;
 cutensornetContractionOptimizerInfo_t opt_info;
 cutensornetContractionPlan_t comp_plan;
 cudaStream_t stream;
 cutensornetComputeType_t compute_type;
- void * memory_window_ptr = nullptr;
- TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::Idle;
};
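
Each TensorNetworkReq now owns per-request cuTensorNet and CUDA resources (network descriptor, optimizer config and info, contraction plan, CUDA stream) that must be released once the network completes. A minimal cleanup sketch, assuming a dedicated stream per request; the helper itself is hypothetical and not part of this commit:

//Hypothetical helper (not in this commit): releases the per-request resources
void destroyTensorNetworkReq(TensorNetworkReq & tn_req)
{
 HANDLE_CTN_ERROR(cutensornetDestroyContractionPlan(tn_req.comp_plan));
 HANDLE_CTN_ERROR(cutensornetDestroyContractionOptimizerInfo(tn_req.opt_info));
 HANDLE_CTN_ERROR(cutensornetDestroyContractionOptimizerConfig(tn_req.opt_config));
 HANDLE_CTN_ERROR(cutensornetDestroyNetworkDescriptor(tn_req.net_descriptor));
 HANDLE_CUDA_ERROR(cudaStreamDestroy(tn_req.stream));
}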


@@ -83,11 +81,14 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
   gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
   assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
   gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
-   std::size_t wrk_size = static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION;
+   std::size_t wrk_size = (std::size_t)(static_cast<float>(gpu_attr_.back().second.buffer_size) * WORKSPACE_FRACTION);
   wrk_size -= wrk_size % MEM_ALIGNMENT;
   gpu_attr_.back().second.workspace_size = wrk_size;
   gpu_attr_.back().second.buffer_size -= wrk_size;
+   gpu_attr_.back().second.buffer_size -= gpu_attr_.back().second.buffer_size % MEM_ALIGNMENT;
   gpu_attr_.back().second.buffer_ptr = (void*)(((char*)(gpu_attr_.back().second.workspace_ptr)) + wrk_size);
+   mem_pool_.emplace_back(LinearMemoryPool(gpu_attr_.back().second.buffer_ptr,
+                                           gpu_attr_.back().second.buffer_size,MEM_ALIGNMENT));
  }
 }
 std::cout << "#DEBUG(exatn::runtime::CuQuantumExecutor): Number of available GPUs = " << gpu_attr_.size() << std::endl;
@@ -111,7 +112,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):

CuQuantumExecutor::~CuQuantumExecutor()
{
- bool success = sync(); assert(success);
+ sync();
 for(const auto & gpu: gpu_attr_){
  HANDLE_CUDA_ERROR(cudaSetDevice(gpu.first));
  HANDLE_CTN_ERROR(cutensornetDestroy((cutensornetHandle_t)(gpu.second.cutn_handle)));
@@ -130,8 +131,16 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics
 if(res.second){
  auto tn_req = res.first->second;
  tn_req->network = network;
-  tn_req->exec_status = TensorNetworkQueue::ExecStat::Idle;
+  parseTensorNetwork(tn_req); //still Idle
+  loadTensors(tn_req); //Idle --> Loading
+  if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading){
+   planExecution(tn_req); //Loading --> Planning (while loading data)
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning){
+    contractTensorNetwork(tn_req); //Planning --> Executing
+   }
+  }
+  exec_stat = tn_req->exec_status;
+  //`Finish
 }else{
  std::cout << "#WARNING(exatn::runtime::CuQuantumExecutor): execute: Repeated tensor network submission detected!\n";
 }
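
execute() is non-blocking: it registers the network, eagerly advances it through the new pipeline (parse, load, plan, contract) as far as the asynchronous calls allow, and returns the status reached; sync() later resumes the progression. A sketch of the resulting driver protocol, mirroring LazyGraphExecutor::execute() further below (the helper is illustrative, not part of this commit):

//Illustrative driver (hypothetical): submit once, then poll to completion
TensorNetworkQueue::ExecStat driveToCompletion(CuQuantumExecutor & executor,
                                               std::shared_ptr<numerics::TensorNetwork> network,
                                               const TensorOpExecHandle exec_handle)
{
 int error_code = 0;
 auto exec_stat = executor.sync(exec_handle,&error_code); //also progresses execution
 assert(error_code == 0);
 if(exec_stat == TensorNetworkQueue::ExecStat::None)  //unknown handle: not yet submitted
  exec_stat = executor.execute(network,exec_handle);  //Idle --> Loading --> Planning --> Executing
 while(exec_stat != TensorNetworkQueue::ExecStat::Completed){ //poll until the GPU work finishes
  exec_stat = executor.sync(exec_handle,&error_code);
  assert(error_code == 0);
 }
 return exec_stat;
}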
@@ -140,26 +149,77 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::execute(std::shared_ptr<numerics


TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle exec_handle,
-                                                     int * error_code,
-                                                     bool wait)
+                                                     int * error_code)
{
 *error_code = 0;
 TensorNetworkQueue::ExecStat exec_stat = TensorNetworkQueue::ExecStat::None;
 auto iter = active_networks_.find(exec_handle);
 if(iter != active_networks_.end()){
  auto tn_req = iter->second;
+  if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Executing){
+   testCompletion(tn_req); //Executing --> Completed
+  }else{
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Idle)
+    loadTensors(tn_req); //Idle --> Loading
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Loading)
+    planExecution(tn_req); //Loading --> Planning (while loading data)
+   if(tn_req->exec_status == TensorNetworkQueue::ExecStat::Planning)
+    contractTensorNetwork(tn_req); //Planning --> Executing
+  }
  exec_stat = tn_req->exec_status;
  //`Finish
  tn_req.reset();
  if(exec_stat == TensorNetworkQueue::ExecStat::Completed)
   active_networks_.erase(iter);
 }
 return exec_stat;
}


-bool CuQuantumExecutor::sync()
+void CuQuantumExecutor::sync()
{
+ while(!active_networks_.empty()){
+  for(auto iter = active_networks_.begin(); iter != active_networks_.end(); ++iter){
+   int error_code = 0;
+   const auto exec_stat = sync(iter->first,&error_code); assert(error_code == 0);
+   if(exec_stat == TensorNetworkQueue::ExecStat::Completed) break;
+  }
+ }
 return;
}
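
sync(exec_handle,...) erases a completed network from active_networks_, which invalidates the loop iterator; hence the break, which leaves the for loop so the while loop can rescan from active_networks_.begin() with fresh iterators. A self-contained demonstration of this erase-and-break pattern on std::unordered_map:

#include <unordered_map>
#include <iostream>

int main(){
 std::unordered_map<int,int> active = {{1,3},{2,1},{3,2}}; //handle --> remaining polls
 while(!active.empty()){
  for(auto iter = active.begin(); iter != active.end(); ++iter){
   if(--(iter->second) == 0){ //this entry "completed"
    std::cout << "completed " << iter->first << std::endl;
    active.erase(iter); //invalidates iter ...
    break;              //... so leave the for loop and rescan
   }
  }
 }
 return 0;
}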


+void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
{
- bool synced = true;
- //`Finish
- return synced;
+
+ return;
}


+void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
+
+
+void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+
+ return;
+}
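
All five pipeline stages are stubs in this revision. As one plausible filling of the simplest stub, testCompletion could poll the request's CUDA stream without blocking; the sketch below assumes all work of the request is enqueued in tn_req->stream. The planning and contraction stubs would analogously wrap the cuTensorNet calls cutensornetCreateContractionOptimizerConfig/Info, cutensornetContractionOptimize, cutensornetCreateContractionPlan and cutensornetContraction.

//Hypothetical sketch (not in this commit): non-blocking completion test
void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
{
 const cudaError_t cuda_error = cudaStreamQuery(tn_req->stream);
 if(cuda_error == cudaSuccess){ //all work enqueued in the stream has finished
  tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
 } //cudaErrorNotReady: still Executing; any other code is an error
 return;
}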

} //namespace runtime
cuquantum_executor.hpp  +12 −4
/** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -20,6 +20,7 @@ Rationale:
#include <vector>
#include <functional>

#include "linear_memory.hpp"
#include "tensor_network_queue.hpp"

namespace talsh{
@@ -57,17 +58,22 @@ public:
-    If wait = TRUE, waits until completion, otherwise just tests the progress.
     Returns the current status of the tensor network execution. **/
 TensorNetworkQueue::ExecStat sync(const TensorOpExecHandle exec_handle,
-                                   int * error_code,
-                                   bool wait = true);
+                                   int * error_code);

 /** Synchronizes execution of all submitted tensor networks to completion. **/
- bool sync();
+ void sync();

protected:

 static constexpr float WORKSPACE_FRACTION = 0.2;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+ void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
+ void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
+ void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
+ void contractTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
+ void testCompletion(std::shared_ptr<TensorNetworkReq> tn_req);

 struct DeviceAttr{
  void * buffer_ptr = nullptr;
  std::size_t buffer_size = 0;
@@ -80,6 +86,8 @@ protected:
 std::unordered_map<TensorOpExecHandle,std::shared_ptr<TensorNetworkReq>> active_networks_;
 /** Attributes of all GPUs available to the current process **/
 std::vector<std::pair<int,DeviceAttr>> gpu_attr_; //{gpu_id, gpu_attributes}
+ /** Moving-window linear memory pool (in GPU RAM) **/
+ std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
};
linear_memory.hpp  +4 −2
/** ExaTN: Tensor Runtime: Tensor network executor: Linear memory allocator
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)

Rationale:

- Linear memory moving window:
+ Linear memory moving window (----->):

 (a) front >= back:
 ____________________________________
@@ -29,6 +29,8 @@ Rationale:
#ifndef EXATN_RUNTIME_LINEAR_MEMORY_HPP_
#define EXATN_RUNTIME_LINEAR_MEMORY_HPP_

#include "errors.hpp"

class LinearMemoryPool {

public:
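
The pool dispenses memory from a linear buffer through a window whose front advances on acquisition and whose back advances on release, wrapping around at the buffer end (the -----> direction above). A self-contained sketch consistent with that diagram; the class layout and method names here are assumptions for illustration, not this commit's API:

#include <cstddef>
#include <deque>
#include <utility>

//Illustrative moving-window pool (names are assumptions, not from this commit):
class LinearMemoryPoolSketch {
public:
 LinearMemoryPoolSketch(void * base, std::size_t size, std::size_t alignment):
  base_(static_cast<char*>(base)), size_(size), align_(alignment) {}

 //Acquires a block at the front of the window; returns nullptr if it does not fit:
 void * acquireMemory(std::size_t size){
  if(size == 0) return nullptr;
  size += (align_ - size % align_) % align_;   //round the block up to the alignment
  if(size > size_) return nullptr;
  std::size_t start;
  if(blocks_.empty()){
   start = 0;                                  //empty pool: restart at the base
  }else if(back_ < front_){                    //case (a): busy region is [back_,front_)
   if(front_ + size <= size_) start = front_;  //append at the front
   else if(size <= back_) start = 0;           //or wrap around to the base
   else return nullptr;
  }else{                                       //case (b): busy region wraps around
   if(front_ + size <= back_) start = front_; else return nullptr;
  }
  blocks_.emplace_back(start,size);
  front_ = start + size;
  back_ = blocks_.front().first;
  return base_ + start;
 }

 //Releases the oldest block (the back of the window moves forward):
 void releaseMemory(){
  if(blocks_.empty()) return;
  blocks_.pop_front();
  if(blocks_.empty()){ front_ = 0; back_ = 0; }
  else back_ = blocks_.front().first;
 }

private:
 char * base_; std::size_t size_, align_;
 std::size_t front_ = 0, back_ = 0;                      //window boundaries
 std::deque<std::pair<std::size_t,std::size_t>> blocks_; //FIFO of {offset,size}
};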
tensor_network_queue.hpp  +3 −2
/** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/27
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -36,7 +36,8 @@ public:
 enum class ExecStat {
  None,      //no execution status
  Idle,      //submitted but execution has not yet started
-  Preparing, //preparation for execution has started (loading data, planning)
+  Loading,   //started loading data
+  Planning,  //preparation for execution has started (planning)
  Executing, //actual execution (numerical computation) has started
  Completed  //execution completed
 };
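
Splitting the old Preparing state lets data loading overlap contraction-path planning, matching the CuQuantumExecutor pipeline (loadTensors --> planExecution --> contractTensorNetwork --> testCompletion). A small helper mapping each state to a readable name, with the pipeline step noted per state (illustrative, not part of this commit):

//Illustrative helper: readable names for the execution states
inline const char * execStatName(const TensorNetworkQueue::ExecStat stat){
 switch(stat){
  case TensorNetworkQueue::ExecStat::None:      return "None";      //no execution status
  case TensorNetworkQueue::ExecStat::Idle:      return "Idle";      //submitted, not yet started
  case TensorNetworkQueue::ExecStat::Loading:   return "Loading";   //loadTensors() started data transfers
  case TensorNetworkQueue::ExecStat::Planning:  return "Planning";  //planExecution() runs while data loads
  case TensorNetworkQueue::ExecStat::Executing: return "Executing"; //contractTensorNetwork() enqueued kernels
  case TensorNetworkQueue::ExecStat::Completed: return "Completed"; //testCompletion() observed completion
 }
 return "Unknown";
}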
graph_executor_lazy.cpp  +3 −3
/** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/29
+REVISION: 2021/12/30

Copyright (C) 2018-2021 Dmitry Lyakh
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
@@ -290,7 +290,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
      int error_code = 0;
      const auto current = tensor_network_queue.getCurrent();
      const auto exec_handle = current->second;
-      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code,false); //this call will progress tensor network execution
+      auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
      assert(error_code == 0);
      if(exec_stat == TensorNetworkQueue::ExecStat::None){
        exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
@@ -310,7 +310,7 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
      }
    }
  }
-  synced = cuquantum_executor_->sync(); assert(synced);
+  cuquantum_executor_->sync();
#else
  assert(tensor_network_queue.isEmpty());
#endif