src/runtime/executor/cuquantum/cuquantum_executor.cu +83 −17

 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/30
+REVISION: 2022/01/03

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:

@@ -48,14 +48,14 @@
 struct TensorDescriptor {
  std::size_t volume = 0;     //tensor body volume
  std::size_t size = 0;       //tensor body size (bytes)
  void * src_ptr = nullptr;   //non-owning pointer to the tensor body source image
- std::vector<void*> dst_ptr; //non-owning pointer to the tensor body dest image (for all GPU)
+ std::vector<void*> dst_ptr; //non-owning pointer to the tensor body destination image (on each GPU)
 };

 struct TensorNetworkReq {
  TensorNetworkQueue::ExecStat exec_status = TensorNetworkQueue::ExecStat::None; //tensor network execution status
  std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
  std::unordered_map<numerics::TensorHashType, TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
- std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key is the original tensor id)
+ std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key = original tensor id)
  std::unordered_map<int32_t, int64_t> mode_extents; //extent of each registered tensor mode
  int32_t * num_modes_in = nullptr;
  int64_t ** extents_in = nullptr;

@@ -67,7 +67,7 @@ struct TensorNetworkReq {
  int64_t * strides_out = nullptr;
  int32_t * modes_out = nullptr;
  uint32_t alignment_out;
- std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors
+ std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors (on each GPU)
  cutensornetNetworkDescriptor_t net_descriptor;
  cutensornetContractionOptimizerConfig_t opt_config;
  cutensornetContractionOptimizerInfo_t opt_info;

@@ -75,6 +75,20 @@ struct TensorNetworkReq {
  cudaDataType_t data_type;
  cutensornetComputeType_t compute_type;
  cudaStream_t stream;
+
+ ~TensorNetworkReq() {
+  cudaStreamSynchronize(stream);
+  cudaStreamDestroy(stream);
+  cutensornetDestroyNetworkDescriptor(net_descriptor);
+  if(modes_out != nullptr) delete [] modes_out;
+  if(strides_out != nullptr) delete [] strides_out;
+  if(extents_out != nullptr) delete [] extents_out;
+  if(alignments_in != nullptr) delete [] alignments_in;
+  if(modes_in != nullptr) delete [] modes_in;
+  if(strides_in != nullptr) delete [] strides_in;
+  if(extents_in != nullptr) delete [] extents_in;
+  if(num_modes_in != nullptr) delete [] num_modes_in;
+ }
 };

@@ -181,8 +195,7 @@ TensorNetworkQueue::ExecStat CuQuantumExecutor::sync(const TensorOpExecHandle ex
  }
  exec_stat = tn_req->exec_status;
  tn_req.reset();
- if(exec_stat == TensorNetworkQueue::ExecStat::Completed)
-  active_networks_.erase(iter);
+ if(exec_stat == TensorNetworkQueue::ExecStat::Completed) active_networks_.erase(iter);
 }
 return exec_stat;
}
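The new TensorNetworkReq destructor releases resources in a deliberate order: the stream is drained before it is destroyed, and only then are the descriptor and the host-side arrays freed, since asynchronous copies enqueued on that stream may still be reading them. Below is a minimal standalone sketch of that discipline, not ExaTN code; AsyncRequest and its members are hypothetical stand-ins:

#include <cuda_runtime.h>
#include <cstdlib>

//Hypothetical stand-in for TensorNetworkReq's cleanup responsibilities:
struct AsyncRequest {
 cudaStream_t stream;
 float * host_buf = nullptr; //source of an asynchronous H2D copy
 float * dev_buf = nullptr;  //destination of that copy on the GPU

 ~AsyncRequest() {
  cudaStreamSynchronize(stream); //drain in-flight async work first
  cudaStreamDestroy(stream);     //only then is the stream safe to destroy
  if(dev_buf != nullptr) cudaFree(dev_buf);
  if(host_buf != nullptr) std::free(host_buf); //no pending copy can still read it
 }
};

int main() {
 AsyncRequest req;
 cudaStreamCreate(&req.stream);
 req.host_buf = static_cast<float*>(std::malloc(1024 * sizeof(float)));
 cudaMalloc(reinterpret_cast<void**>(&req.dev_buf), 1024 * sizeof(float));
 cudaMemcpyAsync(req.dev_buf, req.host_buf, 1024 * sizeof(float),
                 cudaMemcpyHostToDevice, req.stream);
 return 0; //~AsyncRequest runs here: sync, destroy, free, in that order
}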
@@ -216,6 +229,21 @@
 }

+cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_type)
+{
+ cutensornetComputeType_t cutensor_data_type;
+ switch(elem_type){
+  case TensorElementType::REAL32: cutensor_data_type = CUTENSORNET_COMPUTE_32F; break;
+  case TensorElementType::REAL64: cutensor_data_type = CUTENSORNET_COMPUTE_64F; break;
+  case TensorElementType::COMPLEX32: cutensor_data_type = CUTENSORNET_COMPUTE_32F; break;
+  case TensorElementType::COMPLEX64: cutensor_data_type = CUTENSORNET_COMPUTE_64F; break;
+  default:
+   assert(false);
+ }
+ return cutensor_data_type;
+}
+
 void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  const auto & net = *(tn_req->network);

@@ -239,6 +267,11 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
  const auto tens_vol = tens.getTensor()->getVolume();
  const auto tens_rank = tens.getRank();
  const auto tens_type = tens.getElementType();
+ if(tens_type == TensorElementType::VOID){
+  std::cout << "#ERROR(exatn::runtime::CuQuantumExecutor): Network tensor #" << tens_id
+            << " has not been allocated typed storage yet!\n";
+  assert(false);
+ }
  const auto & tens_legs = tens.getTensorLegs();
  const auto & tens_dims = tens.getDimExtents();

@@ -249,7 +282,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
  for(unsigned int i = 0; i < tens_rank; ++i) descr.extents[i] = tens_dims[i];
  descr.data_type = getCudaDataType(tens_type);
  descr.volume = tens_vol;
- descr.src_ptr = tensor_data_access_func_(*(tens.getTensor()),DEV_HOST,0,&(descr.size));
+ descr.src_ptr = tensor_data_access_func_(*(tens.getTensor()),DEV_HOST,0,&(descr.size)); //`Assuming tensor body is on Host
  assert(descr.src_ptr != nullptr);
 }

@@ -279,19 +312,52 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
  }
 }

- HANDLE_CTN_ERROR(cutensornetCreateNetworkDescriptor(gpu_attr_[0].second.cutn_handle,num_input_tensors,
+ const auto tens_elem_type = net.getTensorElementType();
+ tn_req->data_type = getCudaDataType(tens_elem_type);
+ tn_req->compute_type = getCutensorComputeType(tens_elem_type);
+
+ //Create a cuTensorNet network descriptor on one or all GPUs:
+ for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
+  const auto gpu_id = gpu_attr_[gpu].first;
+  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
+  HANDLE_CTN_ERROR(cutensornetCreateNetworkDescriptor(gpu_attr_[gpu].second.cutn_handle,num_input_tensors,
                     tn_req->num_modes_in,tn_req->extents_in,tn_req->strides_in,tn_req->modes_in,tn_req->alignments_in,
                     tn_req->num_modes_out,tn_req->extents_out,tn_req->strides_out,tn_req->modes_out,tn_req->alignment_out,
                     tn_req->data_type,tn_req->compute_type,&(tn_req->net_descriptor)));
+  HANDLE_CUDA_ERROR(cudaStreamCreate(&(tn_req->stream)));
+ }
 return;
}

+void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
+{
+ //Load tensors to one or all GPUs:
+ for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
+  const auto gpu_id = gpu_attr_[gpu].first;
+  HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
+  void * prev_front = mem_pool_[gpu].getFront();
+  bool success = true;
+  //Acquire device memory:
+  for(auto & descr: tn_req->tensor_descriptors){
+   void * dev_ptr = mem_pool_[gpu].acquireMemory(descr.second.size);
+   success = (dev_ptr != nullptr); if(!success) break;
+   descr.second.dst_ptr.emplace_back(dev_ptr);
+  }
+  if(success){
+   //Initiate data transfers:
+   for(auto & descr: tn_req->tensor_descriptors){
+    HANDLE_CUDA_ERROR(cudaMemcpyAsync(descr.second.dst_ptr.back(),descr.second.src_ptr,
                                       descr.second.size,cudaMemcpyDefault,tn_req->stream));
+   }
+   tn_req->exec_status = TensorNetworkQueue::ExecStat::Loading;
+  }else{
+   //Restore previous memory front:
+   mem_pool_[gpu].restorePreviousFront(prev_front);
+   break;
+  }
+ }
+ return;
+}
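A usage note on the two mappings above (a sketch; the CUDA_C_32F value is an assumption about getCudaDataType, whose body is outside this diff): the CUDA data type carries the real/complex distinction, while the cuTensorNet compute type carries only the precision, which is why COMPLEX32 and REAL32 both map to CUTENSORNET_COMPUTE_32F:

const auto elem_type = TensorElementType::COMPLEX32;
cudaDataType_t data_type = getCudaDataType(elem_type);                     //assumed: CUDA_C_32F (complex float)
cutensornetComputeType_t compute_type = getCutensorComputeType(elem_type); //CUTENSORNET_COMPUTE_32F (32-bit real compute)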
src/runtime/executor/cuquantum/cuquantum_executor.hpp +5 −5

 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2021/12/30
+REVISION: 2022/01/03

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:
 - ExaTN graph executor may accept whole tensor networks for execution

@@ -82,11 +82,11 @@ protected:
  void * cutn_handle; //cutensornetHandle_t = void*
 };

- /** Currently processed tensor networks **/
+ /** Currently processed (progressing) tensor networks **/
 std::unordered_map<TensorOpExecHandle,std::shared_ptr<TensorNetworkReq>> active_networks_;
 /** Attributes of all GPUs available to the current process **/
 std::vector<std::pair<int,DeviceAttr>> gpu_attr_; //{gpu_id, gpu_attributes}
- /** Moving-window linear memory pool (in GPU RAM) **/
+ /** Moving-window linear memory pools for all GPUs of the current process **/
 std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}

src/runtime/executor/cuquantum/linear_memory.hpp +8 −3

 /** ExaTN: Tensor Runtime: Tensor network executor: Linear memory allocator
-REVISION: 2021/12/30
+REVISION: 2022/01/03

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:

@@ -73,6 +73,11 @@ public:
  return;
 }

+ void restorePreviousFront(void * front) {
+  front_ = front;
+  return;
+ }
+
 void * getFront() const {
  return front_;
 }

src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp +5 −5

 /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/30
+REVISION: 2022/01/03

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
 **/

 #include "graph_executor_lazy.hpp"

@@ -275,9 +275,9 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {

 void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
-#ifdef CUQUANTUM
 std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
           << tensor_network_queue.getSize() << " elements detected" << std::endl;
+#ifdef CUQUANTUM
 assert(node_executor_);
 //Synchronize the node executor:
 bool synced = node_executor_->sync(); assert(synced);

@@ -311,10 +311,10 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
   }
  }
 cuquantum_executor_->sync();
- std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Finished executing the tensor network queue via cuQuantum\n";
 #else
 assert(tensor_network_queue.isEmpty());
 #endif
+ std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Finished executing the tensor network queue via cuQuantum\n";
 return;
}
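The getFront/restorePreviousFront pair added to linear_memory.hpp above is what lets loadTensors treat device-memory acquisition as a transaction: snapshot the front, acquire greedily, and roll the front back wholesale on the first failure. A minimal self-contained sketch of that pattern; ToyLinearPool and acquireAll are hypothetical stand-ins, not the actual ExaTN allocator, with pure pointer-bump semantics and no alignment handling:

#include <cstddef>
#include <cstdlib>
#include <vector>

class ToyLinearPool { //hypothetical stand-in for LinearMemoryPool
 public:
  ToyLinearPool(void * base, std::size_t capacity):
   base_(static_cast<char*>(base)), front_(static_cast<char*>(base)), capacity_(capacity) {}

  void * acquireMemory(std::size_t size) {
   if(front_ + size > base_ + capacity_) return nullptr; //window exhausted
   void * ptr = front_;
   front_ += size; //bump the moving front
   return ptr;
  }

  void * getFront() const {return front_;}

  void restorePreviousFront(void * front) {front_ = static_cast<char*>(front);}

 private:
  char * base_;
  char * front_;
  std::size_t capacity_;
};

//All-or-nothing acquisition, mirroring loadTensors: snapshot the front,
//acquire greedily, roll back wholesale on the first failure:
bool acquireAll(ToyLinearPool & pool, const std::vector<std::size_t> & sizes,
                std::vector<void*> & ptrs) {
 void * prev_front = pool.getFront();
 const auto initial = ptrs.size();
 for(const auto size: sizes) {
  void * ptr = pool.acquireMemory(size);
  if(ptr == nullptr) {
   pool.restorePreviousFront(prev_front); //release everything acquired so far
   ptrs.resize(initial);                  //drop the now-invalid pointers
   return false;
  }
  ptrs.push_back(ptr);
 }
 return true;
}

int main() {
 void * base = std::malloc(1 << 10);
 ToyLinearPool pool(base, 1 << 10);
 std::vector<void*> ptrs;
 bool ok = acquireAll(pool, {256, 256, 256}, ptrs); //fits: ok == true
 bool too_big = acquireAll(pool, {512, 512}, ptrs); //fails: front rolled back
 std::free(base);
 return (ok && !too_big) ? 0 : 1;
}

The same snapshot/rollback idea is why loadTensors records prev_front before the acquisition loop: a partially loaded network leaves no holes in the moving window.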