Implemented CuQuantumExecutor::parseTensorNetwork (ee550dac) · Commits · ORNL Quantum Computing Institute / exatn

src/runtime/executor/cuquantum/cuquantum_executor.cu

+99 −5

Original line number	Diff line number	Diff line
		@@ -44,9 +44,11 @@ namespace runtime {
		struct TensorDescriptor {
		std::vector<int64_t> extents; //tensor dimension extents
		std::vector<int64_t> strides; //tensor dimension strides (optional)
		void * body_ptr = nullptr; //pointer to the tensor body image
		std::size_t volume = 0; //tensor body volume
		cudaDataType_t data_type; //tensor element data type
		std::size_t volume = 0; //tensor body volume
		std::size_t size = 0; //tensor body size (bytes)
		void * src_ptr = nullptr; //non-owning pointer to the tensor body source image
		std::vector<void*> dst_ptr; //non-owning pointer to the tensor body dest image (for all GPU)
		};

		struct TensorNetworkReq {
		@@ -54,14 +56,25 @@ struct TensorNetworkReq {
		std::shared_ptr<numerics::TensorNetwork> network; //tensor network specification
		std::unordered_map<numerics::TensorHashType,TensorDescriptor> tensor_descriptors; //tensor descriptors (shape, volume, data type, body)
		std::unordered_map<unsigned int, std::vector<int32_t>> tensor_modes; //indices associated with tensor dimensions (key is the original tensor id)
		std::unordered_map<int32_t,int64_t> index_extents; //extent of each registered tensor mode
		std::unordered_map<int32_t,int64_t> mode_extents; //extent of each registered tensor mode
		int32_t * num_modes_in = nullptr;
		int64_t ** extents_in = nullptr;
		int64_t ** strides_in = nullptr;
		int32_t ** modes_in = nullptr;
		uint32_t * alignments_in = nullptr;
		int32_t num_modes_out;
		int64_t * extents_out = nullptr;
		int64_t * strides_out = nullptr;
		int32_t * modes_out = nullptr;
		uint32_t alignment_out;
		std::vector<void*> memory_window_ptr; //end of the GPU memory segment allocated for the tensors
		cutensornetNetworkDescriptor_t net_descriptor;
		cutensornetContractionOptimizerConfig_t opt_config;
		cutensornetContractionOptimizerInfo_t opt_info;
		cutensornetContractionPlan_t comp_plan;
		cudaStream_t stream;
		cudaDataType_t data_type;
		cutensornetComputeType_t compute_type;
		cudaStream_t stream;
		};


		@@ -188,9 +201,90 @@ void CuQuantumExecutor::sync()
		}


		cudaDataType_t getCudaDataType(const TensorElementType elem_type)
		{
		cudaDataType_t cuda_data_type;
		switch(elem_type){
		case TensorElementType::REAL32: cuda_data_type = CUDA_R_32F; break;
		case TensorElementType::REAL64: cuda_data_type = CUDA_R_64F; break;
		case TensorElementType::COMPLEX32: cuda_data_type = CUDA_C_32F; break;
		case TensorElementType::COMPLEX64: cuda_data_type = CUDA_C_64F; break;
		default:
		assert(false);
		}
		return cuda_data_type;
		}


		void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
		{
		const auto & net = *(tn_req->network);
		const int32_t num_input_tensors = net.getNumTensors();
		tn_req->num_modes_in = new int32_t[num_input_tensors];
		tn_req->extents_in = new int64_t*[num_input_tensors];
		tn_req->strides_in = new int64_t*[num_input_tensors];
		tn_req->modes_in = new int32_t*[num_input_tensors];
		tn_req->alignments_in = new uint32_t[num_input_tensors];

		for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
		for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;
		tn_req->strides_out = NULL;
		tn_req->alignment_out = MEM_ALIGNMENT;

		int32_t mode_id = 0, tens_num = 0;
		for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
		const auto tens_id = iter->first;
		const auto & tens = iter->second;
		const auto tens_hash = tens.getTensor()->getTensorHash();
		const auto tens_vol = tens.getTensor()->getVolume();
		const auto tens_rank = tens.getRank();
		const auto tens_type = tens.getElementType();
		const auto & tens_legs = tens.getTensorLegs();
		const auto & tens_dims = tens.getDimExtents();

		auto res0 = tn_req->tensor_descriptors.emplace(std::make_pair(tens_hash,TensorDescriptor{}));
		if(res0.second){
		auto & descr = res0.first->second;
		descr.extents.resize(tens_rank);
		for(unsigned int i = 0; i < tens_rank; ++i) descr.extents[i] = tens_dims[i];
		descr.data_type = getCudaDataType(tens_type);
		descr.volume = tens_vol;
		descr.src_ptr = tensor_data_access_func_(*(tens.getTensor()),DEV_HOST,0,&(descr.size));
		assert(descr.src_ptr != nullptr);
		}

		auto res1 = tn_req->tensor_modes.emplace(std::make_pair(tens_id,std::vector<int32_t>(tens_rank)));
		assert(res1.second);
		for(unsigned int i = 0; i < tens_rank; ++i){
		const auto other_tens_id = tens_legs[i].getTensorId();
		const auto other_tens_leg_id = tens_legs[i].getDimensionId();
		auto other_tens_iter = tn_req->tensor_modes.find(other_tens_id);
		if(other_tens_iter == tn_req->tensor_modes.end()){
		res1.first->second[i] = ++mode_id;
		auto new_mode = tn_req->mode_extents.emplace(std::make_pair(mode_id,tens_dims[i]));
		}else{
		res1.first->second[i] = other_tens_iter->second[other_tens_leg_id];
		}
		}

		if(tens_id == 0){ //output tensor
		tn_req->num_modes_out = tens_rank;
		tn_req->extents_out = res0.first->second.extents.data();
		tn_req->modes_out = res1.first->second.data();
		}else{ //input tensors
		tn_req->num_modes_in[tens_num] = tens_rank;
		tn_req->extents_in[tens_num] = res0.first->second.extents.data();
		tn_req->modes_in[tens_num] = res1.first->second.data();
		++tens_num;
		}
		}

		HANDLE_CTN_ERROR(cutensornetCreateNetworkDescriptor(gpu_attr_[0].second.cutn_handle,num_input_tensors,
		tn_req->num_modes_in,tn_req->extents_in,tn_req->strides_in,tn_req->modes_in,tn_req->alignments_in,
		tn_req->num_modes_out,tn_req->extents_out,tn_req->strides_out,tn_req->modes_out,tn_req->alignment_out,
		tn_req->data_type,tn_req->compute_type,&(tn_req->net_descriptor)));

		HANDLE_CUDA_ERROR(cudaStreamCreate(&(tn_req->stream)));
		return;
		}

src/runtime/executor/cuquantum/cuquantum_executor.hpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -30,7 +30,7 @@ class Tensor;
		namespace exatn {
		namespace runtime {

		using TensorImplFunc = std::function<const void(const numerics::Tensor &, int, int, std::size_t )>;
		using TensorImplFunc = std::function<void(const numerics::Tensor &, int, int, std::size_t )>;
		using TensorImplTalshFunc = std::function<std::shared_ptr<talsh::Tensor>(const numerics::Tensor &, int, int)>;

		struct TensorNetworkReq;

src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -33,7 +33,7 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
		if(node_executor){
		cuquantum_executor_ = std::make_shared<CuQuantumExecutor>(
		[this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
		const void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
		void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
		return data_ptr;
		}
		);

src/runtime/executor/node_executors/exatensor/node_executor_exatensor.hpp

+5 −5

Original line number	Diff line number	Diff line
		/** ExaTN:: Tensor Runtime: Tensor graph node executor: Exatensor
		REVISION: 2021/12/27
		REVISION: 2021/12/30

		Copyright (C) 2018-2021 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
		Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
		@@ -87,7 +87,7 @@ public:
		std::shared_ptr<talsh::Tensor> getLocalTensor(const numerics::Tensor & tensor,
		const std::vector<std::pair<DimOffset,DimExtent>> & slice_spec) override;

		const void * getTensorImage(const numerics::Tensor & tensor,
		void * getTensorImage(const numerics::Tensor & tensor,
		int device_kind,
		int device_id,
		std::size_t * size = nullptr) const override {return nullptr;}

src/runtime/executor/node_executors/talsh/node_executor_talsh.cpp

+4 −4

Original line number	Diff line number	Diff line
		/** ExaTN:: Tensor Runtime: Tensor graph node executor: Talsh
		REVISION: 2021/12/27
		REVISION: 2021/12/30

		Copyright (C) 2018-2021 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
		Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
		@@ -1459,7 +1459,7 @@ std::shared_ptr<talsh::Tensor> TalshNodeExecutor::getLocalTensor(const numerics:
		}


		const void * TalshNodeExecutor::getTensorImage(const numerics::Tensor & tensor,
		void * TalshNodeExecutor::getTensorImage(const numerics::Tensor & tensor,
		int device_kind, int device_id,
		std::size_t * size) const
		{