Commit edacaf70 authored by Dmitry I. Lyakh's avatar Dmitry I. Lyakh
Browse files

Finished CuQuantum executor backend integration, except final allreduce.

parent 037ce2d1
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ ISSUES:
  That is, the order of tensor operations across all participating
  processes must be consistent such that every encountered global
  tensor operation will receive the same tensor operand irrespective
  of the difference in the locally generated tensor name. Special
  of the difference in the locally generated tensor names. Special
  care needs to be taken in iterating over associative tensor containers,
 to ensure that the keys are consistent across all participating
  processes. For example, automatically generated tensor names
@@ -21,9 +21,7 @@ ISSUES:

BUGS:

- 32-bit integer MPI message chunking issue in the backend.

- Fix the bug(s) in the tensor order reduction mechanism in the TalshExecutor backend.
- Fix the bug(s) in the tensor order reduction mechanism in the TalshNodeExecutor backend.


FEATURES:
@@ -39,11 +37,13 @@ FEATURES:
  Contract replaced tensors, then replace the contracted
  tensor with a new tensor (sub)network.

- Implement the Renormalization procedure.

- Implement SAVE/LOAD API for TensorExpansion.

- Implement TensorNetwork slice computing Generator.

- Implement b-D procedure.
- Implement bl-D procedure.

- Implement conjugate gradient optimization procedure.

+9 −3
Original line number Diff line number Diff line
/** ExaTN::Numerics: General client header (free function API)
REVISION: 2021/10/30
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

/** Rationale:
 1. Vector space and subspace registration [spaces.hpp, space_register.hpp]:
@@ -1086,6 +1086,12 @@ inline std::shared_ptr<exatn::TensorNetwork> makeTensorNetwork(const std::string
// INTERNAL CONTROL API //
//////////////////////////

 /** Switches the computational backend used for tensor network execution:
     {"default","cuquantum"}. Only applies to tensor network execution.
     Delegates to NumServer::switchComputationalBackend, which aborts
     on an unknown backend name. **/
inline void switchComputationalBackend(const std::string & backend_name)
 {return numericalServer->switchComputationalBackend(backend_name);}


/** Resets the tensor contraction sequence optimizer that is invoked
    when evaluating tensor networks: {dummy,heuro,greed,metis}. **/
inline void resetContrSeqOptimizer(const std::string & optimizer_name)
+64 −7
Original line number Diff line number Diff line
/** ExaTN::Numerics: Numerical server
REVISION: 2021/12/10
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

#include "num_server.hpp"
#include "tensor_range.hpp"
@@ -89,7 +89,8 @@ NumServer::NumServer(const MPICommProxy & communicator,
                     const ParamConf & parameters,
                     const std::string & graph_executor_name,
                     const std::string & node_executor_name):
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), intra_comm_(communicator), validation_tracing_(false)
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), comp_backend_("default"),
 intra_comm_(communicator), validation_tracing_(false)
{
 int mpi_error = MPI_Comm_size(*(communicator.get<MPI_Comm>()),&num_processes_); assert(mpi_error == MPI_SUCCESS);
 mpi_error = MPI_Comm_rank(*(communicator.get<MPI_Comm>()),&process_rank_); assert(mpi_error == MPI_SUCCESS);
@@ -117,7 +118,8 @@ NumServer::NumServer(const MPICommProxy & communicator,
NumServer::NumServer(const ParamConf & parameters,
                     const std::string & graph_executor_name,
                     const std::string & node_executor_name):
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), validation_tracing_(false)
 contr_seq_optimizer_("metis"), contr_seq_caching_(false), logging_(0), comp_backend_("default"),
 validation_tracing_(false)
{
 num_processes_ = 1; process_rank_ = 0; global_process_rank_ = 0;
 process_world_ = std::make_shared<ProcessGroup>(intra_comm_,num_processes_); //intra-communicator is empty here
@@ -194,6 +196,22 @@ void NumServer::reconfigureTensorRuntime(const ParamConf & parameters,
}
#endif

void NumServer::switchComputationalBackend(const std::string & backend_name)
{
 bool success = sync(); assert(success);
 if(backend_name == "default"){
  comp_backend_ = backend_name;
#ifdef CUQUANTUM
 }else if(backend_name == "cuquantum"){
  comp_backend_ = backend_name;
#endif
 }else{
  std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl;
  std::abort();
 }
 return;
}

void NumServer::resetContrSeqOptimizer(const std::string & optimizer_name, bool caching)
{
 contr_seq_optimizer_ = optimizer_name;
@@ -612,7 +630,7 @@ bool NumServer::submit(const ProcessGroup & process_group,
 //Determine parallel execution configuration:
 unsigned int local_rank; //local process rank within the process group
 if(!process_group.rankIsIn(process_rank_,&local_rank)) return true; //process is not in the group: Do nothing
 assert(network.isValid()); //debug
 //assert(network.isValid()); //debug
 unsigned int num_procs = process_group.getSize(); //number of executing processes
 assert(local_rank < num_procs);
 if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
@@ -883,6 +901,30 @@ bool NumServer::submit(const ProcessGroup & process_group,
/** Submits a whole tensor network for execution. When the "cuquantum"
    backend is active (and compiled in), the network is dispatched to the
    cuQuantum executor as a single unit and its execution handle is recorded
    for later synchronization; otherwise the call forwards to the regular
    per-operation submission path. Returns TRUE on successful submission.
    BUGFIX: the null-network guard is now checked before the cuQuantum
    branch, which previously dereferenced a null pointer. **/
bool NumServer::submit(const ProcessGroup & process_group,
                       std::shared_ptr<TensorNetwork> network)
{
 if(!network) return false; //guard: the branches below dereference the network pointer
#ifdef CUQUANTUM
 //Try execution via an alternative computational backend:
 if(comp_backend_ == "cuquantum"){
  //Determine parallel execution configuration:
  unsigned int local_rank; //local process rank within the process group
  if(!process_group.rankIsIn(process_rank_,&local_rank)) return true; //process is not in the group: Do nothing
  //assert(network->isValid()); //debug
  unsigned int num_procs = process_group.getSize(); //number of executing processes
  assert(local_rank < num_procs);
  if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
                            << "]: Submitting tensor network <" << network->getName() << "> (" << network->getTensor(0)->getName()
                            << ") for execution via cuQuantum by " << num_procs << " processes with memory limit "
                            << process_group.getMemoryLimitPerProcess() << " bytes" << std::endl << std::flush;
  if(logging_ > 0) network->printItFile(logfile_);
  //Submit the entire network to the cuQuantum-capable runtime:
  const auto exec_handle = tensor_rt_->submit(network,process_group.getMPICommProxy(),num_procs,local_rank);
  bool success = (exec_handle != 0);
  if(success){
   //Remember the execution handle keyed by the output-tensor hash (consumed by sync):
   auto res = tn_exec_handles_.emplace(std::make_pair(network->getTensor(0)->getTensorHash(),exec_handle));
   success = res.second;
   if(success && logging_ > 0) logfile_ << "Number of submitted networks via cuQuantum = 1" << std::endl << std::flush;
  }
  return success;
 }
#endif
 return submit(process_group,*network); //default backend: per-operation submission
}
@@ -1030,6 +1072,14 @@ bool NumServer::sync(const ProcessGroup & process_group, const Tensor & tensor,
{
 bool success = true;
 if(!process_group.rankIsIn(process_rank_)) return success; //process is not in the group: Do nothing
#ifdef CUQUANTUM
 if(comp_backend_ == "cuquantum"){
  auto iter = tn_exec_handles_.find(tensor.getTensorHash());
  bool synced = (iter == tn_exec_handles_.end());
  if(!synced) synced = tensor_rt_->syncNetwork(iter->second,wait);
  return synced;
 }
#endif
 auto iter = tensors_.find(tensor.getName());
 if(iter != tensors_.end()){
  if(iter->second->isComposite()){
@@ -1081,7 +1131,11 @@ bool NumServer::sync(const ProcessGroup & process_group, TensorNetwork & network

/** Synchronizes all outstanding tensor operations for the current process group.
    With the cuQuantum backend, a successful full sync also discards the
    recorded tensor network execution handles (they are no longer needed). **/
bool NumServer::sync(bool wait)
{
 return sync(getCurrentProcessGroup(),wait); //NOTE(review): pre-change line left over from the diff rendering; it makes the lines below unreachable — confirm against the committed file
 bool success = sync(getCurrentProcessGroup(),wait);
#ifdef CUQUANTUM
 //Completed cuQuantum network execution handles can be forgotten after a successful sync:
 if(comp_backend_ == "cuquantum" && success) tn_exec_handles_.clear();
#endif
 return success;
}

bool NumServer::sync(const ProcessGroup & process_group, bool wait)
@@ -1092,6 +1146,9 @@ bool NumServer::sync(const ProcessGroup & process_group, bool wait)
 if(success){
  if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart())
   << "]: Locally synchronized all operations" << std::endl << std::flush;
#ifdef CUQUANTUM
  if(comp_backend_ == "cuquantum") tn_exec_handles_.clear();
#endif
#ifdef MPI_ENABLED
  if(wait){
   auto errc = MPI_Barrier(process_group.getMPICommProxy().getRef<MPI_Comm>());
+19 −3
Original line number Diff line number Diff line
/** ExaTN::Numerics: Numerical server
REVISION: 2021/12/22
REVISION: 2022/01/07

Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/

/** Rationale:
 (a) Numerical server provides basic tensor network processing functionality:
@@ -270,6 +270,9 @@ public:
                               const std::string & node_executor_name);
#endif

 /** Switches the computational backend. **/
 void switchComputationalBackend(const std::string & backend_name);

 /** Resets the tensor contraction sequence optimizer that is
     invoked when evaluating tensor networks. **/
 void resetContrSeqOptimizer(const std::string & optimizer_name, //in: tensor contraction sequence optimizer name
@@ -1032,25 +1035,38 @@ protected:

private:

 //Spaces:
 std::shared_ptr<numerics::SpaceRegister> space_register_; //register of vector spaces and their named subspaces
 std::unordered_map<std::string,SpaceId> subname2id_; //maps a subspace name to its parental vector space id

 //Tensors:
 std::unordered_map<std::string,std::shared_ptr<Tensor>> tensors_; //registered tensors (by CREATE operation)
 std::map<std::string,std::shared_ptr<Tensor>> implicit_tensors_; //tensors created implicitly by the runtime (for garbage collection)
 std::unordered_map<std::string,ProcessGroup> tensor_comms_; //process group associated with each tensor

#ifdef CUQUANTUM
 //Tensor network execution handles:
 std::unordered_map<numerics::TensorHashType,runtime::TensorOpExecHandle> tn_exec_handles_;
#endif

 //Contraction path optimizer:
 std::string contr_seq_optimizer_; //tensor contraction sequence optimizer invoked when evaluating tensor networks
 bool contr_seq_caching_; //regulates whether or not to cache pseudo-optimal tensor contraction orders for later reuse

 //Registered external methods and data:
 std::map<std::string,std::shared_ptr<TensorMethod>> ext_methods_; //external tensor methods
 std::map<std::string,std::shared_ptr<BytePacket>> ext_data_; //external data

 //Program scopes:
 std::stack<std::pair<std::string,ScopeId>> scopes_; //TAProL scope stack: {Scope name, Scope Id}

 //Tensor operation factory:
 TensorOpFactory * tensor_op_factory_; //tensor operation factory (non-owning pointer)

 //Configuration:
 int logging_; //logging level
 std::ofstream logfile_; //log file
 std::string comp_backend_; //current computational backend
 int num_processes_; //total number of parallel processes in the dedicated MPI communicator
 int process_rank_; //rank of the current parallel process in the dedicated MPI communicator
 int global_process_rank_; //rank of the current parallel process in MPI_COMM_WORLD
+4 −1
Original line number Diff line number Diff line
@@ -3807,6 +3807,8 @@ TEST(NumServerTester, CuTensorNet) {
 success = exatn::initTensorRnd("C"); assert(success);
 success = exatn::initTensor("D",0.0); assert(success);

 exatn::switchComputationalBackend("default");

 //Contract tensor network:
 int num_repeats = NUM_REPEATS;
 while(--num_repeats >= 0){
@@ -3814,8 +3816,9 @@ TEST(NumServerTester, CuTensorNet) {
  std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): ";
  auto flops = exatn::getTotalFlopCount();
  auto time_start = exatn::Timer::timeInSecHR();
  success = exatn::evaluateTensorNetworkSync("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)");
  success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)");
  assert(success);
  success = exatn::sync("D"); assert(success);
  auto duration = exatn::Timer::timeInSecHR(time_start);
  flops = exatn::getTotalFlopCount() - flops;
  std::cout << "Performance = " << (flops / (1e9 * duration)) << " Gflop/s" << std::endl;
Loading