Loading src/exatn/exatn_numerics.hpp +3 −3 Original line number Diff line number Diff line /** ExaTN::Numerics: General client header (free function API) REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ Loading Loading @@ -866,12 +866,12 @@ inline bool evaluateSync(const ProcessGroup & process_group, //in: chosen group /** Synchronizes all outstanding update operations on a given tensor specified by its symbolic name. If ProcessGroup is not provided, defaults to the local process.**/ inline bool sync(const std::string & name, //in: tensor name bool wait = true) //in: wait versus test for completion bool wait) //in: wait versus test for completion {return numericalServer->sync(name,wait);} inline bool sync(const ProcessGroup & process_group, //in: chosen group of MPI processes const std::string & name, //in: tensor name bool wait = true) //in: wait versus test for completion bool wait) //in: wait versus test for completion {return numericalServer->sync(process_group,name,wait);} Loading src/exatn/num_server.cpp +26 −15 Original line number Diff line number Diff line /** ExaTN::Numerics: Numerical server REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ Loading Loading @@ -198,10 +198,10 @@ void NumServer::reconfigureTensorRuntime(const ParamConf & parameters, void NumServer::switchComputationalBackend(const std::string & backend_name) { bool success = tensor_rt_->sync(); assert(success); //bool success = sync(); assert(success); if(logging_ > 0 && backend_name != comp_backend_){ logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Switching computational backend to " << backend_name << std::endl << std::flush; << "]: Switched computational backend to " << backend_name << std::endl << std::flush; } if(backend_name == "default"){ comp_backend_ = backend_name; Loading @@ -210,7 +210,8 @@ void NumServer::switchComputationalBackend(const std::string & backend_name) comp_backend_ = backend_name; #endif }else{ std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl; std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl << std::flush; std::abort(); } return; Loading Loading @@ -916,15 +917,16 @@ bool NumServer::submit(const ProcessGroup & process_group, assert(local_rank < num_procs); if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Submitting tensor network <" << network->getName() << "> (" << network->getTensor(0)->getName() << ") for execution via cuQuantum by " << num_procs << " processes with memory limit " << process_group.getMemoryLimitPerProcess() << " bytes" << std::endl << std::flush; << ":" << getTensorNetworkHash(network) << ") for execution via cuQuantum by " << num_procs << " processes with memory limit " << process_group.getMemoryLimitPerProcess() << " bytes\n" << std::flush; if(logging_ > 0) network->printItFile(logfile_); const auto exec_handle = tensor_rt_->submit(network,process_group.getMPICommProxy(),num_procs,local_rank); bool success = (exec_handle != 0); if(success){ auto res = tn_exec_handles_.emplace(std::make_pair(network->getTensor(0)->getTensorHash(),exec_handle)); success = res.second; if(success && logging_ > 0) logfile_ << "Number of submitted networks via cuQuantum = 1" << std::endl << std::flush; if(success && logging_ > 0) logfile_ << "Execution handle of the submitted network via cuQuantum is " << exec_handle << std::endl << std::flush; } return success; } Loading Loading @@ -1076,16 +1078,25 @@ bool NumServer::sync(const ProcessGroup & process_group, const Tensor & tensor, { bool success = true; if(!process_group.rankIsIn(process_rank_)) return success; //process is not in the group: Do nothing auto iter = tensors_.find(tensor.getName()); if(iter != tensors_.end()){ #ifdef CUQUANTUM if(comp_backend_ == "cuquantum"){ auto iter = tn_exec_handles_.find(tensor.getTensorHash()); bool synced = (iter == tn_exec_handles_.end()); if(!synced) synced = tensor_rt_->syncNetwork(iter->second,wait); return synced; auto cuter = tn_exec_handles_.find(iter->second->getTensorHash()); success = (cuter == tn_exec_handles_.end()); if(!success){ success = tensor_rt_->syncNetwork(cuter->second,wait); if(success){ if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Locally synchronized cuQuantum execution handle " << cuter->second << " via tensor <" << tensor.getName() << ">" << std::endl << std::flush; tn_exec_handles_.erase(cuter); } } return success; } #endif auto iter = tensors_.find(tensor.getName()); if(iter != tensors_.end()){ if(iter->second->isComposite()){ auto composite_tensor = castTensorComposite(iter->second); assert(composite_tensor); for(auto subtens = composite_tensor->begin(); subtens != composite_tensor->end(); ++subtens){ Loading src/exatn/tests/NumServerTester.cpp +8 −7 Original line number Diff line number Diff line Loading @@ -3791,7 +3791,7 @@ TEST(NumServerTester, CuTensorNet) { const int NUM_REPEATS = 1; exatn::resetLoggingLevel(1,2); //debug exatn::resetLoggingLevel(2,2); //debug bool success = true; Loading @@ -3807,23 +3807,24 @@ TEST(NumServerTester, CuTensorNet) { success = exatn::initTensorRnd("C"); assert(success); success = exatn::initTensor("D",0.0); assert(success); exatn::switchComputationalBackend("default"); success = exatn::sync(); assert(success); exatn::switchComputationalBackend("cuquantum"); //Contract tensor network: int num_repeats = NUM_REPEATS; while(--num_repeats >= 0){ success = exatn::sync(); assert(success); std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): "; auto flops = exatn::getTotalFlopCount(); auto time_start = exatn::Timer::timeInSecHR(); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D"); assert(success); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D",true); assert(success); auto duration = exatn::Timer::timeInSecHR(time_start); flops = exatn::getTotalFlopCount() - flops; std::cout << "Performance = " << (flops / (1e9 * duration)) << " Gflop/s" << std::endl; std::cout << "Duration = " << duration << " s; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; } //std::this_thread::sleep_for(std::chrono::microseconds(1000000)); //Destroy tensors: success = exatn::sync(); assert(success); success = exatn::destroyTensor("D"); assert(success); Loading src/numerics/tensor_basic.hpp +19 −19 Original line number Diff line number Diff line /** ExaTN: Tensor basic types and parameters REVISION: 2021/10/15 REVISION: 2022/01/07 Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/ Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ #ifndef EXATN_NUMERICS_TENSOR_BASIC_HPP_ #define EXATN_NUMERICS_TENSOR_BASIC_HPP_ Loading Loading @@ -63,22 +63,22 @@ enum class IndexKind{ //Basic tensor operations: enum class TensorOpCode{ NOOP, //no operation CREATE, //tensor creation DESTROY, //tensor destruction TRANSFORM, //tensor transformation/initialization SLICE, //tensor slicing INSERT, //tensor insertion ADD, //tensor addition CONTRACT, //tensor contraction DECOMPOSE_SVD3, //tensor decomposition via SVD into three tensor factors DECOMPOSE_SVD2, //tensor decomposition via SVD into two tensor factors ORTHOGONALIZE_SVD, //tensor orthogonalization via SVD ORTHOGONALIZE_MGS, //tensor orthogonalization via Modified Gram-Schmidt FETCH, //fetch tensor data from another MPI process (parallel execution only) UPLOAD, //upload tensor data to another MPI process (parallel execution only) BROADCAST, //tensor broadcast (parallel execution only) ALLREDUCE //tensor allreduce (parallel execution only) NOOP, //0: no operation CREATE, //1: tensor creation DESTROY, //2: tensor destruction TRANSFORM, //3: tensor transformation/initialization SLICE, //4: tensor slicing INSERT, //5: tensor insertion ADD, //6: tensor addition CONTRACT, //7: tensor contraction DECOMPOSE_SVD3, //8: tensor decomposition via SVD into three tensor factors DECOMPOSE_SVD2, //9: tensor decomposition via SVD into two tensor factors ORTHOGONALIZE_SVD, //10: tensor orthogonalization via SVD ORTHOGONALIZE_MGS, //11: tensor orthogonalization via Modified Gram-Schmidt FETCH, //12: fetch tensor data from another MPI process (parallel execution only) UPLOAD, //13: upload tensor data to another MPI process (parallel execution only) BROADCAST, //14: tensor broadcast (parallel execution only) ALLREDUCE //15: tensor allreduce (parallel execution only) }; Loading src/runtime/executor/cuquantum/cuquantum_executor.cu +3 −2 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -114,7 +114,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth, unsigned int num_processes, unsigned int process_rank): tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth), num_processes_(num_processes), process_rank_(process_rank) pipe_depth_(pipeline_depth), num_processes_(num_processes), process_rank_(process_rank), flops_(0.0) { static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)"); Loading Loading @@ -442,6 +442,7 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT, &flops,sizeof(flops))); flops_ += flops; } tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning; return; Loading Loading
src/exatn/exatn_numerics.hpp +3 −3 Original line number Diff line number Diff line /** ExaTN::Numerics: General client header (free function API) REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ Loading Loading @@ -866,12 +866,12 @@ inline bool evaluateSync(const ProcessGroup & process_group, //in: chosen group /** Synchronizes all outstanding update operations on a given tensor specified by its symbolic name. If ProcessGroup is not provided, defaults to the local process.**/ inline bool sync(const std::string & name, //in: tensor name bool wait = true) //in: wait versus test for completion bool wait) //in: wait versus test for completion {return numericalServer->sync(name,wait);} inline bool sync(const ProcessGroup & process_group, //in: chosen group of MPI processes const std::string & name, //in: tensor name bool wait = true) //in: wait versus test for completion bool wait) //in: wait versus test for completion {return numericalServer->sync(process_group,name,wait);} Loading
src/exatn/num_server.cpp +26 −15 Original line number Diff line number Diff line /** ExaTN::Numerics: Numerical server REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ Loading Loading @@ -198,10 +198,10 @@ void NumServer::reconfigureTensorRuntime(const ParamConf & parameters, void NumServer::switchComputationalBackend(const std::string & backend_name) { bool success = tensor_rt_->sync(); assert(success); //bool success = sync(); assert(success); if(logging_ > 0 && backend_name != comp_backend_){ logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Switching computational backend to " << backend_name << std::endl << std::flush; << "]: Switched computational backend to " << backend_name << std::endl << std::flush; } if(backend_name == "default"){ comp_backend_ = backend_name; Loading @@ -210,7 +210,8 @@ void NumServer::switchComputationalBackend(const std::string & backend_name) comp_backend_ = backend_name; #endif }else{ std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl; std::cout << "#ERROR(exatn::NumServer): switchComputationalBackend: Unknown backend: " << backend_name << std::endl << std::flush; std::abort(); } return; Loading Loading @@ -916,15 +917,16 @@ bool NumServer::submit(const ProcessGroup & process_group, assert(local_rank < num_procs); if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Submitting tensor network <" << network->getName() << "> (" << network->getTensor(0)->getName() << ") for execution via cuQuantum by " << num_procs << " processes with memory limit " << process_group.getMemoryLimitPerProcess() << " bytes" << std::endl << std::flush; << ":" << getTensorNetworkHash(network) << ") for execution via cuQuantum by " << num_procs << " processes with memory limit " << process_group.getMemoryLimitPerProcess() << " bytes\n" << std::flush; if(logging_ > 0) network->printItFile(logfile_); const auto exec_handle = tensor_rt_->submit(network,process_group.getMPICommProxy(),num_procs,local_rank); bool success = (exec_handle != 0); if(success){ auto res = tn_exec_handles_.emplace(std::make_pair(network->getTensor(0)->getTensorHash(),exec_handle)); success = res.second; if(success && logging_ > 0) logfile_ << "Number of submitted networks via cuQuantum = 1" << std::endl << std::flush; if(success && logging_ > 0) logfile_ << "Execution handle of the submitted network via cuQuantum is " << exec_handle << std::endl << std::flush; } return success; } Loading Loading @@ -1076,16 +1078,25 @@ bool NumServer::sync(const ProcessGroup & process_group, const Tensor & tensor, { bool success = true; if(!process_group.rankIsIn(process_rank_)) return success; //process is not in the group: Do nothing auto iter = tensors_.find(tensor.getName()); if(iter != tensors_.end()){ #ifdef CUQUANTUM if(comp_backend_ == "cuquantum"){ auto iter = tn_exec_handles_.find(tensor.getTensorHash()); bool synced = (iter == tn_exec_handles_.end()); if(!synced) synced = tensor_rt_->syncNetwork(iter->second,wait); return synced; auto cuter = tn_exec_handles_.find(iter->second->getTensorHash()); success = (cuter == tn_exec_handles_.end()); if(!success){ success = tensor_rt_->syncNetwork(cuter->second,wait); if(success){ if(logging_ > 0) logfile_ << "[" << std::fixed << std::setprecision(6) << exatn::Timer::timeInSecHR(getTimeStampStart()) << "]: Locally synchronized cuQuantum execution handle " << cuter->second << " via tensor <" << tensor.getName() << ">" << std::endl << std::flush; tn_exec_handles_.erase(cuter); } } return success; } #endif auto iter = tensors_.find(tensor.getName()); if(iter != tensors_.end()){ if(iter->second->isComposite()){ auto composite_tensor = castTensorComposite(iter->second); assert(composite_tensor); for(auto subtens = composite_tensor->begin(); subtens != composite_tensor->end(); ++subtens){ Loading
src/exatn/tests/NumServerTester.cpp +8 −7 Original line number Diff line number Diff line Loading @@ -3791,7 +3791,7 @@ TEST(NumServerTester, CuTensorNet) { const int NUM_REPEATS = 1; exatn::resetLoggingLevel(1,2); //debug exatn::resetLoggingLevel(2,2); //debug bool success = true; Loading @@ -3807,23 +3807,24 @@ TEST(NumServerTester, CuTensorNet) { success = exatn::initTensorRnd("C"); assert(success); success = exatn::initTensor("D",0.0); assert(success); exatn::switchComputationalBackend("default"); success = exatn::sync(); assert(success); exatn::switchComputationalBackend("cuquantum"); //Contract tensor network: int num_repeats = NUM_REPEATS; while(--num_repeats >= 0){ success = exatn::sync(); assert(success); std::cout << "D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y): "; auto flops = exatn::getTotalFlopCount(); auto time_start = exatn::Timer::timeInSecHR(); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D"); assert(success); success = exatn::evaluateTensorNetwork("cuNet","D(m,x,n,y)+=A(m,h,k,n)*B(u,k,h)*C(x,u,y)"); assert(success); success = exatn::sync("D",true); assert(success); auto duration = exatn::Timer::timeInSecHR(time_start); flops = exatn::getTotalFlopCount() - flops; std::cout << "Performance = " << (flops / (1e9 * duration)) << " Gflop/s" << std::endl; std::cout << "Duration = " << duration << " s; Performance = " << (flops / (1e9 * duration)) << " Gflop/s\n"; } //std::this_thread::sleep_for(std::chrono::microseconds(1000000)); //Destroy tensors: success = exatn::sync(); assert(success); success = exatn::destroyTensor("D"); assert(success); Loading
src/numerics/tensor_basic.hpp +19 −19 Original line number Diff line number Diff line /** ExaTN: Tensor basic types and parameters REVISION: 2021/10/15 REVISION: 2022/01/07 Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/ Copyright (C) 2018-2022 Dmitry I. Lyakh (Liakh) Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) **/ #ifndef EXATN_NUMERICS_TENSOR_BASIC_HPP_ #define EXATN_NUMERICS_TENSOR_BASIC_HPP_ Loading Loading @@ -63,22 +63,22 @@ enum class IndexKind{ //Basic tensor operations: enum class TensorOpCode{ NOOP, //no operation CREATE, //tensor creation DESTROY, //tensor destruction TRANSFORM, //tensor transformation/initialization SLICE, //tensor slicing INSERT, //tensor insertion ADD, //tensor addition CONTRACT, //tensor contraction DECOMPOSE_SVD3, //tensor decomposition via SVD into three tensor factors DECOMPOSE_SVD2, //tensor decomposition via SVD into two tensor factors ORTHOGONALIZE_SVD, //tensor orthogonalization via SVD ORTHOGONALIZE_MGS, //tensor orthogonalization via Modified Gram-Schmidt FETCH, //fetch tensor data from another MPI process (parallel execution only) UPLOAD, //upload tensor data to another MPI process (parallel execution only) BROADCAST, //tensor broadcast (parallel execution only) ALLREDUCE //tensor allreduce (parallel execution only) NOOP, //0: no operation CREATE, //1: tensor creation DESTROY, //2: tensor destruction TRANSFORM, //3: tensor transformation/initialization SLICE, //4: tensor slicing INSERT, //5: tensor insertion ADD, //6: tensor addition CONTRACT, //7: tensor contraction DECOMPOSE_SVD3, //8: tensor decomposition via SVD into three tensor factors DECOMPOSE_SVD2, //9: tensor decomposition via SVD into two tensor factors ORTHOGONALIZE_SVD, //10: tensor orthogonalization via SVD ORTHOGONALIZE_MGS, //11: tensor orthogonalization via Modified Gram-Schmidt FETCH, //12: fetch tensor data from another MPI process (parallel execution only) UPLOAD, //13: upload tensor data to another MPI process (parallel execution only) BROADCAST, //14: tensor broadcast (parallel execution only) ALLREDUCE //15: tensor allreduce (parallel execution only) }; Loading
src/runtime/executor/cuquantum/cuquantum_executor.cu +3 −2 Original line number Diff line number Diff line /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum REVISION: 2022/01/07 REVISION: 2022/01/08 Copyright (C) 2018-2022 Dmitry Lyakh Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle) Loading Loading @@ -114,7 +114,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth, unsigned int num_processes, unsigned int process_rank): tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth), num_processes_(num_processes), process_rank_(process_rank) pipe_depth_(pipeline_depth), num_processes_(num_processes), process_rank_(process_rank), flops_(0.0) { static_assert(std::is_same<cutensornetHandle_t,void*>::value,"#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)"); Loading Loading @@ -442,6 +442,7 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req) tn_req->opt_info, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT, &flops,sizeof(flops))); flops_ += flops; } tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning; return; Loading