Commit 3b64f0f6 authored by Dmitry I. Lyakh

Fixed excessive synchronization problem in contractTensors



Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>
parent 97bb5dc8
Pipeline #165999 failed in 5 minutes and 46 seconds
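
In short, this commit makes the collective process-group synchronizations in contractTensors conditional: they are now issued only when temporary tensors with a different distributed layout were actually introduced (tracked by a redistribution flag), and only in MPI-enabled builds. Below is a minimal, self-contained sketch of that guard pattern; the names sync(), submit() and contractWithOptionalRedistribution() are placeholders for illustration, not the actual ExaTN calls.

```cpp
// Sketch of the guarded-synchronization pattern introduced by this commit
// (hypothetical names; sync()/submit() stand in for the real ExaTN calls).
#include <iostream>

bool sync()   {return true;}   // placeholder for a collective process-group sync
bool submit() {return true;}   // placeholder for submitting the tensor contraction

bool contractWithOptionalRedistribution(bool operand_replaced_by_temporary) {
  bool ok = true;
  bool redistribution = false;         // set only if a temporary with a new
  if (operand_replaced_by_temporary) { // distributed layout was introduced
    redistribution = true;
    // ... create the temporary tensor and copy the original data into it ...
  }
#ifdef MPI_ENABLED
  if (ok && redistribution) ok = sync();  // collective sync only when needed
#endif
  if (ok) ok = submit();                  // otherwise proceed without extra syncs
  return ok;
}

int main() {
  std::cout << std::boolalpha << contractWithOptionalRedistribution(false) << "\n";
}
```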
......@@ -7,23 +7,12 @@ BUGS:
FEATURES:
- Limit the DAG size:
Option 1: Once the limit has been reached, synchronize and clear it.
Option 2: Once the limit has been reached, create a new empty
DAG and start adding new operations to it while the
previous DAG finishes executing. Once the previous DAG
has completed execution, destroy it.
Issues: Tensor instruction numbering must be global, so that
every time a DAG is cleared a global offset is added
to the tensor instruction id numbering (see the sketch after this list).
- GPU-RAM persistence:
1. Create large tensors on GPU;
2. Use COPY_T for smaller tensors;
3. Cache eviction policy: Evict minimally necessary ones (prefer smaller tensors);
4. Limit the Host buffer size to be similar to the combined device buffer size;
4. Limit the Host buffer size to be similar to the combined devices buffer size;
5. Introduce manual knobs to keep tensors on GPU;
- Implement tensor operator builders.
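
A minimal sketch of Option 2 above (DAG rotation with globally unique instruction ids), under the assumption that a simple id offset is carried over to each new DAG; all names here (TensorOp, OpDag, DagRotator) are hypothetical and not part of the ExaTN API:

```cpp
// Hypothetical sketch (not ExaTN API): rotate to a fresh DAG once a size limit
// is reached while keeping tensor-instruction ids globally unique via an offset.
#include <cstddef>
#include <memory>
#include <vector>

struct TensorOp {};                        // stand-in for a tensor operation

class OpDag {
public:
  explicit OpDag(std::size_t first_id): base_id_(first_id) {}
  std::size_t append(std::shared_ptr<TensorOp> op) {
    ops_.push_back(std::move(op));
    return base_id_ + ops_.size() - 1;     // globally unique instruction id
  }
  std::size_t size() const {return ops_.size();}
  std::size_t nextGlobalId() const {return base_id_ + ops_.size();}
private:
  std::size_t base_id_;                    // global id of the first op in this DAG
  std::vector<std::shared_ptr<TensorOp>> ops_;
};

class DagRotator {
public:
  explicit DagRotator(std::size_t limit): limit_(limit), current_(new OpDag(0)) {}
  std::size_t submit(std::shared_ptr<TensorOp> op) {
    if(current_->size() >= limit_){        // limit reached: open a new empty DAG
      retired_ = std::move(current_);      // previous DAG keeps executing
      current_.reset(new OpDag(retired_->nextGlobalId())); // global id offset
      waitForCompletion(*retired_);        // placeholder: drain the old DAG
      retired_.reset();                    // then destroy it
    }
    return current_->append(std::move(op));
  }
private:
  static void waitForCompletion(OpDag &) {} // placeholder for real execution sync
  std::size_t limit_;
  std::unique_ptr<OpDag> current_;          // DAG accepting new operations
  std::unique_ptr<OpDag> retired_;          // previous DAG still being executed
};

int main() {
  DagRotator dag(/*limit=*/1024);
  for(int i = 0; i < 4096; ++i) dag.submit(std::make_shared<TensorOp>());
  return 0;
}
```

In a real implementation the retired DAG would be drained asynchronously while the new DAG keeps accepting operations; the placeholder waits synchronously only to keep the sketch short.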
......
/** ExaTN::Numerics: Numerical server
REVISION: 2021/09/22
REVISION: 2021/09/24
Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
......@@ -1306,33 +1306,48 @@ bool NumServer::contractTensors(const std::string & contraction,
op->setIndexPattern(contraction);
op->setScalar(0,std::complex<double>(alpha));
//Create temporary tensors with optimized distributed layout and copy tensor data:
bool redistribution = false;
std::dynamic_pointer_cast<numerics::TensorOpContract>(op)->
introduceOptTemporaries(process_group.getSize(),process_group.getMemoryLimitPerProcess(),left_inds,right_inds,contr_inds,hyper_inds);
auto tensor0a = op->getTensorOperand(0);
auto tensor1a = op->getTensorOperand(1);
auto tensor2a = op->getTensorOperand(2);
if(parsed && tensor0a != tensor0){
//std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 0" << std::endl; //debug
redistribution = true;
parsed = createTensorSync(tensor0a,tensor0->getElementType());
if(parsed) parsed = copyTensor(tensor0a->getName(),tensor0->getName());
}
if(parsed && tensor1a != tensor1){
//std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 1" << std::endl; //debug
redistribution = true;
parsed = createTensorSync(tensor1a,tensor1->getElementType());
if(parsed) parsed = copyTensor(tensor1a->getName(),tensor1->getName());
}
if(parsed && tensor2a != tensor2){
//std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 2" << std::endl; //debug
redistribution = true;
parsed = createTensorSync(tensor2a,tensor2->getElementType());
if(parsed) parsed = copyTensor(tensor2a->getName(),tensor2->getName());
}
if(parsed) parsed = sync(process_group);
#ifdef MPI_ENABLED
if(parsed && redistribution) parsed = sync(process_group);
#endif
//Submit tensor contraction for execution:
parsed = submit(op,getTensorMapper(process_group));
if(parsed) parsed = submit(op,getTensorMapper(process_group));
//Copy the result back:
if(parsed) parsed = sync(process_group);
if(parsed && tensor0a != tensor0) parsed = copyTensor(tensor0->getName(),tensor0a->getName());
if(parsed && tensor0a != tensor0){
#ifdef MPI_ENABLED
if(redistribution) parsed = sync(process_group);
#endif
if(parsed) parsed = copyTensor(tensor0->getName(),tensor0a->getName());
}
//Destroy temporary tensors:
if(parsed && tensor2a != tensor2) parsed = destroyTensor(tensor2a->getName());
if(parsed && tensor1a != tensor1) parsed = destroyTensor(tensor1a->getName());
if(parsed) parsed = sync(process_group);
#ifdef MPI_ENABLED
if(parsed && redistribution) parsed = sync(process_group);
#endif
if(parsed && tensor0a != tensor0) parsed = destroyTensor(tensor0a->getName());
}else{
parsed = true;
......@@ -1415,14 +1430,17 @@ bool NumServer::contractTensorsSync(const std::string & contraction,
auto tensor2a = op->getTensorOperand(2);
//std::cout << "#DEBUG: " << tensor0a->getName() << " " << tensor1a->getName() << " " << tensor2a->getName() << std::endl; //debug
if(parsed && tensor0a != tensor0){
//std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 0" << std::endl; //debug
parsed = createTensorSync(tensor0a,tensor0->getElementType());
if(parsed) parsed = copyTensorSync(tensor0a->getName(),tensor0->getName());
}
if(parsed && tensor1a != tensor1){
//std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 1" << std::endl; //debug
parsed = createTensorSync(tensor1a,tensor1->getElementType());
if(parsed) parsed = copyTensorSync(tensor1a->getName(),tensor1->getName());
}
if(parsed && tensor2a != tensor2){
//std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 2" << std::endl; //debug
parsed = createTensorSync(tensor2a,tensor2->getElementType());
if(parsed) parsed = copyTensorSync(tensor2a->getName(),tensor2->getName());
}
......
......@@ -12,6 +12,8 @@
#include <ios>
#include <utility>
#include <numeric>
#include <chrono>
#include <thread>
#include "errors.hpp"
......@@ -62,16 +64,17 @@ TEST(NumServerTester, PerformanceExaTN)
//3072 for Maxwell, 4096 for Pascal and Volta
const auto TENS_ELEM_TYPE = TensorElementType::REAL32;
//exatn::resetLoggingLevel(1,2); //debug
//exatn::resetLoggingLevel(2,2); //debug
//exatn::resetExecutionSerialization(true,true); //debug
//exatn::activateFastMath(); //fast math (mixed-precision)
std::cout << "Contractions of rank-2 tensors:" << std::endl;
bool success = true;
std::cout << "Contractions of rank-2 tensors:" << std::endl;
//Create tensors:
std::cout << " Creating all tensors ... ";
success = exatn::createTensor("A",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
success = exatn::createTensor("B",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
success = exatn::createTensor("C",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
......@@ -81,8 +84,10 @@ TEST(NumServerTester, PerformanceExaTN)
success = exatn::createTensor("G",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
success = exatn::createTensor("H",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
success = exatn::createTensor("I",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
std::cout << "Done\n";
//Initialize tensors:
std::cout << " Initializing all tensors ... ";
success = exatn::initTensor("A",1e-4); assert(success);
success = exatn::initTensor("B",1e-3); assert(success);
success = exatn::initTensor("C",0.0); assert(success);
......@@ -92,43 +97,68 @@ TEST(NumServerTester, PerformanceExaTN)
success = exatn::initTensor("G",1e-4); assert(success);
success = exatn::initTensor("H",1e-3); assert(success);
success = exatn::initTensor("I",0.0); assert(success);
std::cout << "Done\n";
//Contract tensors (case 1):
std::cout << " Case 1: C=A*B five times: ";
exatn::sync();
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Contract tensors (case 0):
std::cout << " Case 0: C=A*B five times: Warm-up: ";
success = exatn::sync(); assert(success);
auto time_start = exatn::Timer::timeInSecHR();
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(i,k)*B(k,j)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(j,k)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(i,k)*B(j,k)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
exatn::sync();
success = exatn::sync(); assert(success);
auto duration = exatn::Timer::timeInSecHR(time_start);
std::cout << "Average performance (GFlop/s) = " << 5.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Contract tensors (case 1):
std::cout << " Case 1: C=A*B five times: Reuse: ";
success = exatn::sync(); assert(success);
time_start = exatn::Timer::timeInSecHR();
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(i,k)*B(k,j)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(j,k)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(i,k)*B(j,k)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
success = exatn::sync(); assert(success);
duration = exatn::Timer::timeInSecHR(time_start);
std::cout << "Average performance (GFlop/s) = " << 5.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Contract tensors (case 2):
std::cout << " Case 2: C=A*B | F=D*E | I=G*H pipeline: ";
exatn::sync();
std::cout << " Case 2: C=A*B | F=D*E | I=G*H: Pipeline: ";
success = exatn::sync(); assert(success);
time_start = exatn::Timer::timeInSecHR();
success = exatn::contractTensors("I(i,j)+=G(j,k)*H(i,k)",1.0); assert(success);
success = exatn::contractTensors("F(i,j)+=D(j,k)*E(i,k)",1.0); assert(success);
success = exatn::contractTensors("C(i,j)+=A(j,k)*B(i,k)",1.0); assert(success);
exatn::sync();
success = exatn::sync(); assert(success);
duration = exatn::Timer::timeInSecHR(time_start);
std::cout << "Average performance (GFlop/s) = " << 3.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Contract tensors (case 3):
std::cout << " Case 3: I=A*B | I=D*E | I=G*H prefetch: ";
exatn::sync();
std::cout << " Case 3: I=A*B | I=D*E | I=G*H: Prefetch: ";
success = exatn::sync(); assert(success);
time_start = exatn::Timer::timeInSecHR();
success = exatn::contractTensors("I(i,j)+=G(j,k)*H(i,k)",1.0); assert(success);
success = exatn::contractTensors("I(i,j)+=D(j,k)*E(i,k)",1.0); assert(success);
success = exatn::contractTensors("I(i,j)+=A(j,k)*B(i,k)",1.0); assert(success);
exatn::sync();
success = exatn::sync(); assert(success);
duration = exatn::Timer::timeInSecHR(time_start);
std::cout << "Average performance (GFlop/s) = " << 3.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Destroy tensors:
std::cout << " Destroying all tensors ... ";
success = exatn::destroyTensor("I"); assert(success);
success = exatn::destroyTensor("H"); assert(success);
success = exatn::destroyTensor("G"); assert(success);
......@@ -138,34 +168,43 @@ TEST(NumServerTester, PerformanceExaTN)
success = exatn::destroyTensor("C"); assert(success);
success = exatn::destroyTensor("B"); assert(success);
success = exatn::destroyTensor("A"); assert(success);
std::cout << "Done\n";
exatn::sync();
success = exatn::sync(); assert(success);
//Create tensors:
std::cout << " Creating all tensors ... ";
success = exatn::createTensor("A",TENS_ELEM_TYPE,TensorShape{DIM,DIM,32ULL}); assert(success);
success = exatn::createTensor("B",TENS_ELEM_TYPE,TensorShape{DIM,DIM,32ULL}); assert(success);
success = exatn::createTensor("C",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
std::cout << "Done\n";
//Initialize tensors:
std::cout << " Initializing all tensors ... ";
success = exatn::initTensor("A",1e-4); assert(success);
success = exatn::initTensor("B",1e-3); assert(success);
success = exatn::initTensor("C",0.0); assert(success);
std::cout << "Done\n";
//Contract tensors:
std::cout << " Case 4: C=A*B out-of-core large dims: ";
exatn::sync();
std::cout << " Case 4: C=A*B: Out-of-core large dims: ";
success = exatn::sync(); assert(success);
time_start = exatn::Timer::timeInSecHR();
success = exatn::contractTensors("C(i,j)+=A(j,k,l)*B(i,k,l)",1.0); assert(success);
exatn::sync();
success = exatn::sync(); assert(success);
duration = exatn::Timer::timeInSecHR(time_start);
std::cout << "Average performance (GFlop/s) = " << 2.0*double{DIM}*double{DIM}*double{DIM}*double{32}/duration/1e9 << std::endl;
std::this_thread::sleep_for(std::chrono::microseconds(100000));
//Destroy tensors:
std::cout << " Destroying all tensors ... ";
success = exatn::destroyTensor("C"); assert(success);
success = exatn::destroyTensor("B"); assert(success);
success = exatn::destroyTensor("A"); assert(success);
std::cout << "Done\n";
exatn::sync();
success = exatn::sync(); assert(success);
/* REQUIRES at least 48 GB Host RAM:
//Create tensors:
......@@ -227,7 +266,7 @@ TEST(NumServerTester, PerformanceExaTN)
//Synchronize ExaTN server:
exatn::sync();
//exatn::resetLoggingLevel(0,0);
exatn::resetLoggingLevel(0,0);
}
#endif
......@@ -3083,7 +3122,7 @@ TEST(NumServerTester, TensorComposite) {
const auto TENS_ELEM_TYPE = TensorElementType::COMPLEX32;
exatn::resetLoggingLevel(2,2); //debug
//exatn::resetLoggingLevel(2,2); //debug
bool success = true;
......@@ -3173,13 +3212,13 @@ TEST(NumServerTester, TensorComposite) {
norm = 0.0;
success = exatn::computeNorm2Sync("B",norm); assert(success);
std::cout << "2-norm of tensor B = " << (norm * norm) << std::endl;
#if 0
//Contract composite tensors:
success = exatn::contractTensorsSync("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
norm = 0.0;
success = exatn::computeNorm1Sync("C",norm); assert(success);
std::cout << "1-norm of tensor C after contraction = " << norm << std::endl;
#endif
//Destroy composite tensors:
success = exatn::sync(); assert(success);
//success = exatn::destroyTensorSync("C"); assert(success); //let the garbage collector do it
......@@ -3198,7 +3237,7 @@ int main(int argc, char **argv) {
exatn::ParamConf exatn_parameters;
//Set the available CPU Host RAM size to be used by ExaTN:
exatn_parameters.setParameter("host_memory_buffer_size",2L*1024L*1024L*1024L);
exatn_parameters.setParameter("host_memory_buffer_size",4L*1024L*1024L*1024L);
#ifdef MPI_ENABLED
int thread_provided;
int mpi_error = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_provided);
......
/** ExaTN::Numerics: Tensor operation: Contracts two tensors and accumulates the result into another tensor
REVISION: 2021/08/24
REVISION: 2021/09/24
Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
......@@ -61,18 +61,20 @@ DimExtent TensorOpContract::getCombinedDimExtent(IndexKind index_kind) const
{
assert(index_info_);
DimExtent dim_ext = 1;
const auto & tensor0 = *getTensorOperand(0);
const auto & tensor1 = *getTensorOperand(1);
switch(index_kind){
case IndexKind::LEFT:
for(const auto & ind: index_info_->left_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
for(const auto & ind: index_info_->left_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
break;
case IndexKind::RIGHT:
for(const auto & ind: index_info_->right_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
for(const auto & ind: index_info_->right_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
break;
case IndexKind::CONTR:
for(const auto & ind: index_info_->contr_indices_) dim_ext *= getTensorOperand(1)->getDimExtent(ind.arg_pos[1]);
for(const auto & ind: index_info_->contr_indices_) dim_ext *= tensor1.getDimExtent(ind.arg_pos[1]);
break;
case IndexKind::HYPER:
for(const auto & ind: index_info_->hyper_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
for(const auto & ind: index_info_->hyper_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
break;
default:
assert(false);
......@@ -100,6 +102,7 @@ void TensorOpContract::determineNumBisections(unsigned int num_processes,
std::size_t proc_count = num_processes;
std::size_t mem_count = mem_per_process * num_processes;
//Decides whether or not to replicate the tensor at a specific stage of decomposition:
auto dfs = [](std::size_t mem_lim, std::size_t tens_size){
return (tens_size > static_cast<std::size_t>(static_cast<double>(mem_lim)*replication_threshold));
};
......@@ -163,13 +166,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
//Set the splitting depth for the left indices:
unsigned int split_left = 0;
if(bisect_left > 0){
const auto & tensor = *getTensorOperand(0);
auto & indices = index_info_->left_indices_;
int i = indices.size();
auto n = bisect_left;
while(n-- > 0){
if(--i < 0) i = (indices.size() - 1);
const unsigned int new_depth = indices[i].depth + 1;
if(getTensorOperand(0)->getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if(tensor.getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if((indices[i].depth)++ == 0) ++split_left;
}
}
......@@ -177,13 +181,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
//Set the splitting depth for the right indices:
unsigned int split_right = 0;
if(bisect_right > 0){
const auto & tensor = *getTensorOperand(0);
auto & indices = index_info_->right_indices_;
int i = indices.size();
auto n = bisect_right;
while(n-- > 0){
if(--i < 0) i = (indices.size() - 1);
const unsigned int new_depth = indices[i].depth + 1;
if(getTensorOperand(0)->getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if(tensor.getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if((indices[i].depth)++ == 0) ++split_right;
}
}
......@@ -191,13 +196,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
//Set the splitting depth for the contracted indices:
unsigned int split_contr = 0;
if(bisect_contr > 0){
const auto & tensor = *getTensorOperand(1);
auto & indices = index_info_->contr_indices_;
int i = indices.size();
auto n = bisect_contr;
while(n-- > 0){
if(--i < 0) i = (indices.size() - 1);
const unsigned int new_depth = indices[i].depth + 1;
if(getTensorOperand(1)->getDimExtent(indices[i].arg_pos[1]) >= (1U << new_depth)){
if(tensor.getDimExtent(indices[i].arg_pos[1]) >= (1U << new_depth)){
if((indices[i].depth)++ == 0) ++split_contr;
}
}
......@@ -205,13 +211,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
//Set the splitting depth for the hyper indices:
unsigned int split_hyper = 0;
if(bisect_hyper > 0){
const auto & tensor = *getTensorOperand(0);
auto & indices = index_info_->hyper_indices_;
int i = indices.size();
auto n = bisect_hyper;
while(n-- > 0){
if(--i < 0) i = (indices.size() - 1);
const unsigned int new_depth = indices[i].depth + 1;
if(getTensorOperand(0)->getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if(tensor.getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
if((indices[i].depth)++ == 0) ++split_hyper;
}
}
......@@ -220,7 +227,7 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
//Replace original tensor operands with new ones with proper decomposition:
//Destination tensor operand:
bool operand_is_composite = getTensorOperand(0)->isComposite();
if(split_left > 0 || split_right > 0 || split_hyper > 0 || operand_is_composite){
if(operand_is_composite && (split_left > 0 || split_right > 0 || split_hyper > 0)){
std::vector<std::pair<unsigned int, unsigned int>> split_dims(split_left+split_right+split_hyper);
unsigned int i = 0;
for(const auto & ind: index_info_->left_indices_){
......@@ -245,7 +252,7 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
}
//Left tensor operand:
operand_is_composite = getTensorOperand(1)->isComposite();
if(split_contr > 0 || split_left > 0 || split_hyper > 0 || operand_is_composite){
if(operand_is_composite && (split_contr > 0 || split_left > 0 || split_hyper > 0)){
std::vector<std::pair<unsigned int, unsigned int>> split_dims(split_contr+split_left+split_hyper);
unsigned int i = 0;
for(const auto & ind: index_info_->contr_indices_){
......@@ -270,7 +277,7 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
}
//Right tensor operand:
operand_is_composite = getTensorOperand(2)->isComposite();
if(split_contr > 0 || split_right > 0 || split_hyper > 0 || operand_is_composite){
if(operand_is_composite && (split_contr > 0 || split_right > 0 || split_hyper > 0)){
std::vector<std::pair<unsigned int, unsigned int>> split_dims(split_contr+split_right+split_hyper);
unsigned int i = 0;
for(const auto & ind: index_info_->contr_indices_){
......
/** ExaTN::Numerics: Tensor operation: Contracts two tensors and accumulates the result into another tensor
REVISION: 2021/08/24
REVISION: 2021/09/24
Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
......@@ -81,9 +81,9 @@ public:
protected:
struct Bisect{
IndexKind index_kind;
unsigned int process_count;
bool serial;
IndexKind index_kind; //index kind
unsigned int process_count; //number of associated processes
bool serial; //DFS/BFS
};
//Information on all indices of a tensor contraction:
......@@ -92,8 +92,8 @@ protected:
std::vector<PosIndexLabel> right_indices_;
std::vector<PosIndexLabel> contr_indices_;
std::vector<PosIndexLabel> hyper_indices_;
std::map<std::string,const PosIndexLabel*> index_map_;
std::list<Bisect> bisect_sequence_;
std::map<std::string,const PosIndexLabel*> index_map_; //index name --> pointer to its description
std::list<Bisect> bisect_sequence_; //pre-calculated sequence of bisections
//Constructor:
IndexInfo(const std::vector<PosIndexLabel> & left_indices, //in: extracted left indices
......
/** ExaTN:: Tensor Runtime: Tensor graph node executor: Talsh
REVISION: 2021/07/27
REVISION: 2021/09/24
Copyright (C) 2018-2021 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
......@@ -1505,6 +1505,8 @@ void TalshNodeExecutor::cacheMovedTensors(talsh::TensorTask & talsh_task)
if(arg_coherence == COPY_M || arg_coherence == COPY_K){ //move or copy of tensor body image
auto res = accel_cache_[device].emplace(std::make_pair(const_cast<talsh::Tensor*>(talsh_task.getTensorArgument(oprnd)),
CachedAttr{exatn::Timer::timeInSecHR()}));
//std::cout << "#DEBUG(exatn::runtime::node_executor_talsh): CACHE: Device " << device
// << " cached " << res.first->first << std::endl << std::flush; //debug
}
}
}
......@@ -1542,6 +1544,8 @@ bool TalshNodeExecutor::evictMovedTensors(int device_id, std::size_t required_sp
auto task_res = evictions_.emplace(std::make_pair(iter->first,
std::make_shared<talsh::TensorTask>()));
if(task_res.second){
//std::cout << "#DEBUG(exatn::runtime::node_executor_talsh): CACHE: Device " << dev
// << " evicting " << iter->first << std::endl << std::flush; //debug
bool synced = iter->first->sync(task_res.first->second.get(),DEV_HOST,0,nullptr,!single_device); //initiate a move of the tensor body image back to Host
iter = accel_cache_[dev].erase(iter);
freed_bytes += talsh_tens_size;
......