Commit 3b64f0f6 authored by Dmitry I. Lyakh

Fixed excessive synchronization problem in contractTensors



Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>
parent 97bb5dc8
Pipeline #165999 failed in 5 minutes and 46 seconds
@@ -7,23 +7,12 @@ BUGS:
 FEATURES:
-- Limit the DAG size:
-  Option 1: Once the limit has been reached, synchronize and clear it.
-  Option 2: Once the limit has been reached, create a new empty
-            DAG and start adding new operations to the new DAG
-            while completing the execution of the previous one.
-            Once execution of the previous one has been completed,
-            destroy the previous DAG.
-  Issues: Tensor instruction numeration must be global such that
-          every time a DAG is cleared a global increment will be
-          introduced in the tensor instruction id numbering.
 - GPU-RAM persistence:
   1. Create large tensors on GPU;
   2. Use COPY_T for smaller tensors;
   3. Cache eviction policy: Evict minimally necessary ones (prefer smaller tensors);
-  4. Limit the Host buffer size to be similar to the combined device buffer size;
-  5. Introduce manual knobs to keep tensors on GPU;
+  4. Limit the Host buffer size to be similar to the combined devices buffer size;
 - Implement tensor operator builders.
...
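A minimal sketch of "Option 2" from the DAG-size-limit item removed above: when the active DAG reaches its size limit, open a fresh DAG for new operations and retire the full one once its execution drains, keeping instruction ids globally unique across swaps. All names here (DagBuffer, DagManager, the TensorOperation stub, DAG_SIZE_LIMIT) are hypothetical, not the ExaTN API.

    #include <cstddef>
    #include <memory>
    #include <vector>

    struct TensorOperation {}; // stand-in for exatn::numerics::TensorOperation

    struct DagBuffer {
      std::vector<std::shared_ptr<TensorOperation>> ops; // queued operations
      std::size_t instr_id_offset = 0; // base of this DAG's instruction numbering
    };

    class DagManager {
      static constexpr std::size_t DAG_SIZE_LIMIT = 65536; // assumed limit
      std::unique_ptr<DagBuffer> active_{new DagBuffer};
      std::unique_ptr<DagBuffer> retiring_; // previous DAG, still executing
      std::size_t next_instr_id_ = 0;       // global tensor instruction id counter
    public:
      std::size_t append(std::shared_ptr<TensorOperation> op){
        if(active_->ops.size() >= DAG_SIZE_LIMIT){
          // No synchronization here: the old DAG keeps executing while new
          // operations accumulate in a fresh one (destroy it once drained).
          retiring_ = std::move(active_);
          active_.reset(new DagBuffer);
          active_->instr_id_offset = next_instr_id_; // the "global increment"
        }
        active_->ops.push_back(std::move(op));
        return next_instr_id_++; // ids stay globally unique across DAG swaps
      }
    };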
 /** ExaTN::Numerics: Numerical server
-REVISION: 2021/09/22
+REVISION: 2021/09/24
 Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
 Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
@@ -1306,33 +1306,48 @@ bool NumServer::contractTensors(const std::string & contraction,
   op->setIndexPattern(contraction);
   op->setScalar(0,std::complex<double>(alpha));
   //Create temporary tensors with optimized distributed layout and copy tensor data:
+  bool redistribution = false;
   std::dynamic_pointer_cast<numerics::TensorOpContract>(op)->
    introduceOptTemporaries(process_group.getSize(),process_group.getMemoryLimitPerProcess(),left_inds,right_inds,contr_inds,hyper_inds);
   auto tensor0a = op->getTensorOperand(0);
   auto tensor1a = op->getTensorOperand(1);
   auto tensor2a = op->getTensorOperand(2);
   if(parsed && tensor0a != tensor0){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 0" << std::endl; //debug
+   redistribution = true;
    parsed = createTensorSync(tensor0a,tensor0->getElementType());
    if(parsed) parsed = copyTensor(tensor0a->getName(),tensor0->getName());
   }
   if(parsed && tensor1a != tensor1){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 1" << std::endl; //debug
+   redistribution = true;
    parsed = createTensorSync(tensor1a,tensor1->getElementType());
    if(parsed) parsed = copyTensor(tensor1a->getName(),tensor1->getName());
   }
   if(parsed && tensor2a != tensor2){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensors): New temporary for tensor operand 2" << std::endl; //debug
+   redistribution = true;
    parsed = createTensorSync(tensor2a,tensor2->getElementType());
    if(parsed) parsed = copyTensor(tensor2a->getName(),tensor2->getName());
   }
-  if(parsed) parsed = sync(process_group);
+#ifdef MPI_ENABLED
+  if(parsed && redistribution) parsed = sync(process_group);
+#endif
   //Submit tensor contraction for execution:
-  parsed = submit(op,getTensorMapper(process_group));
+  if(parsed) parsed = submit(op,getTensorMapper(process_group));
   //Copy the result back:
-  if(parsed) parsed = sync(process_group);
-  if(parsed && tensor0a != tensor0) parsed = copyTensor(tensor0->getName(),tensor0a->getName());
+  if(parsed && tensor0a != tensor0){
+#ifdef MPI_ENABLED
+   if(redistribution) parsed = sync(process_group);
+#endif
+   if(parsed) parsed = copyTensor(tensor0->getName(),tensor0a->getName());
+  }
   //Destroy temporary tensors:
   if(parsed && tensor2a != tensor2) parsed = destroyTensor(tensor2a->getName());
   if(parsed && tensor1a != tensor1) parsed = destroyTensor(tensor1a->getName());
-  if(parsed) parsed = sync(process_group);
+#ifdef MPI_ENABLED
+  if(parsed && redistribution) parsed = sync(process_group);
+#endif
   if(parsed && tensor0a != tensor0) parsed = destroyTensor(tensor0a->getName());
  }else{
   parsed = true;
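The hunk above is the commit's actual fix: the three unconditional sync(process_group) calls become conditional, compiled only under MPI_ENABLED and executed only when an operand was actually redistributed into a temporary with a different distributed layout. A self-contained sketch of that guard pattern; synchronize_processes and submit_contraction are illustrative stand-ins, not ExaTN calls.

    #include <iostream>

    // Hypothetical stand-ins for the ExaTN internals used in the diff:
    static bool synchronize_processes(){ return true; } // would call sync(process_group)
    static bool submit_contraction(){ return true; }    // would call submit(op,...)

    // A sync is issued only when a temporary tensor with a new distributed
    // layout was created (redistribution == true), and only in MPI builds;
    // single-process runs skip the barrier entirely.
    bool contractWithMinimalSync(bool redistribution){
      bool ok = true;
    #ifdef MPI_ENABLED
      if(ok && redistribution) ok = synchronize_processes();
    #endif
      if(ok) ok = submit_contraction(); // fast path: purely local submission
      return ok;
    }

    int main(){
      std::cout << contractWithMinimalSync(false) << '\n'; // no barrier needed
    }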
@@ -1415,14 +1430,17 @@ bool NumServer::contractTensorsSync(const std::string & contraction,
   auto tensor2a = op->getTensorOperand(2);
   //std::cout << "#DEBUG: " << tensor0a->getName() << " " << tensor1a->getName() << " " << tensor2a->getName() << std::endl; //debug
   if(parsed && tensor0a != tensor0){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 0" << std::endl; //debug
    parsed = createTensorSync(tensor0a,tensor0->getElementType());
    if(parsed) parsed = copyTensorSync(tensor0a->getName(),tensor0->getName());
   }
   if(parsed && tensor1a != tensor1){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 1" << std::endl; //debug
    parsed = createTensorSync(tensor1a,tensor1->getElementType());
    if(parsed) parsed = copyTensorSync(tensor1a->getName(),tensor1->getName());
   }
   if(parsed && tensor2a != tensor2){
+   //std::cout << "#DEBUG(exatn::NumServer::contractTensorsSync): New temporary for tensor operand 2" << std::endl; //debug
    parsed = createTensorSync(tensor2a,tensor2->getElementType());
    if(parsed) parsed = copyTensorSync(tensor2a->getName(),tensor2->getName());
   }
...
@@ -12,6 +12,8 @@
 #include <ios>
 #include <utility>
 #include <numeric>
+#include <chrono>
+#include <thread>
 #include "errors.hpp"
@@ -62,16 +64,17 @@ TEST(NumServerTester, PerformanceExaTN)
 //3072 for Maxwell, 4096 for Pascal and Volta
 const auto TENS_ELEM_TYPE = TensorElementType::REAL32;
-//exatn::resetLoggingLevel(1,2); //debug
+//exatn::resetLoggingLevel(2,2); //debug
 //exatn::resetExecutionSerialization(true,true); //debug
 //exatn::activateFastMath(); //fast math (mixed-precision)
-std::cout << "Contractions of rank-2 tensors:" << std::endl;
 bool success = true;
+std::cout << "Contractions of rank-2 tensors:" << std::endl;
 //Create tensors:
+std::cout << " Creating all tensors ... ";
 success = exatn::createTensor("A",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
 success = exatn::createTensor("B",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
 success = exatn::createTensor("C",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
@@ -81,8 +84,10 @@ TEST(NumServerTester, PerformanceExaTN)
 success = exatn::createTensor("G",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
 success = exatn::createTensor("H",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
 success = exatn::createTensor("I",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
+std::cout << "Done\n";
 //Initialize tensors:
+std::cout << " Initializing all tensors ... ";
 success = exatn::initTensor("A",1e-4); assert(success);
 success = exatn::initTensor("B",1e-3); assert(success);
 success = exatn::initTensor("C",0.0); assert(success);
@@ -92,43 +97,68 @@ TEST(NumServerTester, PerformanceExaTN)
 success = exatn::initTensor("G",1e-4); assert(success);
 success = exatn::initTensor("H",1e-3); assert(success);
 success = exatn::initTensor("I",0.0); assert(success);
+std::cout << "Done\n";
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
-//Contract tensors (case 1):
-std::cout << " Case 1: C=A*B five times: ";
-exatn::sync();
+//Contract tensors (case 0):
+std::cout << " Case 0: C=A*B five times: Warm-up: ";
+success = exatn::sync(); assert(success);
 auto time_start = exatn::Timer::timeInSecHR();
 success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
 success = exatn::contractTensors("C(i,j)+=A(i,k)*B(k,j)",1.0); assert(success);
 success = exatn::contractTensors("C(i,j)+=A(k,i)*B(j,k)",1.0); assert(success);
 success = exatn::contractTensors("C(i,j)+=A(i,k)*B(j,k)",1.0); assert(success);
 success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
-exatn::sync();
+success = exatn::sync(); assert(success);
 auto duration = exatn::Timer::timeInSecHR(time_start);
 std::cout << "Average performance (GFlop/s) = " << 5.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
+//Contract tensors (case 1):
+std::cout << " Case 1: C=A*B five times: Reuse: ";
+success = exatn::sync(); assert(success);
+time_start = exatn::Timer::timeInSecHR();
+success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
+success = exatn::contractTensors("C(i,j)+=A(i,k)*B(k,j)",1.0); assert(success);
+success = exatn::contractTensors("C(i,j)+=A(k,i)*B(j,k)",1.0); assert(success);
+success = exatn::contractTensors("C(i,j)+=A(i,k)*B(j,k)",1.0); assert(success);
+success = exatn::contractTensors("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
+success = exatn::sync(); assert(success);
+duration = exatn::Timer::timeInSecHR(time_start);
+std::cout << "Average performance (GFlop/s) = " << 5.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
 //Contract tensors (case 2):
-std::cout << " Case 2: C=A*B | F=D*E | I=G*H pipeline: ";
-exatn::sync();
+std::cout << " Case 2: C=A*B | F=D*E | I=G*H: Pipeline: ";
+success = exatn::sync(); assert(success);
 time_start = exatn::Timer::timeInSecHR();
 success = exatn::contractTensors("I(i,j)+=G(j,k)*H(i,k)",1.0); assert(success);
 success = exatn::contractTensors("F(i,j)+=D(j,k)*E(i,k)",1.0); assert(success);
 success = exatn::contractTensors("C(i,j)+=A(j,k)*B(i,k)",1.0); assert(success);
-exatn::sync();
+success = exatn::sync(); assert(success);
 duration = exatn::Timer::timeInSecHR(time_start);
 std::cout << "Average performance (GFlop/s) = " << 3.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
 //Contract tensors (case 3):
-std::cout << " Case 3: I=A*B | I=D*E | I=G*H prefetch: ";
-exatn::sync();
+std::cout << " Case 3: I=A*B | I=D*E | I=G*H: Prefetch: ";
+success = exatn::sync(); assert(success);
 time_start = exatn::Timer::timeInSecHR();
 success = exatn::contractTensors("I(i,j)+=G(j,k)*H(i,k)",1.0); assert(success);
 success = exatn::contractTensors("I(i,j)+=D(j,k)*E(i,k)",1.0); assert(success);
 success = exatn::contractTensors("I(i,j)+=A(j,k)*B(i,k)",1.0); assert(success);
-exatn::sync();
+success = exatn::sync(); assert(success);
 duration = exatn::Timer::timeInSecHR(time_start);
 std::cout << "Average performance (GFlop/s) = " << 3.0*2.0*double{DIM}*double{DIM}*double{DIM}/duration/1e9 << std::endl;
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
 //Destroy tensors:
+std::cout << " Destroying all tensors ... ";
 success = exatn::destroyTensor("I"); assert(success);
 success = exatn::destroyTensor("H"); assert(success);
 success = exatn::destroyTensor("G"); assert(success);
@@ -138,34 +168,43 @@ TEST(NumServerTester, PerformanceExaTN)
 success = exatn::destroyTensor("C"); assert(success);
 success = exatn::destroyTensor("B"); assert(success);
 success = exatn::destroyTensor("A"); assert(success);
+std::cout << "Done\n";
-exatn::sync();
+success = exatn::sync(); assert(success);
 //Create tensors:
+std::cout << " Creating all tensors ... ";
 success = exatn::createTensor("A",TENS_ELEM_TYPE,TensorShape{DIM,DIM,32ULL}); assert(success);
 success = exatn::createTensor("B",TENS_ELEM_TYPE,TensorShape{DIM,DIM,32ULL}); assert(success);
 success = exatn::createTensor("C",TENS_ELEM_TYPE,TensorShape{DIM,DIM}); assert(success);
+std::cout << "Done\n";
 //Initialize tensors:
+std::cout << " Initializing all tensors ... ";
 success = exatn::initTensor("A",1e-4); assert(success);
 success = exatn::initTensor("B",1e-3); assert(success);
 success = exatn::initTensor("C",0.0); assert(success);
+std::cout << "Done\n";
 //Contract tensors:
-std::cout << " Case 4: C=A*B out-of-core large dims: ";
-exatn::sync();
+std::cout << " Case 4: C=A*B: Out-of-core large dims: ";
+success = exatn::sync(); assert(success);
 time_start = exatn::Timer::timeInSecHR();
 success = exatn::contractTensors("C(i,j)+=A(j,k,l)*B(i,k,l)",1.0); assert(success);
-exatn::sync();
+success = exatn::sync(); assert(success);
 duration = exatn::Timer::timeInSecHR(time_start);
 std::cout << "Average performance (GFlop/s) = " << 2.0*double{DIM}*double{DIM}*double{DIM}*double{32}/duration/1e9 << std::endl;
+std::this_thread::sleep_for(std::chrono::microseconds(100000));
 //Destroy tensors:
+std::cout << " Destroying all tensors ... ";
 success = exatn::destroyTensor("C"); assert(success);
 success = exatn::destroyTensor("B"); assert(success);
 success = exatn::destroyTensor("A"); assert(success);
+std::cout << "Done\n";
-exatn::sync();
+success = exatn::sync(); assert(success);
 /* REQUIRES at least 48 GB Host RAM:
 //Create tensors:
@@ -227,7 +266,7 @@ TEST(NumServerTester, PerformanceExaTN)
 //Synchronize ExaTN server:
 exatn::sync();
-//exatn::resetLoggingLevel(0,0);
+exatn::resetLoggingLevel(0,0);
 }
 #endif
@@ -3083,7 +3122,7 @@ TEST(NumServerTester, TensorComposite) {
 const auto TENS_ELEM_TYPE = TensorElementType::COMPLEX32;
-exatn::resetLoggingLevel(2,2); //debug
+//exatn::resetLoggingLevel(2,2); //debug
 bool success = true;
@@ -3173,13 +3212,13 @@ TEST(NumServerTester, TensorComposite) {
 norm = 0.0;
 success = exatn::computeNorm2Sync("B",norm); assert(success);
 std::cout << "2-norm of tensor B = " << (norm * norm) << std::endl;
+#if 0
 //Contract composite tensors:
 success = exatn::contractTensorsSync("C(i,j)+=A(k,i)*B(k,j)",1.0); assert(success);
 norm = 0.0;
 success = exatn::computeNorm1Sync("C",norm); assert(success);
 std::cout << "1-norm of tensor C after contraction = " << norm << std::endl;
+#endif
 //Destroy composite tensors:
 success = exatn::sync(); assert(success);
 //success = exatn::destroyTensorSync("C"); assert(success); //let the garbage collector do it
@@ -3198,7 +3237,7 @@ int main(int argc, char **argv) {
 exatn::ParamConf exatn_parameters;
 //Set the available CPU Host RAM size to be used by ExaTN:
-exatn_parameters.setParameter("host_memory_buffer_size",2L*1024L*1024L*1024L);
+exatn_parameters.setParameter("host_memory_buffer_size",4L*1024L*1024L*1024L);
 #ifdef MPI_ENABLED
 int thread_provided;
 int mpi_error = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_provided);
...
 /** ExaTN::Numerics: Tensor operation: Contracts two tensors and accumulates the result into another tensor
-REVISION: 2021/08/24
+REVISION: 2021/09/24
 Copyright (C) 2018-2021 Dmitry I. Lyakh (Liakh)
 Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle) **/
@@ -61,18 +61,20 @@ DimExtent TensorOpContract::getCombinedDimExtent(IndexKind index_kind) const
 {
  assert(index_info_);
  DimExtent dim_ext = 1;
+ const auto & tensor0 = *getTensorOperand(0);
+ const auto & tensor1 = *getTensorOperand(1);
  switch(index_kind){
  case IndexKind::LEFT:
-  for(const auto & ind: index_info_->left_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
+  for(const auto & ind: index_info_->left_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
   break;
  case IndexKind::RIGHT:
-  for(const auto & ind: index_info_->right_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
+  for(const auto & ind: index_info_->right_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
   break;
  case IndexKind::CONTR:
-  for(const auto & ind: index_info_->contr_indices_) dim_ext *= getTensorOperand(1)->getDimExtent(ind.arg_pos[1]);
+  for(const auto & ind: index_info_->contr_indices_) dim_ext *= tensor1.getDimExtent(ind.arg_pos[1]);
   break;
  case IndexKind::HYPER:
-  for(const auto & ind: index_info_->hyper_indices_) dim_ext *= getTensorOperand(0)->getDimExtent(ind.arg_pos[0]);
+  for(const auto & ind: index_info_->hyper_indices_) dim_ext *= tensor0.getDimExtent(ind.arg_pos[0]);
   break;
  default:
  assert(false);
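The refactoring above only caches the operand references; the logic is unchanged: each index group is collapsed into one extent by multiplying the extents of its member dimensions, and the combined LEFT, RIGHT and CONTR extents can be read as the m, n, k of the matricized GEMM behind the contraction, one GEMM per combination of HYPER indices. A small illustrative calculation (the contraction and its extents are made up, not taken from ExaTN):

    #include <cstdint>
    #include <iostream>

    int main(){
      // Hypothetical contraction C(i,j,a) += A(k,i,a) * B(k,j,a):
      // i is a LEFT index, j RIGHT, k CONTR, a HYPER.
      const std::uint64_t left = 64, right = 32, contr = 128, hyper = 8;
      std::cout << "GEMM dims per hyper-slice: m=" << left
                << " n=" << right << " k=" << contr << "\n";
      std::cout << "Total flops = " << hyper * 2 * left * right * contr
                << " (2*m*n*k per each of " << hyper << " slices)\n";
    }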
@@ -100,6 +102,7 @@ void TensorOpContract::determineNumBisections(unsigned int num_processes,
 std::size_t proc_count = num_processes;
 std::size_t mem_count = mem_per_process * num_processes;
+//Decides whether or not to replicate the tensor at a specific stage of decomposition:
 auto dfs = [](std::size_t mem_lim, std::size_t tens_size){
  return (tens_size > static_cast<std::size_t>(static_cast<double>(mem_lim)*replication_threshold));
 };
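The newly annotated dfs lambda implements a size test: a tensor is split further, rather than replicated, once its size exceeds replication_threshold times the memory limit. A minimal sketch of that decision in isolation; the threshold value here is assumed for illustration, since the real replication_threshold is defined elsewhere in ExaTN.

    #include <cstddef>
    #include <iostream>

    // Assumed value; ExaTN defines its own replication_threshold.
    static constexpr double replication_threshold = 0.5;

    // True if the tensor is too large to replicate within mem_lim bytes
    // and must therefore be decomposed (split) further:
    static bool must_split(std::size_t mem_lim, std::size_t tens_size){
      return tens_size >
             static_cast<std::size_t>(static_cast<double>(mem_lim) * replication_threshold);
    }

    int main(){
      const std::size_t mem_lim = 1ULL << 30; // 1 GiB per process
      std::cout << must_split(mem_lim, 100ULL << 20) << '\n'; // 100 MiB -> replicate (0)
      std::cout << must_split(mem_lim, 800ULL << 20) << '\n'; // 800 MiB -> split (1)
    }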
@@ -163,13 +166,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
 //Set the splitting depth for the left indices:
 unsigned int split_left = 0;
 if(bisect_left > 0){
+ const auto & tensor = *getTensorOperand(0);
  auto & indices = index_info_->left_indices_;
  int i = indices.size();
  auto n = bisect_left;
  while(n-- > 0){
   if(--i < 0) i = (indices.size() - 1);
   const unsigned int new_depth = indices[i].depth + 1;
-  if(getTensorOperand(0)->getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
+  if(tensor.getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
    if((indices[i].depth)++ == 0) ++split_left;
   }
  }
@@ -177,13 +181,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
 //Set the splitting depth for the right indices:
 unsigned int split_right = 0;
 if(bisect_right > 0){
+ const auto & tensor = *getTensorOperand(0);
  auto & indices = index_info_->right_indices_;
  int i = indices.size();
  auto n = bisect_right;
  while(n-- > 0){
   if(--i < 0) i = (indices.size() - 1);
   const unsigned int new_depth = indices[i].depth + 1;
-  if(getTensorOperand(0)->getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
+  if(tensor.getDimExtent(indices[i].arg_pos[0]) >= (1U << new_depth)){
    if((indices[i].depth)++ == 0) ++split_right;
   }
  }
@@ -191,13 +196,14 @@ void TensorOpContract::introduceOptTemporaries(unsigned int num_processes,
 //Set the splitting depth for the contracted indices:
 unsigned int split_contr = 0;
 if(bisect_contr > 0){
+ const auto & tensor = *getTensorOperand(1);
  auto & indices = index_info_->contr_indices_;
  int i = indices.size();
  auto n = bisect_contr;
  while(n-- > 0){
   if(--i < 0) i = (indices.size() - 1);
   const unsigned int new_depth = indices[i].depth + 1;
-  if(getTensorOperand(1)->getDimExtent(indices[i].arg_pos[1]) >= (1U << new_depth)){
+  if(tensor.getDimExtent(indices[i].arg_pos[1]) >= (1U << new_depth)){
    if((indices[i].depth)++ == 0) ++split_contr;