ORNL Quantum Computing Institute / exatn / Commits / ed6dc07f

Commit ed6dc07f authored Jan 05, 2022 by Dmitry I. Lyakh

Finished CuQuantum executor for one GPU per process, needs testing ...

Signed-off-by: Dmitry I. Lyakh <quant4me@gmail.com>

Parent: 6516d89e
Changes: 5 files
src/runtime/executor/cuquantum/cuquantum_executor.cu
 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -91,6 +91,9 @@ struct TensorNetworkReq {
   cudaEventDestroy(data_finish);
   cudaEventDestroy(data_start);
   cudaStreamDestroy(stream);
+  cutensornetDestroyContractionPlan(comp_plan);
+  cutensornetDestroyContractionOptimizerConfig(opt_config);
+  cutensornetDestroyContractionOptimizerInfo(opt_info);
   cutensornetDestroyNetworkDescriptor(net_descriptor);
   if(modes_out != nullptr) delete [] modes_out;
   if(strides_out != nullptr) delete [] strides_out;
@@ -105,8 +108,8 @@ struct TensorNetworkReq {
 };

-CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
- tensor_data_access_func_(std::move(tensor_data_access_func))
+CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func, unsigned int pipeline_depth):
+ tensor_data_access_func_(std::move(tensor_data_access_func)), pipe_depth_(pipeline_depth)
 {
  static_assert(std::is_same<cutensornetHandle_t,void*>::value, "#FATAL(exatn::runtime::CuQuantumExecutor): cutensornetHandle_t != (void*)");
@@ -118,6 +121,7 @@ CuQuantumExecutor::CuQuantumExecutor(TensorImplFunc tensor_data_access_func):
 for(int i = 0; i < num_gpus; ++i){
  if(talshDeviceState(i,DEV_NVIDIA_GPU) >= DEV_ON){
   gpu_attr_.emplace_back(std::make_pair(i,DeviceAttr{}));
+  gpu_attr_.back().second.pipe_level = 0;
   gpu_attr_.back().second.workspace_ptr = talsh::getDeviceBufferBasePtr(DEV_NVIDIA_GPU,i);
   assert(reinterpret_cast<std::size_t>(gpu_attr_.back().second.workspace_ptr) % MEM_ALIGNMENT == 0);
   gpu_attr_.back().second.buffer_size = talsh::getDeviceMaxBufferSize(DEV_NVIDIA_GPU,i);
@@ -257,6 +261,19 @@ cutensornetComputeType_t getCutensorComputeType(const TensorElementType elem_typ
 }

+void CuQuantumExecutor::acquireWorkspace(unsigned int dev, void ** workspace_ptr, uint64_t * workspace_size)
+{
+ assert(dev < gpu_attr_.size());
+ auto & dev_attr = gpu_attr_[dev].second;
+ *workspace_size = dev_attr.workspace_size / pipe_depth_;
+ *workspace_ptr = (void*)((char*)(dev_attr.workspace_ptr) + ((*workspace_size) * dev_attr.pipe_level));
+ dev_attr.pipe_level = (++(dev_attr.pipe_level)) % pipe_depth_;
+ return;
+}
+
 void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  const auto & net = *(tn_req->network);
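The new acquireWorkspace method is the core of the pipelining change: each request receives one of pipe_depth_ equal, disjoint slices of the per-GPU cuTensorNet scratch buffer, selected by a round-robin level counter, so that (by design) a second network can be planned and loaded while the previous one is still contracting. The following standalone sketch reproduces just that slicing arithmetic, with hypothetical names (DeviceWorkspace, acquire_slice) and an ordinary host buffer standing in for the GPU buffer; it compiles and runs without ExaTN or cuQuantum.

// Illustrative only: hypothetical names, host memory in place of the device buffer.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct DeviceWorkspace {
  char * base = nullptr;        // base of the scratch buffer reserved for cuTensorNet work
  uint64_t total_size = 0;      // total scratch bytes
  unsigned int pipe_level = 0;  // which pipeline slot the next request receives
};

// Round-robin slicing as in CuQuantumExecutor::acquireWorkspace: the buffer is cut
// into pipe_depth equal slices and successive requests cycle through them, so up to
// pipe_depth networks can be in flight without overwriting each other's workspace.
void acquire_slice(DeviceWorkspace & dw, unsigned int pipe_depth,
                   void ** slice_ptr, uint64_t * slice_size)
{
  assert(pipe_depth > 0);
  *slice_size = dw.total_size / pipe_depth;
  *slice_ptr = dw.base + (*slice_size) * dw.pipe_level;
  dw.pipe_level = (dw.pipe_level + 1) % pipe_depth;
}

int main() {
  constexpr unsigned int pipe_depth = 2;          // mirrors CUQUANTUM_PIPELINE_DEPTH = 2
  std::vector<char> buffer(1u << 20);             // 1 MiB stand-in for the device buffer
  DeviceWorkspace dw{buffer.data(), buffer.size(), 0};
  for (int request = 0; request < 4; ++request) { // four requests share two rotating slices
    void * ptr = nullptr;
    uint64_t size = 0;
    acquire_slice(dw, pipe_depth, &ptr, &size);
    std::printf("request %d -> offset %ld, %llu bytes\n", request,
                static_cast<long>(static_cast<char*>(ptr) - buffer.data()),
                static_cast<unsigned long long>(size));
  }
  return 0;
}

With pipe_depth = 2 the four requests alternate between offsets 0 and total_size/2, which is how two in-flight networks avoid clobbering each other's workspace.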
@@ -266,6 +283,7 @@ void CuQuantumExecutor::parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_
 tn_req->strides_in = new int64_t*[num_input_tensors];
 tn_req->modes_in = new int32_t*[num_input_tensors];
 tn_req->alignments_in = new uint32_t[num_input_tensors];
 tn_req->data_in = new void*[num_input_tensors];
 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->strides_in[i] = NULL;
 for(unsigned int i = 0; i < num_input_tensors; ++i) tn_req->alignments_in[i] = MEM_ALIGNMENT;
@@ -370,7 +388,21 @@ void CuQuantumExecutor::loadTensors(std::shared_ptr<TensorNetworkReq> tn_req)
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->data_finish,tn_req->stream));
   tn_req->memory_window_ptr.emplace_back(mem_pool_[gpu].getFront());
+  auto & net = *(tn_req->network);
+  int32_t tens_num = 0;
+  for(auto iter = net.cbegin(); iter != net.cend(); ++iter){
+   const auto tens_id = iter->first;
+   const auto & tens = iter->second;
+   const auto tens_hash = tens.getTensor()->getTensorHash();
+   auto descr = tn_req->tensor_descriptors.find(tens_hash);
+   void * dev_ptr = descr->second.dst_ptr.back();
+   if(tens_id == 0){
+    tn_req->data_out = dev_ptr;
+   }else{
+    tn_req->data_in[tens_num++] = dev_ptr;
+   }
+  }
  }else{ //no enough memory currently
   //Restore previous memory front:
   mem_pool_[gpu].restorePreviousFront(prev_front);
   return;
@@ -385,16 +417,22 @@ void CuQuantumExecutor::planExecution(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  //Configure tensor network contraction on one or all GPUs:
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
   HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerConfig(gpu_attr_[gpu].second.cutn_handle,
                                                                &(tn_req->opt_config)));
   HANDLE_CTN_ERROR(cutensornetCreateContractionOptimizerInfo(gpu_attr_[gpu].second.cutn_handle,
                                                              tn_req->net_descriptor,
                                                              &(tn_req->opt_info)));
-  tn_req->worksize = gpu_attr_[gpu].second.workspace_size / PIPELINE_DEPTH;
-  tn_req->workspace = gpu_attr_[gpu].second.workspace_ptr; //`Need moving window (pipelining)
+  acquireWorkspace(gpu,&(tn_req->workspace),&(tn_req->worksize));
   HANDLE_CTN_ERROR(cutensornetContractionOptimize(gpu_attr_[gpu].second.cutn_handle,
                                                   tn_req->net_descriptor,
                                                   tn_req->opt_config,
                                                   tn_req->worksize,
                                                   tn_req->opt_info));
   HANDLE_CTN_ERROR(cutensornetCreateContractionPlan(gpu_attr_[gpu].second.cutn_handle,
                                                     tn_req->net_descriptor,
                                                     tn_req->opt_info,
                                                     tn_req->worksize,
                                                     &(tn_req->comp_plan)));
   double flops = 0.0;
   HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
                                                                    tn_req->opt_info,
                                                                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                                                                    &flops,sizeof(flops)));
  }
  tn_req->exec_status = TensorNetworkQueue::ExecStat::Planning;
  return;
@@ -406,20 +444,20 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
  //Execute the contraction plan on one or all GPUs:
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
   HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   int64_t num_slices = 0;
   HANDLE_CTN_ERROR(cutensornetContractionOptimizerInfoGetAttribute(gpu_attr_[gpu].second.cutn_handle,
                                                                    tn_req->opt_info,
                                                                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                    &num_slices,sizeof(num_slices)));
   assert(num_slices > 0);
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_start,tn_req->stream));
   for(int64_t slice_id = 0; slice_id < num_slices; ++slice_id){
    HANDLE_CTN_ERROR(cutensornetContraction(gpu_attr_[gpu].second.cutn_handle,
                                            tn_req->comp_plan,
                                            tn_req->data_in,tn_req->data_out,
                                            tn_req->workspace,tn_req->worksize,
                                            slice_id,tn_req->stream));
   }
   HANDLE_CUDA_ERROR(cudaEventRecord(tn_req->compute_finish,tn_req->stream));
  }
@@ -431,13 +469,21 @@ void CuQuantumExecutor::contractTensorNetwork(std::shared_ptr<TensorNetworkReq>
 void CuQuantumExecutor::testCompletion(std::shared_ptr<TensorNetworkReq> tn_req)
 {
  //Test work completion on one or all GPUs:
  bool all_completed = true;
  for(int gpu = 0; gpu < 1; ++gpu){ //`Only one GPU for now
   const auto gpu_id = gpu_attr_[gpu].first;
   HANDLE_CUDA_ERROR(cudaSetDevice(gpu_id));
   cudaError_t cuda_error = cudaEventQuery(tn_req->compute_finish);
   if(cuda_error == cudaSuccess){
-   //`Move MemPool back forward using tn_req->memory_window_ptr
-   tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
+   if(tn_req->memory_window_ptr[gpu] != nullptr){
+    mem_pool_[gpu].releaseMemory(tn_req->memory_window_ptr[gpu]);
+    tn_req->memory_window_ptr[gpu] = nullptr;
+   }
   }else{
    all_completed = false;
   }
  }
+ if(all_completed) tn_req->exec_status = TensorNetworkQueue::ExecStat::Completed;
  return;
 }
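testCompletion relies on cudaEventQuery as a non-blocking probe: compute_finish was recorded on the request's stream after the last contraction slice, so cudaSuccess means the whole network has finished, while cudaErrorNotReady means it is still in flight. Below is a minimal, self-contained CUDA illustration of that probe pattern; it uses only plain CUDA runtime calls, nothing ExaTN-specific, and the buffer size and names are arbitrary.

// Illustrative only: plain CUDA runtime calls, independent of ExaTN.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  cudaStream_t stream;
  cudaEvent_t finish;
  if (cudaStreamCreate(&stream) != cudaSuccess) return 1;
  if (cudaEventCreate(&finish) != cudaSuccess) return 1;

  std::vector<float> host(1 << 20, 1.0f);
  float * dev = nullptr;
  if (cudaMalloc((void**)&dev, host.size() * sizeof(float)) != cudaSuccess) return 1;

  // Enqueue asynchronous work on the stream, then record an event behind it,
  // as loadTensors/contractTensorNetwork do with data_finish/compute_finish.
  cudaMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cudaEventRecord(finish, stream);

  // Non-blocking completion probe, as in testCompletion(): cudaSuccess means the
  // event has been reached, cudaErrorNotReady means work is still in flight.
  int polls = 0;
  while (cudaEventQuery(finish) == cudaErrorNotReady) ++polls;
  std::printf("stream work completed after %d polls\n", polls);

  cudaFree(dev);
  cudaEventDestroy(finish);
  cudaStreamDestroy(stream);
  return 0;
}

The same pattern is what lets the executor release the request's memory-pool window and mark the request Completed without blocking the host thread.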
src/runtime/executor/cuquantum/cuquantum_executor.hpp
 /** ExaTN: Tensor Runtime: Tensor network executor: NVIDIA cuQuantum
-REVISION: 2022/01/04
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -40,7 +40,8 @@ class CuQuantumExecutor {
 public:

-CuQuantumExecutor(TensorImplFunc tensor_data_access_func);
+CuQuantumExecutor(TensorImplFunc tensor_data_access_func,
+                  unsigned int pipeline_depth);

 CuQuantumExecutor(const CuQuantumExecutor &) = delete;
 CuQuantumExecutor & operator=(CuQuantumExecutor &) = delete;
@@ -65,10 +66,13 @@ public:
 protected:

-static constexpr float WORKSPACE_FRACTION = 0.2;
-static constexpr unsigned int PIPELINE_DEPTH = 1;
+static constexpr float WORKSPACE_FRACTION = 0.6;
 static constexpr std::size_t MEM_ALIGNMENT = 256;

+void acquireWorkspace(unsigned int dev, void ** workspace_ptr, uint64_t * workspace_size);
+
 void parseTensorNetwork(std::shared_ptr<TensorNetworkReq> tn_req);
 void loadTensors(std::shared_ptr<TensorNetworkReq> tn_req);
 void planExecution(std::shared_ptr<TensorNetworkReq> tn_req);
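For a sense of scale (an illustrative calculation only, since the derivation of workspace_size from the TAL-SH buffer via WORKSPACE_FRACTION is not shown in this diff): if a GPU ends up with, say, 8 GB of cuTensorNet workspace after the 0.6 fraction is applied, then with a pipeline depth of 2 each in-flight tensor network receives a 4 GB slice from acquireWorkspace, and the two slices rotate as successive requests are planned.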
@@ -80,6 +84,7 @@ protected:
 std::size_t buffer_size = 0;
 void * workspace_ptr = nullptr;
 std::size_t workspace_size = 0;
+unsigned int pipe_level = 0;
 void * cutn_handle; //cutensornetHandle_t = void*
 };
@@ -91,6 +96,8 @@ protected:
 std::vector<LinearMemoryPool> mem_pool_;
 /** Tensor data access function **/
 TensorImplFunc tensor_data_access_func_; //numerics::Tensor --> {tensor_body_ptr, size_in_bytes}
+/** Pipeline depth **/
+const unsigned int pipe_depth_;
 };

 } //namespace runtime
src/runtime/executor/cuquantum/tensor_network_queue.hpp
 /** ExaTN: Tensor Runtime: Tensor network executor: Execution queue
-REVISION: 2021/12/30
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:
 - ExaTN graph executor may accept whole tensor networks for execution
@@ -144,7 +144,7 @@ public:
   return current_network_;
  }

- /** Returns the current iterator to the beginning of the queue. **/
+ /** Resets the current iterator to the beginning of the queue. **/
 void reset() {
  lock();
  current_network_ = networks_.begin();
@@ -173,6 +173,12 @@ public:
   return not_over;
  }

+ /** Returns the distance from the current iterator
+     to the beginning of the queue. **/
+ auto getCurrentPos() {
+  return std::distance(networks_.begin(),current_network_);
+ }
+
 /** Locks. **/
 inline void lock(){queue_lock_.lock();}
 inline void unlock(){queue_lock_.unlock();}
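getCurrentPos() is what lets the lazy executor cap the number of in-flight networks: it measures how far the current iterator has advanced from the head of the queue. A tiny standalone illustration of the same std::distance idiom follows; the queue's internal container type is not shown in this diff, so a std::list of integer pairs is used as a stand-in.

// Illustrative only: std::list stands in for the queue's internal container.
#include <iostream>
#include <iterator>
#include <list>
#include <utility>

int main() {
  std::list<std::pair<int,int>> networks = {{10,0}, {11,1}, {12,2}};
  auto current = networks.begin();
  ++current; // advance one element, as next() would

  // std::distance walks from begin() to the current iterator; linear for
  // std::list, which is harmless when compared against a pipeline depth of 2.
  auto pos = std::distance(networks.begin(), current);
  std::cout << "current position in queue: " << pos << std::endl; // prints 1
  return 0;
}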
src/runtime/executor/graph_executors/lazy/graph_executor_lazy.cpp
 /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2022/01/03
+REVISION: 2022/01/05

 Copyright (C) 2018-2022 Dmitry Lyakh
 Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)
@@ -35,7 +35,8 @@ void LazyGraphExecutor::resetNodeExecutor(std::shared_ptr<TensorNodeExecutor> no
  [this](const numerics::Tensor & tensor, int device_kind, int device_id, std::size_t * size){
   void * data_ptr = this->node_executor_->getTensorImage(tensor,device_kind,device_id,size);
   return data_ptr;
-  });
+  },
+  CUQUANTUM_PIPELINE_DEPTH);
 }
 #endif
@@ -277,7 +278,7 @@ void LazyGraphExecutor::execute(TensorGraph & dag) {
 void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
 #ifdef CUQUANTUM
  std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Started executing the tensor network queue via cuQuantum: "
-           << tensor_network_queue.getSize() << " elements detected" << std::endl;
+           << tensor_network_queue.getSize() << " networks detected" << std::endl;
  assert(node_executor_);
  //Synchronize the node executor:
  bool synced = node_executor_->sync(); assert(synced);
@@ -287,26 +288,35 @@ void LazyGraphExecutor::execute(TensorNetworkQueue & tensor_network_queue) {
 tensor_network_queue.reset();
 bool not_over = !tensor_network_queue.isOver();
 while(not_over){
-  int error_code = 0;
-  const auto current = tensor_network_queue.getCurrent();
-  const auto exec_handle = current->second;
-  auto exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
-  assert(error_code == 0);
-  if(exec_stat == TensorNetworkQueue::ExecStat::None){
-   exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
-   if(exec_stat != TensorNetworkQueue::ExecStat::None){
-    auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
-    std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Submitted tensor network to cuQuantum\n";
-   }
-   not_over = tensor_network_queue.next();
-  }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
-   auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
-   tensor_network_queue.remove();
-   std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
-   not_over = !tensor_network_queue.isOver();
-  }else{
-   auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
-   not_over = tensor_network_queue.next();
-  }
+  const auto current_pos = tensor_network_queue.getCurrentPos();
+  if(current_pos < CUQUANTUM_PIPELINE_DEPTH){
+   const auto current = tensor_network_queue.getCurrent();
+   const auto exec_handle = current->second;
+   int error_code = 0;
+   auto exec_stat = tensor_network_queue.checkExecStatus(exec_handle);
+   if(exec_stat == TensorNetworkQueue::ExecStat::Idle || current_pos == 0){
+    exec_stat = cuquantum_executor_->sync(exec_handle,&error_code); //this call will progress tensor network execution
+    assert(error_code == 0);
+   }
+   if(exec_stat == TensorNetworkQueue::ExecStat::None){
+    exec_stat = cuquantum_executor_->execute(current->first,exec_handle);
+    if(exec_stat != TensorNetworkQueue::ExecStat::None){
+     auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+     std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Submitted tensor network to cuQuantum\n";
+    }
+    not_over = tensor_network_queue.next();
+   }else if(exec_stat == TensorNetworkQueue::ExecStat::Completed){
+    auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+    assert(current_pos == 0);
+    tensor_network_queue.remove();
+    std::cout << "#DEBUG(exatn::runtime::LazyGraphExecutor::execute): Completed tensor network execution via cuQuantum\n";
+    not_over = !tensor_network_queue.isOver();
+   }else{
+    auto prev_exec_stat = tensor_network_queue.updateExecStatus(exec_handle,exec_stat);
+    not_over = tensor_network_queue.next();
+   }
+  }else{
+   not_over = false;
+  }
 }
 }
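The rewritten loop above turns execute() into a pipelined drain of the queue: it only looks at the first CUQUANTUM_PIPELINE_DEPTH entries, submits those that have not yet been handed to the CuQuantumExecutor, polls the head (and Idle entries) to make progress, and removes an entry only once it completes at the head. The mock below imitates that control flow with a FakeExecutor so it can be compiled and run standalone; every name in it is a hypothetical stand-in, not the ExaTN API.

// Illustrative only: a self-contained mock of the pipelined draining pass.
#include <cstdio>
#include <deque>
#include <set>
#include <string>
#include <utility>

enum class ExecStat { None, Idle, Submitted, Completed };

struct FakeExecutor {
  std::set<int> submitted;
  int ticks = 0;
  // Mimics cuquantum_executor_->sync(): None for handles never submitted,
  // otherwise progresses execution and eventually reports Completed.
  ExecStat sync(int handle) {
    if (submitted.count(handle) == 0) return ExecStat::None;
    return (++ticks % 3 == 0) ? ExecStat::Completed : ExecStat::Submitted;
  }
  ExecStat execute(const std::string & name, int handle) {
    submitted.insert(handle);
    std::printf("  submitted %s\n", name.c_str());
    return ExecStat::Submitted;
  }
};

// One pass over the queue head: keep at most `depth` networks in flight,
// submit unsubmitted ones, poll the head, and pop it once completed.
void drain_pass(std::deque<std::pair<std::string,int>> & queue,
                std::deque<ExecStat> & status, FakeExecutor & executor, int depth)
{
  bool not_over = !queue.empty();
  std::size_t pos = 0;
  while (not_over) {
    if (pos < static_cast<std::size_t>(depth) && pos < queue.size()) {
      auto exec_stat = status[pos];
      if (exec_stat == ExecStat::Idle || pos == 0)       // progress the head or idle entries
        exec_stat = executor.sync(queue[pos].second);
      if (exec_stat == ExecStat::None) {                 // not submitted yet: submit it
        exec_stat = executor.execute(queue[pos].first, queue[pos].second);
        status[pos] = exec_stat;
        ++pos;
      } else if (exec_stat == ExecStat::Completed) {     // only the head can complete
        std::printf("  completed %s\n", queue[pos].first.c_str());
        queue.pop_front(); status.pop_front();
        pos = 0;
      } else {
        status[pos] = exec_stat;
        ++pos;
      }
      not_over = pos < queue.size();
    } else {
      not_over = false;                                  // pipeline window exhausted
    }
  }
}

int main() {
  std::deque<std::pair<std::string,int>> queue = {{"net0",0},{"net1",1},{"net2",2}};
  std::deque<ExecStat> status(queue.size(), ExecStat::None);
  FakeExecutor executor;
  int pass = 0;
  while (!queue.empty() && pass < 100) {                 // the lazy executor calls this repeatedly
    std::printf("pass %d\n", pass++);
    drain_pass(queue, status, executor, /*depth=*/2);
  }
  return 0;
}

Running it shows at most two networks submitted at a time, with completion always reported for the head of the queue, which mirrors the depth-2 pipelining enabled by this commit.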
src/runtime/executor/graph_executors/lazy/graph_executor_lazy.hpp
 /** ExaTN:: Tensor Runtime: Tensor graph executor: Lazy
-REVISION: 2021/12/22
+REVISION: 2022/01/05

-Copyright (C) 2018-2021 Dmitry Lyakh, Alex McCaskey
-Copyright (C) 2018-2021 Oak Ridge National Laboratory (UT-Battelle)
+Copyright (C) 2018-2022 Dmitry Lyakh, Alex McCaskey
+Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

 Rationale:
@@ -26,6 +26,9 @@ public:
 static constexpr const unsigned int DEFAULT_PIPELINE_DEPTH = 16;
 static constexpr const unsigned int DEFAULT_PREFETCH_DEPTH = 4;
+#ifdef CUQUANTUM
+ static constexpr const unsigned int CUQUANTUM_PIPELINE_DEPTH = 2;
+#endif

 LazyGraphExecutor(): pipeline_depth_(DEFAULT_PIPELINE_DEPTH), prefetch_depth_(DEFAULT_PREFETCH_DEPTH)