/** ExaTN:: Tensor Runtime: Tensor graph node executor: Talsh
REVISION: 2022/01/17

Copyright (C) 2018-2022 Dmitry Lyakh, Tiffany Mintz, Alex McCaskey
Copyright (C) 2018-2022 Oak Ridge National Laboratory (UT-Battelle)

Rationale:

**/

#ifndef EXATN_RUNTIME_TALSH_NODE_EXECUTOR_HPP_
#define EXATN_RUNTIME_TALSH_NODE_EXECUTOR_HPP_

#include "tensor_node_executor.hpp"

#include "talshxx.hpp"

#include <unordered_map>
#include <vector>
#include <list>
#include <memory>
#include <atomic>

namespace exatn {
namespace runtime {

class TalshNodeExecutor : public TensorNodeExecutor {

public:

  static constexpr const std::size_t DEFAULT_MEM_BUFFER_SIZE = 2UL * 1024UL * 1024UL * 1024UL; //bytes
  static constexpr const int ALLREDUCE_CHUNK_SIZE = 64 * 1024 * 1024; //elements

  TalshNodeExecutor(): max_tensor_rank_(-1), prefetch_enabled_(true), dry_run_(false) {}

  TalshNodeExecutor(const TalshNodeExecutor &) = delete;
  TalshNodeExecutor & operator=(const TalshNodeExecutor &) = delete;
  TalshNodeExecutor(TalshNodeExecutor &&) noexcept = delete;
  TalshNodeExecutor & operator=(TalshNodeExecutor &&) noexcept = delete;

  virtual ~TalshNodeExecutor();

  void initialize(const ParamConf & parameters) override;

  void activateDryRun(bool dry_run) override;

  void activateFastMath() override;

  std::size_t getMemoryBufferSize() const override;

  std::size_t getMemoryUsage(std::size_t * free_mem) const override;

  double getTotalFlopCount() const override;

  int execute(numerics::TensorOpCreate & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpDestroy & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpTransform & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpSlice & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpInsert & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpAdd & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpContract & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpDecomposeSVD3 & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpDecomposeSVD2 & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpOrthogonalizeSVD & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpOrthogonalizeMGS & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpFetch & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpUpload & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpBroadcast & op,
              TensorOpExecHandle * exec_handle) override;
  int execute(numerics::TensorOpAllreduce & op,
              TensorOpExecHandle * exec_handle) override;

  bool sync(TensorOpExecHandle op_handle,
            int * error_code,
            bool wait = true) override;

  bool sync() override;

  bool discard(TensorOpExecHandle op_handle) override;

  bool prefetch(const numerics::TensorOperation & op) override;

  void clearCache() override;

  /** Returns a locally stored slice copy of a tensor, or nullptr if no RAM. **/
  std::shared_ptr<talsh::Tensor> getLocalTensor(const numerics::Tensor & tensor,
                 const std::vector<std::pair<DimOffset,DimExtent>> & slice_spec) override;
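  // Usage sketch (hypothetical, assuming an executor instance `exec` and a defined
  // exatn::numerics::Tensor `tensor`): request a local copy of the slice that starts
  // at offset 0 in each of two dimensions and spans extents 2 and 3, respectively:
  //
  //   std::shared_ptr<talsh::Tensor> slice = exec.getLocalTensor(tensor, {{0,2},{0,3}});
  //   if(slice) { /* inspect the slice copy */ }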

  /** Returns a non-owning pointer to a local tensor data image on a given device.
      If unsuccessful, returns nullptr. **/
  void * getTensorImage(const numerics::Tensor & tensor,        //in: tensor
                        int device_kind,                        //in: device kind (implementation specific)
                        int device_id,                          //in: device id: [0,1,2,..]
                        std::size_t * size = nullptr) const override; //out: tensor data image size in bytes
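  // Usage sketch (hypothetical, assuming an executor instance `exec` and a defined
  // exatn::numerics::Tensor `tensor`): obtain a non-owning pointer to the tensor body
  // image residing on Host, together with its size in bytes:
  //
  //   std::size_t image_size = 0;
  //   void * image = exec.getTensorImage(tensor, DEV_HOST, 0, &image_size);
  //   if(image != nullptr) { /* image_size bytes of raw tensor data */ }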

  /** Finishes tensor operand prefetching for a given tensor operation. **/
  bool finishPrefetching(const numerics::TensorOperation & op); //in: tensor operation

  /** Caches TAL-SH tensor body images moved/copied to accelerators.  **/
  void cacheMovedTensors(talsh::TensorTask & talsh_task); //in: TAL-SH task associated with the tensor operation

  /** Evicts some or all idle cached TAL-SH tensor body images
      from accelerator(s), moving them back to Host. Returns whether
      at least one such tensor image has been found. **/
  bool evictMovedTensors(int device_id = DEV_DEFAULT,     //in: flat device id (TAL-SH numeration), DEV_DEFAULT covers all accelerators, DEV_HOST has no effect
                         std::size_t required_space = 0); //in: required space to free in bytes, 0 will evict all idle tensor images on the chosen device(s)
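  // Usage sketch (hypothetical): free at least 1 GiB of accelerator memory by evicting
  // idle cached tensor images from all accelerators back to Host:
  //
  //   bool found = exec.evictMovedTensors(DEV_DEFAULT, 1UL*1024UL*1024UL*1024UL);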

  const std::string name() const override {return "talsh-node-executor";}
  const std::string description() const override {return "TALSH tensor graph node executor";}
  std::shared_ptr<TensorNodeExecutor> clone() override {return std::make_shared<TalshNodeExecutor>();}

protected:

  /** Determines whether a given TAL-SH tensor is currently participating
      in an active tensor operation, tensor prefetch or tensor eviction. **/
  bool tensorIsCurrentlyInUse(const talsh::Tensor * talsh_tens) const;

  struct TensorImpl{
    //TAL-SH tensor with reduced shape (all extent-1 tensor dimensions removed):
    std::unique_ptr<talsh::Tensor> talsh_tensor;
    //The original full tensor signature (dimension base offsets):
    std::vector<std::size_t> full_base_offsets;
    //The reduced tensor signature (dimension base offsets):
    std::vector<std::size_t> reduced_base_offsets;
    //The original full tensor shape:
    talsh_tens_shape_t * stored_shape;
    //Flag which tensor shape is currently in use by the TAL-SH tensor:
    bool full_shape_is_on;
    //Lifecycle:
    TensorImpl(const std::vector<std::size_t> & full_offsets,    //full tensor signature
               const std::vector<DimExtent> & full_extents,      //full tensor shape
               const std::vector<std::size_t> & reduced_offsets, //reduced tensor signature
               const std::vector<int> & reduced_extents,         //reduced tensor shape
               int data_kind);                                   //TAL-SH tensor data kind
    TensorImpl(const TensorImpl &) = delete;
    TensorImpl & operator=(const TensorImpl &) = delete;
    TensorImpl(TensorImpl &&) noexcept;
    TensorImpl & operator=(TensorImpl &&) noexcept;
    ~TensorImpl();
    //Resets TAL-SH tensor shape between full and reduced, depending on the operation needs:
    void resetTensorShapeToFull();
    void resetTensorShapeToReduced();
  };

  struct CachedAttr{
    double last_used; //time stamp of last usage of the cached tensor image
  };

  /** Maps generic exatn::numerics::Tensor to its TAL-SH implementation **/
  std::unordered_map<numerics::TensorHashType,TensorImpl> tensors_;
  /** Active execution handles associated with tensor operations currently executed by TAL-SH **/
  std::unordered_map<TensorOpExecHandle,std::shared_ptr<talsh::TensorTask>> tasks_;
  /** Active tasks prefetching tensor operands to accelerators **/
  std::unordered_map<numerics::TensorHashType,std::shared_ptr<talsh::TensorTask>> prefetches_;
  /** Active tasks evicting tensor images from accelerators **/
  std::unordered_map<talsh::Tensor*,std::shared_ptr<talsh::TensorTask>> evictions_;
  /** Register (cache) of tensors with body images moved/copied to accelerators **/
  std::unordered_map<talsh::Tensor*,CachedAttr> accel_cache_[DEV_MAX]; //cache for each device
  /** Active MPI requests for non-blocking two-sided messages **/
  std::unordered_map<TensorOpExecHandle,std::list<void*>> mpi_requests_; //owning pointers
  /** Max encountered actual tensor rank **/
  int max_tensor_rank_;
  /** Prefetching enabled flag **/
  bool prefetch_enabled_;
  /** Dry run (no actual computations) **/
  std::atomic<bool> dry_run_;
  /** TAL-SH Host memory buffer size (bytes) **/
  static std::atomic<std::size_t> talsh_host_mem_buffer_size_;
  /** TAL-SH submitted Flop count **/
  static std::atomic<double> talsh_submitted_flops_;
  /** TAL-SH initialization status **/
  static std::atomic<bool> talsh_initialized_;
  /** Number of instances of TAL-SH node executors **/
  static std::atomic<int> talsh_node_exec_count_;
};

/** ExaTN tensor element kind --> TAL-SH tensor element kind converter **/
inline int get_talsh_tensor_element_kind(TensorElementType element_type)
{
  int talsh_data_kind = NO_TYPE;
  switch(element_type){
    case TensorElementType::REAL32: talsh_data_kind = talsh::REAL32; break;
    case TensorElementType::REAL64: talsh_data_kind = talsh::REAL64; break;
    case TensorElementType::COMPLEX32: talsh_data_kind = talsh::COMPLEX32; break;
    case TensorElementType::COMPLEX64: talsh_data_kind = talsh::COMPLEX64; break;
  }
  return talsh_data_kind;
}

/** TAL-SH tensor element kind --> ExaTN tensor element kind converter **/
inline TensorElementType get_exatn_tensor_element_kind(int element_type)
{
  switch(element_type){
    case talsh::REAL32: return TensorElementType::REAL32;
    case talsh::REAL64: return TensorElementType::REAL64;
    case talsh::COMPLEX32: return TensorElementType::COMPLEX32;
    case talsh::COMPLEX64: return TensorElementType::COMPLEX64;
  }
  return TensorElementType::VOID;
}
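
/** Example (illustrative sketch, not part of the ExaTN API): the two converters above
    are mutually inverse for the supported element kinds, so a round trip through the
    TAL-SH kind and back must reproduce the original ExaTN element kind. **/
inline bool element_kind_conversion_is_consistent(TensorElementType element_type)
{
  const int talsh_kind = get_talsh_tensor_element_kind(element_type);
  return (talsh_kind != NO_TYPE) &&
         (get_exatn_tensor_element_kind(talsh_kind) == element_type);
}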

} //namespace runtime
} //namespace exatn

#endif //EXATN_RUNTIME_TALSH_NODE_EXECUTOR_HPP_