TECA
The Toolkit for Extreme Climate Analysis
teca_cuda_util.h
Go to the documentation of this file.
1 #ifndef teca_cuda_util_h
2 #define teca_cuda_util_h
3 
4 /// @file
5 
6 #include "teca_config.h"
7 #include "teca_common.h"
8 #include "teca_mpi.h"
9 
10 #include <deque>
11 #include <vector>
12 #include <tuple>
13 
14 #include <cuda.h>
15 #include <cuda_runtime.h>
16 
17 
18 /// A collection of utility classes and functions for integrating with CUDA
19 namespace teca_cuda_util
20 {
21 /** a wrapper for using dynamic sized shared memory in a template function.
22  */
23 template <typename T>
24 __device__ T* shared_memory_proxy()
25 {
26  extern __shared__ unsigned char memory[];
27  return reinterpret_cast<T*>(memory);
28 }
29 
30 /** Query the system for the locally available(on this rank) CUDA device count.
31  * this is an MPI collective call which returns a set of device ids that can be
32  * used locally. Node wide coordination assures that one can put a limit on the
33  * number of ranks per node.
34  *
35  * @param[in] comm MPI communicator defining a set of nodes on which need
36  * access to the available GPUS
37  * @param[in,out] ranks_per_device The number of MPI ranks to use per CUDA
38  * device. When set to 0 no GPUs are used. When
39  * set to -1 all ranks are assigned a GPU but
40  * multiple ranks will share a GPU when there
41  * are more ranks than devices.
42  *
43  * @param[out] local_dev a list of device ids that can be used by the calling
44  * MPI rank.
45  * @returns non-zero on error.
46  */
48 int get_local_cuda_devices(MPI_Comm comm, int &ranks_per_device,
49  std::vector<int> &local_dev);
50 
51 /// set the CUDA device. returns non-zero on error
53 int set_device(int device_id);
54 
55 /// device wide synchronize
58 
59 /// synchronize the default stream
62 
63 /** A flat array is broken into blocks of number of threads where each adjacent
64  * thread accesses adjacent memory locations. To accomplish this we might need
65  * a large number of blocks. If the number of blocks exceeds the max block
66  * dimension in the first and or second block grid dimension then we need to
67  * use a 2d or 3d block grid.
68  *
69  * ::partition_thread_blocks - decides on a partitioning of the data based on
70  * warps_per_block parameter. The resulting decomposition will be either 1,2,
71  * or 3D as needed to accommodate the number of fixed sized blocks. It can
72  * happen that max grid dimensions are hit, in which case you'll need to
73  * increase the number of warps per block.
74  *
75  * ::thread_id_to_array_index - given a thread and block id gets the
76  * array index to update. _this may be out of bounds so be sure
77  * to validate before using it.
78  *
79  * ::index_is_valid - test an index for validity.
80 */
81 /// @name CUDA indexing scheme
82 ///@{
83 
84 /** convert a CUDA index into a flat array index using the partitioning scheme
85  * defined in ::partition_thread_blocks
86  */
87 inline
88 __device__
89 unsigned long thread_id_to_array_index()
90 {
91  return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
92  + blockIdx.z * gridDim.x * gridDim.y);
93 }
94 
95 /** convert a CUDA index into a flat array index using the partitioning scheme
96  * defined in ::partition_thread_blocks_slab. This gives the index of the first
97  * element in the vertical column.
98  */
99 inline
100 __device__
101 void thread_id_to_array_index_slab(unsigned long &i, unsigned long &k0,
102  unsigned long stride)
103 {
104  // index in the xy slab
105  i = threadIdx.x + blockDim.x * blockIdx.x;
106 
107  // first index in the vertical dimension
108  k0 = stride * blockIdx.y;
109 }
110 
111 /// bounds check the flat index
112 inline
113 __device__
114 int index_is_valid(unsigned long index, unsigned long max_index)
115 {
116  return index < max_index;
117 }
118 
119 /** query properties for the named CUDA device.
120  * @param[in] device_id The device to query, or -1 for the active device.
121  * @param[out] block_grid_max a 3 value array of the maximum number of thread
122  * blocks supported in x,y, and z directions.
123  * @param[out] warp_size the number of threads per warp
124  * @param[out] max_warps_per_block the maximum number of warps per block
125  * supported.
126  * @returns non-zero on error.
127  */
129 int get_launch_props(int device_id,
130  int *block_grid_max, int &warp_size,
131  int &max_warps_per_block);
132 
133 /** Calculate CUDA launch parameters for an arbitrarily large flat array. The
134  * block grid will be 1d if the device can process the array using a 1d block
135  * grid, otherwise dimensions are added to accomodate the array size up to the
136  * largest grid supported by the device. Use ::thread_id_to_array_index in the
137  * kernel to determine the array index to process.
138  *
139  * @param[in] device_id the CUDA device to query launch parameter limits from.
140  * Use -1 to query from the currently active device.
141  * @param[in] array_size the length of the array being processed
142  * @param[in] warps_per_block the number of warps to use per block (your choice)
143  * @param[out] block_grid the block dimension kernel launch control
144  * @param[out] n_blocks the number of blocks
145  * @param[out] thread_grid the thread dimension kernel launch control
146  *
147  * @returns zero if successful and non-zero if an error occurred
148  */
150 int partition_thread_blocks(int device_id, size_t array_size,
151  int warps_per_block, dim3 &block_grid, int &n_blocks,
152  dim3 &thread_grid);
153 
154 /** Calculate CUDA launch parameters for an arbitrarily large flat array. The
155  * block grid will be 1d if the device can process the array using a 1d block
156  * grid, otherwise dimensions are added to accomodate the array size up to the
157  * largest grid supported by the device. Use ::thread_id_to_array_index in the
158  * kernel to determine the array index to process. See ::get_launch_props for
159  * how to query CUDA for block_grid_max and warp_size parameters.
160  *
161  * @param[in] array_size the length of the array being processed
162  * @param[in] warps_per_block the number of warps to use per block (your choice)
163  * @param[in] warp_size the number of threads per warp
164  * @param[in] block_grid_max the maximum number of blocks in the 3D block
165  * grid supported by the CUDA device
166  * @param[out] block_grid the block dimension kernel launch control parameter
167  * @param[out] n_blocks the number of blocks
168  * @param[out] thread_grid the thread dimension kernel launch control parameter
169  *
170  * @returns zero if successful and non-zero if an error occurred
171  */
173 int partition_thread_blocks(size_t array_size,
174  int warps_per_block, int warp_size, int *block_grid_max,
175  dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
176 
177 /** Calculate CUDA launch parameters for an arbitrarily large 3D array that
178  * will be processed by looping over vertical the vertical dimension.
179  * Partitioning in the first dimension occurs over x-y slab sized sections of
180  * the array. In the second dimension the caller declares the number of
181  * elements (stride) desired in vertical dimension loops. A 2d block grid will
182  * be generated up to the limits of the selected device. Use
183  * ::thread_id_to_array_index_slab in the kernel to determine the array index
184  * to process.
185  *
186  * @param[in] device_id the CUDA device to query launch parameter limits from.
187  * Use -1 to query from the currently active device.
188  * @param[in] nxy the size in the xy dimension of the array being processed
189  * @param[in] nz the size in the vertical dimension of the array being processed
190  * @param[in] warps_per_block the number of warps to use per block (your choice)
191  * @param[out] block_grid the block dimension kernel launch control
192  * @param[out] n_blocks_xy the number of nxy sized blocks
193  * @param[out] n_blocks_z the number of blocks in the vertical dimension
194  * @param[out] thread_grid the thread dimension kernel launch control
195  *
196  * @returns zero if successful and non-zero if an error occurred
197  */
199 int partition_thread_blocks_slab(int device_id, size_t nxy, size_t nz,
200  size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy,
201  int &n_blocks_z, dim3 &thread_grid);
202 
203 /** Calculate CUDA launch parameters for an arbitrarily large 3D array that
204  * will be processed by looping over vertical the vertical dimension.
205  * Partitioning in the first dimension occurs over x-y slab sized sections of
206  * the array. In the second dimension the caller declares the number of
207  * elements (stride) desired in vertical dimension loops. Kernels will compute
208  * the loop bounds from the stride and block index. A 2d block grid will be
209  * generated up to the limits of the selected device. Use
210  * ::thread_id_to_array_index_slab in the kernel to determine the array index
211  * to process. See ::get_launch_props for how to query CUDA for block_grid_max
212  * and warp_size parameters.
213  *
214  * @param[in] nxy the length of the array being processed
215  * @param[in] warps_per_block the number of warps to use per block (your choice)
216  * @param[in] warp_size the number of threads per warp
217  * @param[in] block_grid_max the maximum number of blocks in the 3D block
218  * grid supported by the CUDA device
219  * @param[out] block_grid the block dimension kernel launch control parameter
220  * @param[out] n_blocks_xy the number of nxy sized blocks
221  * @param[out] n_blocks_z the number of blocks in the vertical direction
222  * @param[out] thread_grid the thread dimension kernel launch control parameter
223  *
224  * @returns zero if successful and non-zero if an error occurred
225  */
227 int partition_thread_blocks_slab(size_t nxy, size_t nz, size_t stride,
228  int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid,
229  int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid);
230 
231 /** Calculate CUDA launch parameters for an arbitrarily large 1D array.
232  * @param[in] nt the number of threads per block
233  * @param[in] n_vals the size of the 1D array
234  * @returns a tuple containing the number of thread blocks and the number of
235  * threads per block
236  */
237 inline
238 auto partition_thread_blocks_1d(unsigned int nt, size_t n_vals)
239 {
240  return std::make_tuple((n_vals / nt + (n_vals % nt ? 1 : 0)), nt);
241 }
242 ///@}
243 
244 
245 
246 /// A collection of CUDA streams.
247 /** This container always has as its first element the cudaStreamPerThread
248  * stream. If more than one stream is desired one can call ::resize to add
249  * new streams to the collection. For simplicity copying the container is
250  * disabled, but this feature could be added if needed.
251  */
253 {
254 public:
255  cuda_stream_vector() : m_vec(1, cudaStreamPerThread) {}
257 
258  /// prevent copies, OK to enable these if needed
260  void operator=(const cuda_stream_vector &) = delete;
261 
262  /// resize the collection. creates and destroys streams as needed
263  int resize(size_t n);
264 
265  /// get the number of available cuda streams
266  size_t size() const { return m_vec.size(); }
267 
268  /// get the ith cuda stream
269  cudaStream_t &operator[](size_t i) { return m_vec[i]; }
270 
271  /// get the ith cuda stream
272  const cudaStream_t &operator[](size_t i) const { return m_vec[i]; }
273 
274 private:
275  std::vector<cudaStream_t> m_vec;
276 };
277 
278 }
279 #endif
A collection of CUDA streams.
Definition: teca_cuda_util.h:253
size_t size() const
get the number of available cuda streams
Definition: teca_cuda_util.h:266
const cudaStream_t & operator[](size_t i) const
get the ith cuda stream
Definition: teca_cuda_util.h:272
cudaStream_t & operator[](size_t i)
get the ith cuda stream
Definition: teca_cuda_util.h:269
cuda_stream_vector(const cuda_stream_vector &)=delete
prevent copies, OK to enable these if needed
int resize(size_t n)
resize the collection. creates and destroys streams as needed
A collection of utility classes and functions for integrating with CUDA.
Definition: teca_cuda_util.h:20
TECA_EXPORT int set_device(int device_id)
set the CUDA device. returns non-zero on error
TECA_EXPORT int partition_thread_blocks_slab(int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
TECA_EXPORT int synchronize_device()
device wide synchronize
__device__ unsigned long thread_id_to_array_index()
Definition: teca_cuda_util.h:89
__device__ T * shared_memory_proxy()
Definition: teca_cuda_util.h:24
TECA_EXPORT int get_local_cuda_devices(MPI_Comm comm, int &ranks_per_device, std::vector< int > &local_dev)
TECA_EXPORT int synchronize_stream()
synchronize the default stream
__device__ int index_is_valid(unsigned long index, unsigned long max_index)
bounds check the flat index
Definition: teca_cuda_util.h:114
auto partition_thread_blocks_1d(unsigned int nt, size_t n_vals)
Definition: teca_cuda_util.h:238
TECA_EXPORT int get_launch_props(int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
TECA_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
__device__ void thread_id_to_array_index_slab(unsigned long &i, unsigned long &k0, unsigned long stride)
Definition: teca_cuda_util.h:101
p_teca_error_handler error_handler TECA_EXPORT
The global error handler instance.