TECA
The Toolkit for Extreme Climate Analysis
teca_cuda_util.h
Go to the documentation of this file.
1 #ifndef teca_cuda_util_h
2 #define teca_cuda_util_h
3 
4 /// @file
5 
6 #include "teca_config.h"
7 #include "teca_common.h"
8 #include "teca_mpi.h"
9 
10 #include <deque>
11 #include <vector>
12 
13 #include <cuda.h>
14 #include <cuda_runtime.h>
15 
16 
17 /// A collection of utility classes and functions for integrating with CUDA
18 namespace teca_cuda_util
19 {
20 /** Query the system for the locally available(on this rank) CUDA device count.
21  * this is an MPI collective call which returns a set of device ids that can be
22  * used locally. If there are as many (or more than) devices on the node than
23  * the number of MPI ranks assigned to the node the list of device ids will be
24  * unique across MPI ranks on the node. Otherwise devices are assigned round
25  * robin fashion.
26  *
27  * @param[in] comm MPI communicator defining a set of nodes on which need
28  * access to the available GPUS
29  * @param[in,out] ranks_per_device The number of MPI ranks to use per CUDA
30  * device. Passing -1 will assign all MPI
31  * ranks a GPU up to a maximum of 8 ranks
32  * per GPU. The number of ranks per GPU
33  * used is returned through this argument.
34  * @param[out] local_dev a list of device ids that can be used by the calling
35  * MPI rank.
36  * @returns non-zero on error.
37  */
39 int get_local_cuda_devices(MPI_Comm comm, int &ranks_per_device,
40  std::vector<int> &local_dev);
41 
42 /// set the CUDA device. returns non-zero on error
44 int set_device(int device_id);
45 
46 /// stop and wait for previously launched kernels to complete
48 int synchronize();
49 
50 /** A flat array is broken into blocks of number of threads where each adjacent
51  * thread accesses adjacent memory locations. To accomplish this we might need
52  * a large number of blocks. If the number of blocks exceeds the max block
53  * dimension in the first and or second block grid dimension then we need to
54  * use a 2d or 3d block grid.
55  *
56  * ::partition_thread_blocks - decides on a partitioning of the data based on
57  * warps_per_block parameter. The resulting decomposition will be either 1,2,
58  * or 3D as needed to accommodate the number of fixed sized blocks. It can
59  * happen that max grid dimensions are hit, in which case you'll need to
60  * increase the number of warps per block.
61  *
62  * ::thread_id_to_array_index - given a thread and block id gets the
63  * array index to update. _this may be out of bounds so be sure
64  * to validate before using it.
65  *
66  * ::index_is_valid - test an index for validity.
67 */
68 /// @name CUDA indexing scheme
69 ///@{
70 
71 /** convert a CUDA index into a flat array index using the partitioning scheme
72  * defined in ::partition_thread_blocks
73  */
74 inline
75 __device__
76 unsigned long thread_id_to_array_index()
77 {
78  return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
79  + blockIdx.z * gridDim.x * gridDim.y);
80 }
81 
82 /** convert a CUDA index into a flat array index using the partitioning scheme
83  * defined in ::partition_thread_blocks_slab. This gives the index of the first
84  * element in the vertical column.
85  */
86 inline
87 __device__
88 void thread_id_to_array_index_slab(unsigned long &i, unsigned long &k0,
89  unsigned long stride)
90 {
91  // index in the xy slab
92  i = threadIdx.x + blockDim.x * blockIdx.x;
93 
94  // first index in the vertical dimension
95  k0 = stride * blockIdx.y;
96 }
97 
98 /// bounds check the flat index
99 inline
100 __device__
101 int index_is_valid(unsigned long index, unsigned long max_index)
102 {
103  return index < max_index;
104 }
105 
106 /** query properties for the named CUDA device.
107  * @param[in] device_id The device to query, or -1 for the active device.
108  * @param[out] block_grid_max a 3 value array of the maximum number of thread
109  * blocks supported in x,y, and z directions.
110  * @param[out] warp_size the number of threads per warp
111  * @param[out] max_warps_per_block the maximum number of warps per block
112  * supported.
113  * @returns non-zero on error.
114  */
116 int get_launch_props(int device_id,
117  int *block_grid_max, int &warp_size,
118  int &max_warps_per_block);
119 
120 /** Calculate CUDA launch parameters for an arbitrarily large flat array. The
121  * block grid will be 1d if the device can process the array using a 1d block
122  * grid, otherwise dimensions are added to accomodate the array size up to the
123  * largest grid supported by the device. Use ::thread_id_to_array_index in the
124  * kernel to determine the array index to process.
125  *
126  * @param[in] device_id the CUDA device to query launch parameter limits from.
127  * Use -1 to query from the currently active device.
128  * @param[in] array_size the length of the array being processed
129  * @param[in] warps_per_block the number of warps to use per block (your choice)
130  * @param[out] block_grid the block dimension kernel launch control
131  * @param[out] n_blocks the number of blocks
132  * @param[out] thread_grid the thread dimension kernel launch control
133  *
134  * @returns zero if successful and non-zero if an error occurred
135  */
137 int partition_thread_blocks(int device_id, size_t array_size,
138  int warps_per_block, dim3 &block_grid, int &n_blocks,
139  dim3 &thread_grid);
140 
141 /** Calculate CUDA launch parameters for an arbitrarily large flat array. The
142  * block grid will be 1d if the device can process the array using a 1d block
143  * grid, otherwise dimensions are added to accomodate the array size up to the
144  * largest grid supported by the device. Use ::thread_id_to_array_index in the
145  * kernel to determine the array index to process. See ::get_launch_props for
146  * how to query CUDA for block_grid_max and warp_size parameters.
147  *
148  * @param[in] array_size the length of the array being processed
149  * @param[in] warps_per_block the number of warps to use per block (your choice)
150  * @param[in] warp_size the number of threads per warp
151  * @param[in] block_grid_max the maximum number of blocks in the 3D block
152  * grid supported by the CUDA device
153  * @param[out] block_grid the block dimension kernel launch control parameter
154  * @param[out] n_blocks the number of blocks
155  * @param[out] thread_grid the thread dimension kernel launch control parameter
156  *
157  * @returns zero if successful and non-zero if an error occurred
158  */
160 int partition_thread_blocks(size_t array_size,
161  int warps_per_block, int warp_size, int *block_grid_max,
162  dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
163 
164 /** Calculate CUDA launch parameters for an arbitrarily large 3D array that
165  * will be processed by looping over vertical the vertical dimension.
166  * Partitioning in the first dimension occurs over x-y slab sized sections of
167  * the array. In the second dimension the caller declares the number of
168  * elements (stride) desired in vertical dimension loops. A 2d block grid will
169  * be generated up to the limits of the selected device. Use
170  * ::thread_id_to_array_index_slab in the kernel to determine the array index
171  * to process.
172  *
173  * @param[in] device_id the CUDA device to query launch parameter limits from.
174  * Use -1 to query from the currently active device.
175  * @param[in] nxy the size in the xy dimension of the array being processed
176  * @param[in] nz the size in the vertical dimension of the array being processed
177  * @param[in] warps_per_block the number of warps to use per block (your choice)
178  * @param[out] block_grid the block dimension kernel launch control
179  * @param[out] n_blocks_xy the number of nxy sized blocks
180  * @param[out] n_blocks_z the number of blocks in the vertical dimension
181  * @param[out] thread_grid the thread dimension kernel launch control
182  *
183  * @returns zero if successful and non-zero if an error occurred
184  */
186 int partition_thread_blocks_slab(int device_id, size_t nxy, size_t nz,
187  size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy,
188  int &n_blocks_z, dim3 &thread_grid);
189 
190 /** Calculate CUDA launch parameters for an arbitrarily large 3D array that
191  * will be processed by looping over vertical the vertical dimension.
192  * Partitioning in the first dimension occurs over x-y slab sized sections of
193  * the array. In the second dimension the caller declares the number of
194  * elements (stride) desired in vertical dimension loops. Kernels will compute
195  * the loop bounds from the stride and block index. A 2d block grid will be
196  * generated up to the limits of the selected device. Use
197  * ::thread_id_to_array_index_slab in the kernel to determine the array index
198  * to process. See ::get_launch_props for how to query CUDA for block_grid_max
199  * and warp_size parameters.
200  *
201  * @param[in] nxy the length of the array being processed
202  * @param[in] warps_per_block the number of warps to use per block (your choice)
203  * @param[in] warp_size the number of threads per warp
204  * @param[in] block_grid_max the maximum number of blocks in the 3D block
205  * grid supported by the CUDA device
206  * @param[out] block_grid the block dimension kernel launch control parameter
207  * @param[out] n_blocks_xy the number of nxy sized blocks
208  * @param[out] n_blocks_z the number of blocks in the vertical direction
209  * @param[out] thread_grid the thread dimension kernel launch control parameter
210  *
211  * @returns zero if successful and non-zero if an error occurred
212  */
214 int partition_thread_blocks_slab(size_t nxy, size_t nz, size_t stride,
215  int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid,
216  int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid);
217 
218 }
219 ///@}
220 #endif
teca_cuda_util
A collection of utility classes and functions for integrating with CUDA.
Definition: teca_cuda_util.h:18
teca_cuda_util::set_device
TECA_EXPORT int set_device(int device_id)
set the CUDA device. returns non-zero on error
teca_cuda_util::get_local_cuda_devices
TECA_EXPORT int get_local_cuda_devices(MPI_Comm comm, int &ranks_per_device, std::vector< int > &local_dev)
teca_cuda_util::partition_thread_blocks_slab
TECA_EXPORT int partition_thread_blocks_slab(int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
teca_cuda_util::partition_thread_blocks
TECA_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
teca_common.h
teca_cuda_util::thread_id_to_array_index_slab
__device__ void thread_id_to_array_index_slab(unsigned long &i, unsigned long &k0, unsigned long stride)
Definition: teca_cuda_util.h:88
teca_cuda_util::synchronize
TECA_EXPORT int synchronize()
stop and wait for previously launched kernels to complete
teca_cuda_util::thread_id_to_array_index
__device__ unsigned long thread_id_to_array_index()
Definition: teca_cuda_util.h:76
teca_cuda_util::get_launch_props
TECA_EXPORT int get_launch_props(int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
teca_cuda_util::index_is_valid
__device__ int index_is_valid(unsigned long index, unsigned long max_index)
bounds check the flat index
Definition: teca_cuda_util.h:101
teca_error::TECA_EXPORT
p_teca_error_handler error_handler TECA_EXPORT
The global error handler instance.