TECA
The Toolkit for Extreme Climate Analysis
teca_cuda_util.h File Reference
#include "teca_config.h"
#include "teca_common.h"
#include "teca_mpi.h"
#include <deque>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
Include dependency graph for teca_cuda_util.h:

Go to the source code of this file.

Namespaces

 teca_cuda_util
 A collection of utility classes and functions for integrating with CUDA.
 

Functions

TECA_EXPORT int teca_cuda_util::get_local_cuda_devices (MPI_Comm comm, int &ranks_per_device, std::vector< int > &local_dev)
 
TECA_EXPORT int teca_cuda_util::set_device (int device_id)
 set the CUDA device. returns non-zero on error More...
 
TECA_EXPORT int teca_cuda_util::synchronize ()
 stop and wait for previously launched kernels to complete More...
 
CUDA indexing scheme

A flat array is broken into blocks of number of threads where each adjacent thread accesses adjacent memory locations. To accomplish this we might need a large number of blocks. If the number of blocks exceeds the max block dimension in the first and or second block grid dimension then we need to use a 2d or 3d block grid.

::partition_thread_blocks - decides on a partitioning of the data based on warps_per_block parameter. The resulting decomposition will be either 1,2, or 3D as needed to accommodate the number of fixed sized blocks. It can happen that max grid dimensions are hit, in which case you'll need to increase the number of warps per block.

::thread_id_to_array_index - given a thread and block id gets the array index to update. _this may be out of bounds so be sure to validate before using it.

::index_is_valid - test an index for validity.

__device__ unsigned long teca_cuda_util::thread_id_to_array_index ()
 
__device__ void teca_cuda_util::thread_id_to_array_index_slab (unsigned long &i, unsigned long &k0, unsigned long stride)
 
__device__ int teca_cuda_util::index_is_valid (unsigned long index, unsigned long max_index)
 bounds check the flat index More...
 
TECA_EXPORT int teca_cuda_util::get_launch_props (int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks (int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks (size_t array_size, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab (int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab (size_t nxy, size_t nz, size_t stride, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
 
CUDA indexing scheme

A flat array is broken into blocks of number of threads where each adjacent thread accesses adjacent memory locations. To accomplish this we might need a large number of blocks. If the number of blocks exceeds the max block dimension in the first and or second block grid dimension then we need to use a 2d or 3d block grid.

::partition_thread_blocks - decides on a partitioning of the data based on warps_per_block parameter. The resulting decomposition will be either 1,2, or 3D as needed to accommodate the number of fixed sized blocks. It can happen that max grid dimensions are hit, in which case you'll need to increase the number of warps per block.

::thread_id_to_array_index - given a thread and block id gets the array index to update. _this may be out of bounds so be sure to validate before using it.

::index_is_valid - test an index for validity.

__device__ unsigned long teca_cuda_util::thread_id_to_array_index ()
 
__device__ void teca_cuda_util::thread_id_to_array_index_slab (unsigned long &i, unsigned long &k0, unsigned long stride)
 
__device__ int teca_cuda_util::index_is_valid (unsigned long index, unsigned long max_index)
 bounds check the flat index More...
 
TECA_EXPORT int teca_cuda_util::get_launch_props (int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks (int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks (size_t array_size, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab (int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
 
TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab (size_t nxy, size_t nz, size_t stride, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)