A collection of utility classes and functions for integrating with CUDA. More...

Classes
class	cuda_stream_vector
	A collection of CUDA streams. More...

Functions
template<typename T >
__device__ T *	shared_memory_proxy ()

TECA_EXPORT int	get_local_cuda_devices (MPI_Comm comm, int &ranks_per_device, std::vector< int > &local_dev)

TECA_EXPORT int	set_device (int device_id)
	set the CUDA device. returns non-zero on error More...

TECA_EXPORT int	synchronize_device ()
	device wide synchronize More...

TECA_EXPORT int	synchronize_stream ()
	synchronize the default stream More...

CUDA indexing scheme
A flat array is broken into blocks of number of threads where each adjacent thread accesses adjacent memory locations. To accomplish this we might need a large number of blocks. If the number of blocks exceeds the max block dimension in the first and or second block grid dimension then we need to use a 2d or 3d block grid. partition_thread_blocks - decides on a partitioning of the data based on warps_per_block parameter. The resulting decomposition will be either 1,2, or 3D as needed to accommodate the number of fixed sized blocks. It can happen that max grid dimensions are hit, in which case you'll need to increase the number of warps per block. thread_id_to_array_index - given a thread and block id gets the array index to update. _this may be out of bounds so be sure to validate before using it. index_is_valid - test an index for validity.
__device__ unsigned long	thread_id_to_array_index ()

__device__ void	thread_id_to_array_index_slab (unsigned long &i, unsigned long &k0, unsigned long stride)

__device__ int	index_is_valid (unsigned long index, unsigned long max_index)
	bounds check the flat index More...

TECA_EXPORT int	get_launch_props (int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)

TECA_EXPORT int	partition_thread_blocks (int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)

TECA_EXPORT int	partition_thread_blocks (size_t array_size, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)

TECA_EXPORT int	partition_thread_blocks_slab (int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)

TECA_EXPORT int	partition_thread_blocks_slab (size_t nxy, size_t nz, size_t stride, int warps_per_block, int warp_size, int *block_grid_max, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)

auto	partition_thread_blocks_1d (unsigned int nt, size_t n_vals)

Detailed Description

A collection of utility classes and functions for integrating with CUDA.

Function Documentation

◆ get_launch_props()

TECA_EXPORT int teca_cuda_util::get_launch_props	(	int	device_id,
		int *	block_grid_max,
		int &	warp_size,
		int &	max_warps_per_block
	)

query properties for the named CUDA device.

Parameters

[in]	device_id	The device to query, or -1 for the active device.
[out]	block_grid_max	a 3 value array of the maximum number of thread blocks supported in x,y, and z directions.
[out]	warp_size	the number of threads per warp
[out]	max_warps_per_block	the maximum number of warps per block supported.

Returns: non-zero on error.

◆ get_local_cuda_devices()

TECA_EXPORT int teca_cuda_util::get_local_cuda_devices	(	MPI_Comm	comm,
		int &	ranks_per_device,
		std::vector< int > &	local_dev
	)

Query the system for the locally available(on this rank) CUDA device count. this is an MPI collective call which returns a set of device ids that can be used locally. Node wide coordination assures that one can put a limit on the number of ranks per node.

Parameters

[in]	comm	MPI communicator defining a set of nodes on which need access to the available GPUS
[in,out]	ranks_per_device	The number of MPI ranks to use per CUDA device. When set to 0 no GPUs are used. When set to -1 all ranks are assigned a GPU but multiple ranks will share a GPU when there are more ranks than devices.
[out]	local_dev	a list of device ids that can be used by the calling MPI rank.

Returns: non-zero on error.

◆ index_is_valid()

__device__ int teca_cuda_util::index_is_valid	(	unsigned long	index,
		unsigned long	max_index
	)

inline

bounds check the flat index

◆ partition_thread_blocks() [1/2]

TECA_EXPORT int teca_cuda_util::partition_thread_blocks	(	int	device_id,
		size_t	array_size,
		int	warps_per_block,
		dim3 &	block_grid,
		int &	n_blocks,
		dim3 &	thread_grid
	)

Calculate CUDA launch parameters for an arbitrarily large flat array. The block grid will be 1d if the device can process the array using a 1d block grid, otherwise dimensions are added to accomodate the array size up to the largest grid supported by the device. Use thread_id_to_array_index in the kernel to determine the array index to process.

Parameters

[in]	device_id	the CUDA device to query launch parameter limits from. Use -1 to query from the currently active device.
[in]	array_size	the length of the array being processed
[in]	warps_per_block	the number of warps to use per block (your choice)
[out]	block_grid	the block dimension kernel launch control
[out]	n_blocks	the number of blocks
[out]	thread_grid	the thread dimension kernel launch control

Returns: zero if successful and non-zero if an error occurred

◆ partition_thread_blocks() [2/2]

TECA_EXPORT int teca_cuda_util::partition_thread_blocks	(	size_t	array_size,
		int	warps_per_block,
		int	warp_size,
		int *	block_grid_max,
		dim3 &	block_grid,
		int &	n_blocks,
		dim3 &	thread_grid
	)

Calculate CUDA launch parameters for an arbitrarily large flat array. The block grid will be 1d if the device can process the array using a 1d block grid, otherwise dimensions are added to accomodate the array size up to the largest grid supported by the device. Use thread_id_to_array_index in the kernel to determine the array index to process. See get_launch_props for how to query CUDA for block_grid_max and warp_size parameters.

Parameters

[in]	array_size	the length of the array being processed
[in]	warps_per_block	the number of warps to use per block (your choice)
[in]	warp_size	the number of threads per warp
[in]	block_grid_max	the maximum number of blocks in the 3D block grid supported by the CUDA device
[out]	block_grid	the block dimension kernel launch control parameter
[out]	n_blocks	the number of blocks
[out]	thread_grid	the thread dimension kernel launch control parameter

Returns: zero if successful and non-zero if an error occurred

◆ partition_thread_blocks_1d()

auto teca_cuda_util::partition_thread_blocks_1d	(	unsigned int	nt,
		size_t	n_vals
	)

inline

Calculate CUDA launch parameters for an arbitrarily large 1D array.

Parameters

[in]	nt	the number of threads per block
[in]	n_vals	the size of the 1D array

Returns: a tuple containing the number of thread blocks and the number of threads per block

◆ partition_thread_blocks_slab() [1/2]

TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab	(	int	device_id,
		size_t	nxy,
		size_t	nz,
		size_t	stride,
		int	warps_per_block,
		dim3 &	block_grid,
		int &	n_blocks_xy,
		int &	n_blocks_z,
		dim3 &	thread_grid
	)

Calculate CUDA launch parameters for an arbitrarily large 3D array that will be processed by looping over vertical the vertical dimension. Partitioning in the first dimension occurs over x-y slab sized sections of the array. In the second dimension the caller declares the number of elements (stride) desired in vertical dimension loops. A 2d block grid will be generated up to the limits of the selected device. Use thread_id_to_array_index_slab in the kernel to determine the array index to process.

Parameters

[in]	device_id	the CUDA device to query launch parameter limits from. Use -1 to query from the currently active device.
[in]	nxy	the size in the xy dimension of the array being processed
[in]	nz	the size in the vertical dimension of the array being processed
[in]	warps_per_block	the number of warps to use per block (your choice)
[out]	block_grid	the block dimension kernel launch control
[out]	n_blocks_xy	the number of nxy sized blocks
[out]	n_blocks_z	the number of blocks in the vertical dimension
[out]	thread_grid	the thread dimension kernel launch control

Returns: zero if successful and non-zero if an error occurred

◆ partition_thread_blocks_slab() [2/2]

TECA_EXPORT int teca_cuda_util::partition_thread_blocks_slab	(	size_t	nxy,
		size_t	nz,
		size_t	stride,
		int	warps_per_block,
		int	warp_size,
		int *	block_grid_max,
		dim3 &	block_grid,
		int &	n_blocks_xy,
		int &	n_blocks_z,
		dim3 &	thread_grid
	)

Calculate CUDA launch parameters for an arbitrarily large 3D array that will be processed by looping over vertical the vertical dimension. Partitioning in the first dimension occurs over x-y slab sized sections of the array. In the second dimension the caller declares the number of elements (stride) desired in vertical dimension loops. Kernels will compute the loop bounds from the stride and block index. A 2d block grid will be generated up to the limits of the selected device. Use thread_id_to_array_index_slab in the kernel to determine the array index to process. See get_launch_props for how to query CUDA for block_grid_max and warp_size parameters.

Parameters

[in]	nxy	the length of the array being processed
[in]	warps_per_block	the number of warps to use per block (your choice)
[in]	warp_size	the number of threads per warp
[in]	block_grid_max	the maximum number of blocks in the 3D block grid supported by the CUDA device
[out]	block_grid	the block dimension kernel launch control parameter
[out]	n_blocks_xy	the number of nxy sized blocks
[out]	n_blocks_z	the number of blocks in the vertical direction
[out]	thread_grid	the thread dimension kernel launch control parameter

Returns: zero if successful and non-zero if an error occurred

◆ set_device()

TECA_EXPORT int teca_cuda_util::set_device ( int device_id )

set the CUDA device. returns non-zero on error

◆ shared_memory_proxy()

template<typename T >

__device__ T* teca_cuda_util::shared_memory_proxy ( )

a wrapper for using dynamic sized shared memory in a template function.

◆ synchronize_device()

TECA_EXPORT int teca_cuda_util::synchronize_device ( )

device wide synchronize

◆ synchronize_stream()

TECA_EXPORT int teca_cuda_util::synchronize_stream ( )

synchronize the default stream

◆ thread_id_to_array_index()

__device__ unsigned long teca_cuda_util::thread_id_to_array_index ( )

inline

convert a CUDA index into a flat array index using the partitioning scheme defined in partition_thread_blocks

◆ thread_id_to_array_index_slab()

__device__ void teca_cuda_util::thread_id_to_array_index_slab	(	unsigned long &	i,
		unsigned long &	k0,
		unsigned long	stride
	)

inline

convert a CUDA index into a flat array index using the partitioning scheme defined in partition_thread_blocks_slab. This gives the index of the first element in the vertical column.

Classes

Functions

Detailed Description

Function Documentation

◆ get_launch_props()

◆ get_local_cuda_devices()

◆ index_is_valid()

◆ partition_thread_blocks() [1/2]

◆ partition_thread_blocks() [2/2]

◆ partition_thread_blocks_1d()

◆ partition_thread_blocks_slab() [1/2]

◆ partition_thread_blocks_slab() [2/2]

◆ set_device()

◆ shared_memory_proxy()

◆ synchronize_device()

◆ synchronize_stream()

◆ thread_id_to_array_index()

◆ thread_id_to_array_index_slab()