1 #ifndef teca_cuda_util_h
2 #define teca_cuda_util_h
6 #include "teca_config.h"
15 #include <cuda_runtime.h>
26 extern __shared__
unsigned char memory[];
27 return reinterpret_cast<T*
>(memory);
49 std::vector<int> &local_dev);
91 return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
92 + blockIdx.z * gridDim.x * gridDim.y);
102 unsigned long stride)
105 i = threadIdx.x + blockDim.x * blockIdx.x;
108 k0 = stride * blockIdx.y;
116 return index < max_index;
130 int *block_grid_max,
int &warp_size,
131 int &max_warps_per_block);
151 int warps_per_block, dim3 &block_grid,
int &n_blocks,
174 int warps_per_block,
int warp_size,
int *block_grid_max,
175 dim3 &block_grid,
int &n_blocks, dim3 &thread_grid);
200 size_t stride,
int warps_per_block, dim3 &block_grid,
int &n_blocks_xy,
201 int &n_blocks_z, dim3 &thread_grid);
228 int warps_per_block,
int warp_size,
int *block_grid_max, dim3 &block_grid,
229 int &n_blocks_xy,
int &n_blocks_z, dim3 &thread_grid);
240 return std::make_tuple((n_vals / nt + (n_vals % nt ? 1 : 0)), nt);
266 size_t size()
const {
return m_vec.size(); }
272 const cudaStream_t &
operator[](
size_t i)
const {
return m_vec[i]; }
275 std::vector<cudaStream_t> m_vec;
A collection of CUDA streams.
Definition: teca_cuda_util.h:253
size_t size() const
get the number of available CUDA streams
Definition: teca_cuda_util.h:266
const cudaStream_t & operator[](size_t i) const
get the i-th CUDA stream
Definition: teca_cuda_util.h:272
cudaStream_t & operator[](size_t i)
get the i-th CUDA stream
Definition: teca_cuda_util.h:269
cuda_stream_vector(const cuda_stream_vector &)=delete
prevent copies, OK to enable these if needed
int resize(size_t n)
resize the collection, creating and destroying streams as needed
A collection of utility classes and functions for integrating with CUDA.
Definition: teca_cuda_util.h:20
TECA_EXPORT int set_device(int device_id)
set the CUDA device; returns non-zero on error
TECA_EXPORT int partition_thread_blocks_slab(int device_id, size_t nxy, size_t nz, size_t stride, int warps_per_block, dim3 &block_grid, int &n_blocks_xy, int &n_blocks_z, dim3 &thread_grid)
TECA_EXPORT int synchronize_device()
device-wide synchronize
__device__ unsigned long thread_id_to_array_index()
Definition: teca_cuda_util.h:89
__device__ T * shared_memory_proxy()
Definition: teca_cuda_util.h:24
TECA_EXPORT int get_local_cuda_devices(MPI_Comm comm, int &ranks_per_device, std::vector< int > &local_dev)
TECA_EXPORT int synchronize_stream()
synchronize the default stream
__device__ int index_is_valid(unsigned long index, unsigned long max_index)
bounds check the flat index
Definition: teca_cuda_util.h:114
auto partition_thread_blocks_1d(unsigned int nt, size_t n_vals)
Definition: teca_cuda_util.h:238
TECA_EXPORT int get_launch_props(int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
TECA_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
__device__ void thread_id_to_array_index_slab(unsigned long &i, unsigned long &k0, unsigned long stride)
Definition: teca_cuda_util.h:101
p_teca_error_handler error_handler TECA_EXPORT
The global error handler instance.