TECA
The Toolkit for Extreme Climate Analysis
teca_thread_util.h
Go to the documentation of this file.
1 #ifndef teca_thread_utils_h
2 #define teca_thread_utils_h
3 
4 /// @file
5 
6 #include "teca_config.h"
7 #include "teca_common.h"
8 #include "teca_mpi.h"
9 
10 #include <deque>
11 
12 /// Codes for dealing with threading
14 {
15 /** load balances threads across an MPI communication space such that on the
16  * individual nodes physical cores each receive the same number of threads.
17  * This is an MPI collective call. Building the affinity map relies on
18  * features available only in _GNU_SOURCE. On systems where these features are
19  * unavailable, when automated detection of the number of threads is requested,
20  * the call will fail and n_threads will be set to 1.
21  *
22  * @param[in] comm an MPI communication space to load balance threads across.
23  * the communicator is used to coordinate affinity mapping such that
24  * each rank can allocate a number of threads bound to unique cores.
25  *
26  * @param[in] base_core_id identifies the core in use by this MPI rank's main
27  * thread. if -1 is passed this will be automatically
28  * determined.
29  *
30  * @param[in] n_requested the number of requested threads per rank. Passing a
31  * value of -1 results in use of all the cores on the
32  * node such that each physical core is assigned exactly
33  * 1 thread. Note that for performance reasons
34  * hyperthreads are not used here. The suggested number
35  * of threads is returned in n_threads, and the returned
36  * affinity map specifies which core the thread should
37  * be bound to to achieve this. Passing n_requested >= 1
38  * specifies a run time override. This indicates that
39  * the caller wants to use a specific number of threads,
40  * rather than one per physical core. Passing
41  * n_requested < -1 specifies a maximum to use if
42  * sufficient cores are available. In all cases the
43  * affinity map is constructed.
44  *
45  * @param[in] n_threads_per_device the number of threads that should service
46  * GPUs. If 0 the run will be CPU only. If -1
47  * the default setting (8 threads per GPU) will
48  * be used. This can be overridden at runtime
49  * with the TECA_THREADS_PER_DEVICE environment
50  * variable.
51  *
52  * @param[in] n_ranks_per_device the number of MPI ranks that should be allowed
53  * to access each GPU. MPI ranks not allowed to
54  * access a GPU will execute on the CPU.
55  *
56  * @param[in] bind if true extra work is done to determine an affinity map such
57  * that each thread can be bound to a unique core on the node.
58  *
59  * @param[in] verbose prints a report describing the affinity map.
60  *
61  * @param[in,out] n_threads if n_requested is -1, this will be set to the number
62  * of threads one can use such that there is one
63  * thread per physical core taking into account all
64  * ranks running on the node. if n_requested is >= 1
65  * n_threads will explicitly be set to n_requested. If
66  * n_requested < -1 at most -n_requested threads will
67  * be used. Fewer threads will be used if there are
68  * insufficient cores available. if an error occurs
69  * and n_requested is -1 this will be set to 1.
70  *
71  * @param[out] affinity an affinity map, describing for each of n_threads,
72  * a core id that the thread can be bound to. if
73  * n_requested is -1 then the map will contain an entry
74  * for each of n_threads where each of the threads is
75  * assigned a unique physical core. when n_requested is >=
76  * 1 the map contains an entry for each of the n_requested
77  * threads such that when more threads are requested than
78  * cores each core is assigned approximately the same
79  * number of threads.
80  *
81  * @returns 0 on success
82  *
83  * Environment variables:
84  *
85  * | Variable | Description |
86  * | ----------------------- | ----------- |
87  * | TECA_THREADS_PER_DEVICE | The number of threads that will service each GPU |
88  * | TECA_RANKS_PER_DEVICE | The number of MPI ranks allowed to use each GPU |
89  */
// NOTE(review): device_ids has no @param entry in the doc comment preceding this
// declaration. Presumably it receives the ids of the devices (GPUs) this rank
// may use -- TODO confirm against the implementation and document it.
// NOTE(review): std::vector is used below, but among the includes visible in this
// header only <deque> is pulled in directly; <vector> presumably arrives through
// one of the teca_*.h headers -- verify, or include it explicitly.
91 int thread_parameters(MPI_Comm comm, int base_core_id, int n_requested,
92  int n_threads_per_device, int n_ranks_per_device, bool bind, bool verbose,
93  int &n_threads, std::deque<int> &affinity, std::vector<int> &device_ids);
94 };
95 
96 #endif
p_teca_error_handler error_handler TECA_EXPORT
The global error handler instance.
Codes for dealing with threading.
Definition: teca_thread_util.h:14
TECA_EXPORT int thread_parameters(MPI_Comm comm, int base_core_id, int n_requested, int n_threads_per_device, int n_ranks_per_device, bool bind, bool verbose, int &n_threads, std::deque< int > &affinity, std::vector< int > &device_ids)