SST/macro
sumi.h
Go to the documentation of this file.
1 #ifndef sumi_msg_api_h
2 #define sumi_msg_api_h
3 
4 #include <sumi/options.h>
5 #include <sumi/message.h>
6 #include <sumi/comm_functions.h>
7 #include <sumi/collective_message.h>
8 #include <sumi/timeout.h>
9 #include <sumi/communicator.h>
10 #include <sumi/thread_safe_set.h>
11 
12 #define main USER_MAIN
13 
14 namespace sumi {
15 
16 void
17 comm_init();
18 
19 void
21 
22 int
23 comm_rank();
24 
25 int
26 comm_nproc();
27 
28 /**
29  @param dst The destination to send to
30 */
31 void
32 comm_send_header(int dst, const message::ptr& msg);
33 
34 void
35 comm_cancel_ping(int dst, int tag);
36 
37 void
38 comm_ping(int dst, int tag, timeout_function* func);
39 
40 void
41 comm_send_payload(int dst, const message::ptr& msg);
42 
43 void
44 comm_send(int dst, message::payload_type_t ev, const message::ptr& msg);
45 
46 void
47 comm_rdma_put(int dst, const message::ptr& msg);
48 
49 void
50 comm_rdma_get(int dst, const message::ptr& msg);
51 
52 void
53 comm_nvram_get(int dst, const message::ptr& msg);
54 
55 void
56 comm_alltoall(void* dst, void* src, int nelems,
57  int type_size, int tag, bool fault_aware = false,
58  int context = options::initial_context, communicator* dom = 0);
59 
60 void
61 comm_allgather(void* dst, void* src, int nelems,
62  int type_size, int tag, bool fault_aware = false,
63  int context = options::initial_context, communicator* dom = 0);
64 
65 void
66 comm_allgatherv(void* dst, void* src, int* recv_counts,
67  int type_size, int tag, bool fault_aware = false,
68  int context = options::initial_context, communicator* dom = 0);
69 
70 void
71 comm_gather(int root, void* dst, void* src, int nelems,
72  int type_size, int tag, bool fault_aware = false,
73  int context = options::initial_context, communicator* dom = 0);
74 
75 void
76 comm_scatter(int root, void* dst, void* src, int nelems,
77  int type_size, int tag, bool fault_aware = false,
78  int context = options::initial_context, communicator* dom = 0);
79 
80 void
81 comm_bcast(int root, void* buffer, int nelems,
82  int type_size, int tag, bool fault_aware = false,
83  int context = options::initial_context, communicator* dom = 0);
84 
85 /**
86 * The total size of the input/result buffer in bytes is nelems*type_size
87 * @param dst Buffer for the result. Can be NULL to ignore payloads.
88 * @param src Buffer for the input. Can be NULL to ignore payloads.
89 * @param nelems The number of elements in the input and result buffer.
90 * @param type_size The size of the input type, i.e. sizeof(int), sizeof(double)
91 * @param tag A unique tag identifier for the collective
92 * @param fxn The function that will actually perform the reduction
93 * @param fault_aware Whether to execute in a fault-aware fashion to detect failures
94 * @param context The context (i.e. initial set of failed procs)
95 */
96 void
97 comm_allreduce(void* dst, void* src, int nelems, int type_size, int tag,
98  reduce_fxn fxn, bool fault_aware=false, int context = options::initial_context,
99  communicator* dom = 0);
100 
101 template <typename data_t, template <typename> class Op>
102 void
103 comm_allreduce(void* dst, void* src, int nelems, int tag, bool fault_aware = false, int context = options::initial_context, communicator* dom = 0){
104  typedef ReduceOp<Op, data_t> op_class_type;
105  comm_allreduce(dst, src, nelems, sizeof(data_t), tag, &op_class_type::op, fault_aware, context, dom);
106 }
107 
108 void
109 comm_reduce(int root, void* dst, void* src, int nelems, int type_size, int tag,
110  reduce_fxn fxn, bool fault_aware=false, int context = options::initial_context,
111  communicator* dom = 0);
112 
113 template <typename data_t, template <typename> class Op>
114 void
115 comm_reduce(int root, void* dst, void* src, int nelems, int tag, bool fault_aware = false, int context = options::initial_context, communicator* dom = 0){
116  typedef ReduceOp<Op, data_t> op_class_type;
117  comm_reduce(root, dst, src, nelems, sizeof(data_t), tag, &op_class_type::op, fault_aware, context, dom);
118 }
119 
120 void
121 comm_barrier(int tag, bool fault_aware = false, communicator* dom = 0);
122 
123 /**
124 * The total size of the input/result buffer in bytes is nelems*type_size
125 * This always run in a fault-tolerant fashion
126 * This uses a dynamic tree structure that reconnects partners when failures are detected
127 * @param vote The vote (currently restricted to integer) from this process
128 * @param nelems The number of elements in the input and result buffer.
129 * @param tag A unique tag identifier for the collective
130 * @param fxn The function that merges vote, usually AND, OR, MAX, MIN
131 * @param context The context (i.e. initial set of failed procs)
132 */
133 void
134 comm_vote(int vote, int tag, vote_fxn fxn, int context = options::initial_context, communicator* dom = 0);
135 
136 template <template <class> class VoteOp>
137 void
138 comm_vote(int vote, int tag, int context = options::initial_context, communicator* dom = 0){
139  typedef VoteOp<int> op_class_type;
140  comm_vote(vote, tag, &op_class_type::op, context, dom);
141 }
142 
143 /**
144 * Helper function. Kill the node that is currently running.
145 * This is invoked by an application. This allows an
146 * application to die at a very, very specific point in application execution.
147 */
148 void
150 
151 /**
152 * Helper function. Kill the process that is currently running.
153 * This only kills the process - it leaves the node alive and well.
154 */
155 void
157 
158 const thread_safe_set<int>&
160 
161 const thread_safe_set<int>&
162 comm_failed_ranks(int context);
163 
164 void
165 comm_start_heartbeat(double interval);
166 
167 void
169 
170 collective_done_message::ptr
171 comm_collective_block(collective::type_t ty, int tag);
172 
173 message::ptr
174 comm_poll();
175 
176 void compute(double sec);
177 
178 void sleep(double sec);
179 
180 void sleep_until(double sec);
181 
182 /**
183  * Map a physical node location to its virtual assignment in the communicator
184  * @param node_id
185  * @return
186  */
187 int
188 comm_partner(long node_id);
189 
190 /**
191  * Every node has exactly the same notion of time - universal, global clock.
192  * Thus, if rank 0 starts and 10 minuts later rank 1 starts,
193  * even though rank 1 has only been running for 30 seconds, the time will still return
194  * 10 mins, 30 seconds.
195  * @return The current system wall-clock time in seconds.
196  * At application launch, time is zero.
197  */
198 double
199 wall_time();
200 
201 transport*
202 sumi_api();
203 
204 }
205 
206 
207 #endif // sumi_msg_api_h
208 
const thread_safe_set< int > & comm_failed_ranks()
void comm_ping(int dst, int tag, timeout_function *func)
void comm_send_payload(int dst, const message::ptr &msg)
void comm_bcast(int root, void *buffer, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
void comm_start_heartbeat(double interval)
void comm_rdma_get(int dst, const message::ptr &msg)
void comm_reduce(int root, void *dst, void *src, int nelems, int type_size, int tag, reduce_fxn fxn, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
collective_done_message::ptr comm_collective_block(collective::type_t ty, int tag)
void comm_send_header(int dst, const message::ptr &msg)
void comm_scatter(int root, void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
void comm_nvram_get(int dst, const message::ptr &msg)
int comm_nproc()
void comm_allgatherv(void *dst, void *src, int *recv_counts, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
double wall_time()
Every node has exactly the same notion of time - universal, global clock.
void comm_rdma_put(int dst, const message::ptr &msg)
void comm_vote(int vote, int tag, vote_fxn fxn, int context=options::initial_context, communicator *dom=0)
The total size of the input/result buffer in bytes is nelems*type_size. This always runs in a fault-tol...
void comm_init()
void sleep_until(double sec)
Definition: sumi.h:14
void comm_gather(int root, void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
int comm_rank()
void comm_allgather(void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
void compute(double sec)
void comm_barrier(int tag, bool fault_aware=false, communicator *dom=0)
void sleep(double sec)
void comm_kill_process()
Helper function.
endpoint_id node_id
Definition: node_address.h:20
void comm_stop_heartbeat()
void comm_kill_node()
Helper function.
transport * sumi_api()
void comm_cancel_ping(int dst, int tag)
void comm_alltoall(void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
void comm_allreduce(void *dst, void *src, int nelems, int type_size, int tag, reduce_fxn fxn, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
The total size of the input/result buffer in bytes is nelems*type_size.
message::ptr comm_poll()
void comm_finalize()
void comm_send(int dst, message::payload_type_t ev, const message::ptr &msg)
int comm_partner(long node_id)
Map a physical node location to its virtual assignment in the communicator.