SST/macro
Classes | Functions
sumi Namespace Reference

Classes

class  parsedumpi
 A refactored dumpi parser to read the newer binary-format dumpi trace files. More...
 
class  parsedumpi_callbacks
 Populate C-style callbacks for a libundumpi parser. More...
 
class  sumi_transport
 

Functions

void comm_init ()
 
void comm_finalize ()
 
int comm_rank ()
 
int comm_nproc ()
 
void comm_send_header (int dst, const message::ptr &msg)
 
void comm_cancel_ping (int dst, int tag)
 
void comm_ping (int dst, int tag, timeout_function *func)
 
void comm_send_payload (int dst, const message::ptr &msg)
 
void comm_send (int dst, message::payload_type_t ev, const message::ptr &msg)
 
void comm_rdma_put (int dst, const message::ptr &msg)
 
void comm_rdma_get (int dst, const message::ptr &msg)
 
void comm_nvram_get (int dst, const message::ptr &msg)
 
void comm_alltoall (void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_allgather (void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_allgatherv (void *dst, void *src, int *recv_counts, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_gather (int root, void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_scatter (int root, void *dst, void *src, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_bcast (int root, void *buffer, int nelems, int type_size, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_allreduce (void *dst, void *src, int nelems, int type_size, int tag, reduce_fxn fxn, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 The total size of the input/result buffer in bytes is nelems*type_size. More...
 
template<typename data_t , template< typename > class Op>
void comm_allreduce (void *dst, void *src, int nelems, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_reduce (int root, void *dst, void *src, int nelems, int type_size, int tag, reduce_fxn fxn, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
template<typename data_t , template< typename > class Op>
void comm_reduce (int root, void *dst, void *src, int nelems, int tag, bool fault_aware=false, int context=options::initial_context, communicator *dom=0)
 
void comm_barrier (int tag, bool fault_aware=false, communicator *dom=0)
 
void comm_vote (int vote, int tag, vote_fxn fxn, int context=options::initial_context, communicator *dom=0)
 The total size of the input/result buffer in bytes is nelems*type_size. This always runs in a fault-tolerant fashion. This uses a dynamic tree structure that reconnects partners when failures are detected. More...
 
template<template< class > class VoteOp>
void comm_vote (int vote, int tag, int context=options::initial_context, communicator *dom=0)
 
void comm_kill_node ()
 Helper function. More...
 
void comm_kill_process ()
 Helper function. More...
 
const thread_safe_set< int > & comm_failed_ranks ()
 
const thread_safe_set< int > & comm_failed_ranks (int context)
 
void comm_start_heartbeat (double interval)
 
void comm_stop_heartbeat ()
 
collective_done_message::ptr comm_collective_block (collective::type_t ty, int tag)
 
message::ptr comm_poll ()
 
void compute (double sec)
 
void sleep (double sec)
 
void sleep_until (double sec)
 
int comm_partner (long node_id)
 Map a physical node location to its virtual assignment in the communicator. More...
 
double wall_time ()
 Every node has exactly the same notion of time - universal, global clock. More...
 
transport * sumi_api ()
 

Function Documentation

void sumi::comm_allgather ( void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_allgatherv ( void *  dst,
void *  src,
int *  recv_counts,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_allreduce ( void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
reduce_fxn  fxn,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)

The total size of the input/result buffer in bytes is nelems*type_size.

Parameters
dst: Buffer for the result. Can be NULL to ignore payloads.
src: Buffer for the input. Can be NULL to ignore payloads.
nelems: The number of elements in the input and result buffer.
type_size: The size of the input type, e.g. sizeof(int), sizeof(double).
tag: A unique tag identifier for the collective.
fxn: The function that will actually perform the reduction.
fault_aware: Whether to execute in a fault-aware fashion to detect failures.
context: The context (i.e. initial set of failed procs).

Referenced by comm_allreduce().

template<typename data_t , template< typename > class Op>
void sumi::comm_allreduce ( void *  dst,
void *  src,
int  nelems,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)

Definition at line 103 of file sumi.h.

References comm_allreduce(), and comm_reduce().

Here is the call graph for this function:

void sumi::comm_alltoall ( void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_barrier ( int  tag,
bool  fault_aware = false,
communicator *  dom = 0 
)

Referenced by comm_reduce().

void sumi::comm_bcast ( int  root,
void *  buffer,
int  nelems,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_cancel_ping ( int  dst,
int  tag 
)
collective_done_message::ptr sumi::comm_collective_block ( collective::type_t  ty,
int  tag 
)

Referenced by comm_vote().

const thread_safe_set<int>& sumi::comm_failed_ranks ( )

Referenced by comm_vote().

const thread_safe_set<int>& sumi::comm_failed_ranks ( int  context)
void sumi::comm_finalize ( )
void sumi::comm_gather ( int  root,
void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_init ( )
void sumi::comm_kill_node ( )

Helper function.

Kill the node that is currently running. This is invoked by an application. This allows an application to die at a very, very specific point in application execution.

Referenced by comm_vote().

void sumi::comm_kill_process ( )

Helper function.

Kill the process that is currently running. This only kills the process - it leaves the node alive and well.

Referenced by comm_vote().

int sumi::comm_nproc ( )
void sumi::comm_nvram_get ( int  dst,
const message::ptr &  msg 
)
int sumi::comm_partner ( long  node_id)

Map a physical node location to its virtual assignment in the communicator.

Parameters
node_id
Returns

Referenced by comm_vote().

void sumi::comm_ping ( int  dst,
int  tag,
timeout_function *  func 
)
message::ptr sumi::comm_poll ( )

Referenced by comm_vote().

int sumi::comm_rank ( )
void sumi::comm_rdma_get ( int  dst,
const message::ptr &  msg 
)
void sumi::comm_rdma_put ( int  dst,
const message::ptr &  msg 
)
void sumi::comm_reduce ( int  root,
void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
reduce_fxn  fxn,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)

Referenced by comm_allreduce(), and comm_reduce().

template<typename data_t , template< typename > class Op>
void sumi::comm_reduce ( int  root,
void *  dst,
void *  src,
int  nelems,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)

Definition at line 115 of file sumi.h.

References comm_barrier(), comm_reduce(), and comm_vote().

Here is the call graph for this function:

void sumi::comm_scatter ( int  root,
void *  dst,
void *  src,
int  nelems,
int  type_size,
int  tag,
bool  fault_aware = false,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::comm_send ( int  dst,
message::payload_type_t  ev,
const message::ptr &  msg 
)
void sumi::comm_send_header ( int  dst,
const message::ptr &  msg 
)
Parameters
dst: The destination to send to.
void sumi::comm_send_payload ( int  dst,
const message::ptr &  msg 
)
void sumi::comm_start_heartbeat ( double  interval)

Referenced by comm_vote().

void sumi::comm_stop_heartbeat ( )

Referenced by comm_vote().

void sumi::comm_vote ( int  vote,
int  tag,
vote_fxn  fxn,
int  context = options::initial_context,
communicator *  dom = 0 
)

The total size of the input/result buffer in bytes is nelems*type_size. This always runs in a fault-tolerant fashion. This uses a dynamic tree structure that reconnects partners when failures are detected.

Parameters
vote: The vote (currently restricted to integer) from this process.
nelems: The number of elements in the input and result buffer.
tag: A unique tag identifier for the collective.
fxn: The function that merges votes, usually AND, OR, MAX, MIN.
context: The context (i.e. initial set of failed procs).

Referenced by comm_reduce(), and comm_vote().

template<template< class > class VoteOp>
void sumi::comm_vote ( int  vote,
int  tag,
int  context = options::initial_context,
communicator *  dom = 0 
)
void sumi::compute ( double  sec)
void sumi::sleep ( double  sec)
void sumi::sleep_until ( double  sec)

Referenced by comm_vote().

transport* sumi::sumi_api ( )

Referenced by comm_vote().

double sumi::wall_time ( )

Every node has exactly the same notion of time - universal, global clock.

Thus, if rank 0 starts and 10 minutes later rank 1 starts, then even though rank 1 has only been running for 30 seconds, the time returned will still be 10 minutes, 30 seconds.

Returns
The current system wall-clock time in seconds. At application launch, time is zero.

Referenced by comm_vote().