Fault Tolerance Interface
fti.h
Go to the documentation of this file.
1 
8 #ifndef _FTI_H
9 #define _FTI_H
10 
11 #include <mpi.h>
12 #include <stdlib.h>
13 #include <stddef.h>
14 #include <stdbool.h>
15 #include <stdint.h>
16 
17 /*---------------------------------------------------------------------------
18  Defines
19  ---------------------------------------------------------------------------*/
20 
/* Terminal text colors. Each macro expands to a string literal holding an
 * escape sequence of the form ESC '[' <parameters> 'm'; RESET carries the
 * parameter 0. */
22 #define RED "\x1B[31m"
23 
24 #define ORG "\x1B[38;5;202m"
25 
26 #define GRN "\x1B[32m"
27 
28 #define BLU "\x1B[34m"
29 
30 #define RESET "\x1B[0m"
31 
/* FTI_BUFS is the element count of the fixed-size char and int arrays in the
 * structures declared later in this header (names, directories, childrenID,
 * body, ...). */
33 #define FTI_BUFS 256
34 
/* NOTE(review): no use of FTI_WORD is visible in this listing. */
35 #define FTI_WORD 16
36 
/* Presumably status / return codes (done, success, non-success,
 * non-recoverable) -- TODO confirm against the implementation. */
37 #define FTI_DONE 1
38 
39 #define FTI_SCES 0
40 
41 #define FTI_NSCS -1
42 
43 #define FTI_NREC -2
44 
/* Presumably message severity levels (error, warning, dCP info, info, debug)
 * matched against FTIT_configuration.verbosity -- TODO confirm. Note that the
 * values are not declared in numeric order (4, 3, 5, 2, 1). */
46 #define FTI_EROR 4
47 
48 #define FTI_WARN 3
49 
50 #define FTI_IDCP 5
51 
52 #define FTI_INFO 2
53 
54 #define FTI_DBUG 1
55 
/* Seven consecutive integer codes starting at FTI_BASE (990..996).
 * NOTE(review): presumably tokens exchanged between application and head
 * processes (checkpoint / XOR / RS-encoding / PFS work, end, reject) --
 * inferred from the names only; confirm. */
57 #define FTI_BASE 990
58 
59 #define FTI_CKTW 991
60 
61 #define FTI_XORW 992
62 
63 #define FTI_RSEW 993
64 
65 #define FTI_PFSW 994
66 
67 #define FTI_ENDW 995
68 
69 #define FTI_REJW 996
70 
/* I/O mode identifiers (1001..1003); further FTI_IO_* identifiers follow
 * below. Presumably the values stored in FTIT_configuration.ioMode --
 * TODO confirm. */
71 #define FTI_IO_POSIX 1001
72 
73 #define FTI_IO_MPI 1002
74 
75 #define FTI_IO_FTIFF 1003
76 
/* FTI_SI_* codes 0x0..0x4. NOTE(review): presumably the states of a staging
 * request (not initialized, pending, active, success, failed) as reported by
 * FTI_GetStageStatus() -- inferred from the names; confirm. */
78 #define FTI_SI_FAIL 0x4
79 
80 #define FTI_SI_SCES 0x3
81 
82 #define FTI_SI_ACTV 0x2
83 
84 #define FTI_SI_PEND 0x1
85 
86 #define FTI_SI_NINI 0x0
87 
/* 512 * 1024 = 524288, evaluated as long. Presumably the upper bound on the
 * number of staging requests -- TODO confirm. */
92 #define FTI_SI_MAX_NUM (512L*1024L)
93 
/* Size in bytes of the unsigned char hash arrays declared in the FTI-FF
 * structures below (myHash, hash, myhash). */
95 #define MD5_DIGEST_LENGTH 16
96 
/* Size of the char checksum[] member that the cross-reference index places at
 * fti.h:151. 33 = 2 * MD5_DIGEST_LENGTH + 1, i.e. presumably the hex text form
 * plus a terminator -- confirm. */
97 #define MD5_DIGEST_STRING_LENGTH 33
98 
/* Optional I/O back ends. FTI_IO_SIONLIB exists only when ENABLE_SIONLIB is
 * defined; FTI_IO_HDF5 is always defined, but "hdf5.h" is included only when
 * ENABLE_HDF5 is defined. */
99 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed
100 
101 #define FTI_IO_SIONLIB 1004
102 #endif
103 
105 #define FTI_IO_HDF5 1005
106 #ifdef ENABLE_HDF5 // --> If HDF5 is installed
107  #include "hdf5.h"
108 #endif
109 
/* dCP mode identifiers: MD5 = OFFSET + 1, CRC32 = OFFSET + 2. Presumably the
 * values stored in FTIT_configuration.dcpMode -- TODO confirm. */
110 #define FTI_DCP_MODE_OFFSET 2000
111 #define FTI_DCP_MODE_MD5 2001
112 #define FTI_DCP_MODE_CRC32 2002
113 
114 #ifdef __cplusplus
115 extern "C" {
116 #endif
117 
118  /*---------------------------------------------------------------------------
119  FTI-FF types
120  ---------------------------------------------------------------------------*/
121 
/* Holds the level id.
 * NOTE(review): only FTI_L1 is visible; listing lines 127-135 (the remaining
 * enumerators) are absent from this page, although the cross-reference index
 * lists definitions at fti.h:126 through fti.h:135. */
125  typedef enum {
126  FTI_L1 = 1,
136  } FTIT_level;
137 
/* Address aliases: FTI_ADDRVAL is an unsigned integer wide enough to hold a
 * pointer value (uintptr_t); FTI_ADDRPTR is a generic object pointer. */
138  typedef uintptr_t FTI_ADDRVAL;
139  typedef void* FTI_ADDRPTR;
/* Meta information about file.
 * NOTE(review): listing line 151 is absent; the cross-reference index places
 * `char checksum[MD5_DIGEST_STRING_LENGTH]` there, i.e. before myHash. */
150  typedef struct FTIFF_metaInfo {
152  unsigned char myHash[MD5_DIGEST_LENGTH];
153  long ckptSize;
154  long fs;
155  long maxFs;
156  long ptFs;
157  long timestamp;
158  long dcpSize;
159  long dataSize;
160  } FTIFF_metaInfo;
161 
/* dCP information about data block.
 * Carries two hash fields (md5hash pointer, 32-bit bit32hash); presumably the
 * one in use depends on the configured dCP mode -- TODO confirm.
 * NOTE(review): the closing `} FTIT_DataDiffHash;` (listing line 177) is
 * absent from this page; the declaration below is therefore truncated. */
170  typedef struct FTIT_DataDiffHash
171  {
172  unsigned char* md5hash;
173  unsigned short blockSize;
174  uint32_t bit32hash;
175  bool dirty;
176  bool isValid;
179 
/* Information about protected variable in datablock.
 * NOTE(review): listing lines 192, 198 and 203 are absent; the
 * cross-reference index places `int containerid`, `long containersize` and
 * `FTIT_DataDiffHash * dataDiffHash` there. */
189  typedef struct FTIFF_dbvar {
190  int id;
191  int idx;
193  bool hascontent;
194  bool hasCkpt;
195  uintptr_t dptr;
196  uintptr_t fptr;
197  long chunksize;
199  unsigned char hash[MD5_DIGEST_LENGTH];
200  unsigned char myhash[MD5_DIGEST_LENGTH];
201  bool update;
202  long nbHashes;
204  char *cptr;
205  } FTIFF_dbvar;
206 
/* Information about current datablock.
 * `previous` and `next` point to other FTIFF_db objects, so datablocks can be
 * chained in both directions.
 * NOTE(review): listing line 219 is absent; the cross-reference index places
 * `FTIFF_dbvar * dbvars` there. */
214  typedef struct FTIFF_db {
215  int numvars;
216  long dbsize;
217  unsigned char myhash[MD5_DIGEST_LENGTH];
218  bool update;
220  struct FTIFF_db *previous;
221  struct FTIFF_db *next;
222  } FTIFF_db;
223 
224  /*---------------------------------------------------------------------------
225  New types
226  ---------------------------------------------------------------------------*/
227 
/* Staging meta info.
 * `request` is an untyped pointer; presumably it addresses nbRequest request
 * records whose type is private to the implementation -- TODO confirm. */
237  typedef struct FTIT_StageInfo {
238  int nbRequest;
239  void *request;
240  } FTIT_StageInfo;
241 
/* Double mapped as two integers to allow bit-wise operations.
 * The same storage is also viewable as two floats or eight chars.
 * NOTE(review): the array lengths assume sizeof(double) == 8 and
 * sizeof(int) == sizeof(float) == 4 -- confirm for the target platforms. */
248  typedef union FTIT_double {
249  double value;
250  float floatval[2];
251  int intval[2];
252  char byte[8];
253  } FTIT_double;
254 
/* Float mapped as integer to allow bit-wise operations.
 * The same storage is also viewable as four chars.
 * NOTE(review): assumes sizeof(float) == sizeof(int) == 4 -- confirm for the
 * target platforms. */
261  typedef union FTIT_float {
262  float value;
263  int intval;
264  char byte[4];
265  } FTIT_float;
266 
273 
/* HDF5 group descriptor. The forward typedef lets declarations that precede
 * the full definition refer to FTIT_H5Group by name.
 * h5groupID exists only when ENABLE_HDF5 is defined, so the structure layout
 * depends on that macro.
 * NOTE(review): listing line 279 is absent; the cross-reference index places
 * `int childrenNo` there (presumably the number of used entries in
 * childrenID -- confirm). */
274  typedef struct FTIT_H5Group FTIT_H5Group;
275 
276  typedef struct FTIT_H5Group {
277  int id;
278  char name[FTI_BUFS];
280  int childrenID[FTI_BUFS];
281 #ifdef ENABLE_HDF5
282  hid_t h5groupID;
283 #endif
284  } FTIT_H5Group;
285 
/* Type recognized by FTI.
 * h5datatype exists only when ENABLE_HDF5 is defined.
 * NOTE(review): listing lines 294-295 are absent; the cross-reference index
 * places `FTIT_complexType * structure` and `FTIT_H5Group * h5group` there. */
291  typedef struct FTIT_type {
292  int id;
293  int size;
296 #ifdef ENABLE_HDF5
297  hid_t h5datatype;
298 #endif
299  } FTIT_type;
300 
/* Holds info about field in complex type.
 * dimLength has room for 32 entries; presumably only the first `rank` of
 * them are meaningful -- TODO confirm. */
306  typedef struct FTIT_typeField {
307  int typeID;
308  int offset;
309  int rank;
310  int dimLength[32];
311  char name[FTI_BUFS];
312  } FTIT_typeField;
313 
/* Type that consists of other FTI types.
 * NOTE(review): listing lines 322-323 (remaining members and the closing
 * `} FTIT_complexType;`) are absent from this page; the declaration below is
 * therefore truncated. */
319  typedef struct FTIT_complexType {
320  char name[FTI_BUFS];
321  int length;
324 
/* Dataset metadata.
 * NOTE(review): listing lines 334 and 340 are absent; the cross-reference
 * index places `FTIT_type * type` and `FTIT_H5Group * h5group` there. */
330  typedef struct FTIT_dataset {
331  int id;
332  void *ptr;
333  long count;
335  int eleSize;
336  long size;
337  int rank;
338  int dimLength[32];
339  char name[FTI_BUFS];
341  } FTIT_dataset;
342 
/* Metadata for restart.
 * Every member is a pointer; the element counts and ownership are not
 * visible in this header.
 * NOTE(review): listing line 354 is absent; the cross-reference index places
 * `char * currentL4CkptFile` there. */
348  typedef struct FTIT_metadata {
349  int* exists;
350  long* maxFs;
351  long* fs;
352  long* pfs;
353  char* ckptFile;
355  int* nbVar;
356  int* varID;
357  long* varSize;
358  } FTIT_metadata;
359 
/* Execution metadata.
 * meta[] has five FTIT_metadata entries (presumably one per checkpoint level
 * plus a spare -- TODO confirm). The three MPI_Comm members are the
 * communicators named global, group and node.
 * NOTE(review): listing lines 371, 372, 379 and 395-400 are absent; the
 * cross-reference index places `int lastCkptLvel`, `int wasLastOffline`,
 * `int syncIterMax`, `FTIFF_db * firstdb`, `FTIFF_db * lastdb`,
 * `FTIFF_metaInfo FTIFFMeta`, `FTIT_type ** FTI_Type`,
 * `FTIT_H5Group ** H5groups` and `FTIT_StageInfo * stageInfo` there. */
365  typedef struct FTIT_execution {
366  char id[FTI_BUFS];
367  int ckpt;
368  int reco;
369  int ckptLvel;
370  int ckptIntv;
373  double iterTime;
374  double lastIterTime;
375  double meanIterTime;
376  double globMeanIter;
377  double totalIterTime;
378  unsigned int syncIter;
380  unsigned int minuteCnt;
381  bool hasCkpt;
382  unsigned int ckptCnt;
383  unsigned int ckptIcnt;
384  unsigned int ckptID;
385  unsigned int ckptNext;
386  unsigned int ckptLast;
387  long ckptSize;
388  unsigned int nbVar;
389  unsigned int nbVarStored;
390  unsigned int nbType;
391  int nbGroup;
392  int metaAlloc;
393  int initSCES;
394  FTIT_metadata meta[5];
401  MPI_Comm globalComm;
402  MPI_Comm groupComm;
403  MPI_Comm nodeComm;
404  } FTIT_execution;
405 
/* Configuration metadata.
 * The three stripe* members exist only when LUSTRE is defined, so the
 * structure layout depends on that macro.
 * NOTE(review): listing lines 412, 416, 418, 421, 430 and 432 are absent; the
 * cross-reference index places `bool stagingEnabled`, `int dcpBlockSize`,
 * `int saveLastCkpt`, `int transferSize`, `int generalTag` and
 * `int l3WordSize` there. The closing `} FTIT_configuration;` (listing line
 * 441) is also absent, so the declaration below is truncated. */
411  typedef struct FTIT_configuration {
413  bool dcpEnabled;
414  bool keepL4Ckpt;
415  int dcpMode;
417  char cfgFile[FTI_BUFS];
419  int verbosity;
420  int blockSize;
422 #ifdef LUSTRE
423  int stripeUnit;
424  int stripeOffset;
425  int stripeFactor;
426 #endif
427  int ckptTag;
428  int stageTag;
429  int finalTag;
431  int test;
433  int ioMode;
434  char stageDir[FTI_BUFS];
435  char localDir[FTI_BUFS];
436  char glbalDir[FTI_BUFS];
437  char metadDir[FTI_BUFS];
438  char lTmpDir[FTI_BUFS];
439  char gTmpDir[FTI_BUFS];
440  char mTmpDir[FTI_BUFS];
442 
/* Topology metadata.
 * body[] holds up to FTI_BUFS ints (presumably ranks of the processes served
 * by a head -- TODO confirm).
 * NOTE(review): listing line 462 is absent; the cross-reference index places
 * `int headRankNode` there. */
448  typedef struct FTIT_topology {
449  int nbProc;
450  int nbNodes;
451  int myRank;
452  int splitRank;
453  int nodeSize;
454  int nbHeads;
455  int nbApprocs;
456  int groupSize;
457  int sectorID;
458  int nodeID;
459  int groupID;
460  int amIaHead;
461  int headRank;
463  int nodeRank;
464  int groupRank;
465  int right;
466  int left;
467  int body[FTI_BUFS];
468  } FTIT_topology;
469 
470 
/* Checkpoint metadata.
 * NOTE(review): listing lines 488-490 are absent; the cross-reference index
 * places `int ckptDcpIntv` at 488 and `int ckptDcpCnt` at 489 (nothing is
 * listed for 490). */
476  typedef struct FTIT_checkpoint {
477  char dir[FTI_BUFS];
478  char dcpDir[FTI_BUFS];
479  char archDir[FTI_BUFS];
480  char metaDir[FTI_BUFS];
481  char dcpName[FTI_BUFS];
482  bool isDcp;
483  bool hasDcp;
484  bool hasCkpt;
485  int isInline;
486  int ckptIntv;
487  int ckptCnt;
491  } FTIT_checkpoint;
492 
/* Type to describe failure injections in FTI.
 * Presumably consumed by FTI_BitFlip(), whose brief reads "Bit-flip injection
 * following the injection instructions" -- TODO confirm. */
498  typedef struct FTIT_injection {
499  int rank;
500  int index;
501  int position;
502  int number;
503  int frequency;
504  int counter;
505  double timer;
506  } FTIT_injection;
507 
508  /*---------------------------------------------------------------------------
509  Global variables
510  ---------------------------------------------------------------------------*/
511 
/* Declarations only: the cross-reference index places the definitions of
 * FTI_COMM_WORLD and of every FTIT_type object below in api.c. */
513  extern MPI_Comm FTI_COMM_WORLD;
514 
/* Predefined FTIT_type objects. Going by the names they presumably describe
 * char, short, int, long, their unsigned counterparts, float, double and
 * long double -- TODO confirm against the initialisation in api.c. */
516  extern FTIT_type FTI_CHAR;
518  extern FTIT_type FTI_SHRT;
520  extern FTIT_type FTI_INTG;
522  extern FTIT_type FTI_LONG;
524  extern FTIT_type FTI_UCHR;
526  extern FTIT_type FTI_USHT;
528  extern FTIT_type FTI_UINT;
530  extern FTIT_type FTI_ULNG;
532  extern FTIT_type FTI_SFLT;
534  extern FTIT_type FTI_DBLE;
536  extern FTIT_type FTI_LDBE;
537 
538  /*---------------------------------------------------------------------------
539  FTI public functions
540  ---------------------------------------------------------------------------*/
541 
/* Public API. The one-line summaries below are the briefs given for each
 * function in this page's cross-reference index; return-value conventions are
 * not visible in this header. */

/* Initializes FTI. */
542  int FTI_Init(char *configFile, MPI_Comm globalComm);
/* Returns the current status of the recovery flag. */
543  int FTI_Status();
/* Initializes a data type. */
544  int FTI_InitType(FTIT_type* type, int size);
/* Initializes a complex data type. */
545  int FTI_InitComplexType(FTIT_type* newType, FTIT_complexType* typeDefinition, int length,
546  size_t size, char* name, FTIT_H5Group* h5group);
/* Adds a simple field in complex data type. */
547  void FTI_AddSimpleField(FTIT_complexType* typeDefinition, FTIT_type* ftiType,
548  size_t offset, int id, char* name);
/* Adds a field that additionally carries a rank and dimension lengths to a
 * complex data type. NOTE(review): the index brief for this function repeats
 * the FTI_AddSimpleField text verbatim -- confirm the intended wording. */
549  void FTI_AddComplexField(FTIT_complexType* typeDefinition, FTIT_type* ftiType,
550  size_t offset, int rank, int* dimLength, int id, char* name);
/* Initializes an HDF5 group. */
551  int FTI_InitGroup(FTIT_H5Group* h5group, char* name, FTIT_H5Group* parent);
/* Renames an HDF5 group. */
552  int FTI_RenameGroup(FTIT_H5Group* h5group, char* name);
/* Sets/resets the pointer and type to a protected variable. */
553  int FTI_Protect(int id, void* ptr, long count, FTIT_type type);
/* Defines the dataset. */
554  int FTI_DefineDataset(int id, int rank, int* dimLength, char* name, FTIT_H5Group* h5group);
/* Returns size saved in metadata of variable. */
555  long FTI_GetStoredSize(int id);
/* Reallocates dataset to last checkpoint size. */
556  void* FTI_Realloc(int id, void* ptr);
/* Bit-flip injection following the injection instructions. */
557  int FTI_BitFlip(int datasetID);
/* Takes the checkpoint and triggers the post-checkpoint work. */
558  int FTI_Checkpoint(int id, int level);
/* Places the FTI staging directory path into 'stageDir'. */
559  int FTI_GetStageDir( char* stageDir, int maxLen );
/* Returns status of staging request. */
560  int FTI_GetStageStatus( int ID );
/* Copies file asynchronously from 'lpath' to 'rpath'. */
561  int FTI_SendFile( char* lpath, char *rpath );
/* Loads the checkpoint data. */
562  int FTI_Recover();
/* Takes an FTI snapshot or recovers the data if it is a restart. */
563  int FTI_Snapshot();
/* Closes FTI properly on the application processes. */
564  int FTI_Finalize();
/* During the restart, recovers the given variable. */
565  int FTI_RecoverVar(int id);
566 
567 #ifdef __cplusplus
568 }
569 #endif
570 
571 #endif /* ----- #ifndef _FTI_H ----- */
int FTI_GetStageDir(char *stageDir, int maxLen)
Places the FTI staging directory path into 'stageDir'.
Definition: api.c:409
Definition: fti.h:134
int rank
Definition: fti.h:499
int reco
Definition: fti.h:368
int rank
Definition: fti.h:337
Definition: fti.h:133
int groupRank
Definition: fti.h:464
void * ptr
Definition: fti.h:332
int FTI_InitGroup(FTIT_H5Group *h5group, char *name, FTIT_H5Group *parent)
It initializes an HDF5 group.
Definition: api.c:593
int nbNodes
Definition: fti.h:450
double lastIterTime
Definition: fti.h:374
Definition: fti.h:411
int containerid
Definition: fti.h:192
bool update
Definition: fti.h:218
int ckptIntv
Definition: fti.h:486
int nodeSize
Definition: fti.h:453
int nodeRank
Definition: fti.h:463
unsigned char myHash[MD5_DIGEST_LENGTH]
Definition: fti.h:152
bool hasCkpt
Definition: fti.h:381
long dcpSize
Definition: fti.h:158
FTIT_H5Group * h5group
Definition: fti.h:295
int * nbVar
Definition: fti.h:355
Definition: fti.h:306
int nbApprocs
Definition: fti.h:455
double totalIterTime
Definition: fti.h:377
Definition: fti.h:498
int FTI_BitFlip(int datasetID)
Bit-flip injection following the injection instructions.
Definition: api.c:936
Definition: fti.h:189
struct FTIT_checkpoint FTIT_checkpoint
Checkpoint metadata.
FTIT_type FTI_UCHR
Definition: api.c:72
uintptr_t FTI_ADDRVAL
Definition: fti.h:138
FTIT_H5Group * h5group
Definition: fti.h:340
uintptr_t fptr
Definition: fti.h:196
int FTI_Protect(int id, void *ptr, long count, FTIT_type type)
It sets/resets the pointer and type to a protected variable.
Definition: api.c:663
int blockSize
Definition: fti.h:420
FTIT_StageInfo * stageInfo
Definition: fti.h:400
FTIFF_metaInfo FTIFFMeta
Definition: fti.h:397
unsigned int ckptIcnt
Definition: fti.h:383
struct FTIFF_db FTIFF_db
Information about current datablock.
struct FTIT_StageInfo FTIT_StageInfo
Staging meta info.
int eleSize
Definition: fti.h:335
uintptr_t dptr
Definition: fti.h:195
FTIT_type FTI_UINT
Definition: api.c:76
int FTI_InitType(FTIT_type *type, int size)
It initializes a data type.
Definition: api.c:201
Definition: fti.h:330
void FTI_AddSimpleField(FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int id, char *name)
It adds a simple field in complex data type.
Definition: api.c:350
int FTI_Finalize()
It closes FTI properly on the application processes.
Definition: api.c:1350
struct FTIT_H5Group FTIT_H5Group
Definition: fti.h:274
int offset
Definition: fti.h:308
int ckptIntv
Definition: fti.h:370
FTIT_type FTI_LONG
Definition: api.c:70
Definition: fti.h:248
long nbHashes
Definition: fti.h:202
unsigned int nbVar
Definition: fti.h:388
long * pfs
Definition: fti.h:352
FTIFF_dbvar * dbvars
Definition: fti.h:219
bool hasCkpt
Definition: fti.h:194
FTIFF_db * firstdb
Definition: fti.h:395
long maxFs
Definition: fti.h:155
int ckptCnt
Definition: fti.h:487
void * FTI_Realloc(int id, void *ptr)
Reallocates dataset to last checkpoint size.
Definition: api.c:837
struct FTIT_DataDiffHash FTIT_DataDiffHash
dCP information about data block.
struct FTIT_metadata FTIT_metadata
Metadata for restart.
FTIT_type ** FTI_Type
Definition: fti.h:398
int FTI_Snapshot()
Takes an FTI snapshot or recovers the data if it is a restart.
Definition: api.c:1284
Definition: fti.h:365
int nodeID
Definition: fti.h:458
unsigned int nbVarStored
Definition: fti.h:389
int id
Definition: fti.h:331
FTIT_type FTI_USHT
Definition: api.c:74
int size
Definition: fti.h:293
long timestamp
Definition: fti.h:157
Definition: fti.h:135
long ptFs
Definition: fti.h:156
int frequency
Definition: fti.h:503
int FTI_RenameGroup(FTIT_H5Group *h5group, char *name)
Renames an HDF5 group.
Definition: api.c:640
struct FTIT_type FTIT_type
Type recognized by FTI.
int numvars
Definition: fti.h:215
Definition: fti.h:132
int lastCkptLvel
Definition: fti.h:371
int ckptDcpCnt
Definition: fti.h:489
double globMeanIter
Definition: fti.h:376
FTIT_type * type
Definition: fti.h:334
int generalTag
Definition: fti.h:430
int ckptDcpIntv
Definition: fti.h:488
int rank
Definition: fti.h:309
int initSCES
Definition: fti.h:393
bool keepL4Ckpt
Definition: fti.h:414
int splitRank
Definition: fti.h:452
double iterTime
Definition: fti.h:373
char * cptr
Definition: fti.h:204
FTIT_type FTI_CHAR
Definition: api.c:64
FTIFF_db * lastdb
Definition: fti.h:396
int counter
Definition: fti.h:504
FTIT_level
Holds the level ID.
Definition: fti.h:125
int FTI_RecoverVar(int id)
During the restart, recovers the given variable.
Definition: api.c:1469
Definition: fti.h:237
long containersize
Definition: fti.h:198
int FTI_Recover()
It loads the checkpoint data.
Definition: api.c:1187
long chunksize
Definition: fti.h:197
struct FTIT_dataset FTIT_dataset
Dataset metadata.
Definition: fti.h:291
int FTI_SendFile(char *lpath, char *rpath)
Copies file asynchronously from 'lpath' to 'rpath'.
Definition: api.c:526
#define FTI_BUFS
Definition: fti.h:33
int nbGroup
Definition: fti.h:391
int right
Definition: fti.h:465
int dcpMode
Definition: fti.h:415
int ckpt
Definition: fti.h:367
Definition: fti.h:276
#define MD5_DIGEST_LENGTH
Definition: fti.h:95
FTIT_H5Group ** H5groups
Definition: fti.h:399
uint32_t bit32hash
Definition: fti.h:174
int l3WordSize
Definition: fti.h:432
Definition: fti.h:170
double timer
Definition: fti.h:505
long fs
Definition: fti.h:154
MPI_Comm FTI_COMM_WORLD
Definition: api.c:61
unsigned int nbType
Definition: fti.h:390
FTIT_type FTI_INTG
Definition: api.c:68
struct FTIT_configuration FTIT_configuration
Configuration metadata.
bool hascontent
Definition: fti.h:193
struct FTIT_injection FTIT_injection
Type to describe failure injections in FTI.
Definition: fti.h:128
long FTI_GetStoredSize(int id)
Returns size saved in metadata of variable.
Definition: api.c:800
int idx
Definition: fti.h:191
int ckptLvel
Definition: fti.h:369
int length
Definition: fti.h:321
Definition: fti.h:130
int finalTag
Definition: fti.h:429
bool hasCkpt
Definition: fti.h:484
int intval
Definition: fti.h:263
int nbHeads
Definition: fti.h:454
void * FTI_ADDRPTR
Definition: fti.h:139
long size
Definition: fti.h:336
Definition: fti.h:126
MPI_Comm nodeComm
Definition: fti.h:403
Definition: fti.h:348
bool dcpEnabled
Definition: fti.h:413
int left
Definition: fti.h:466
int * varID
Definition: fti.h:356
unsigned int ckptID
Definition: fti.h:384
int nbRequest
Definition: fti.h:238
int nbProc
Definition: fti.h:449
Definition: fti.h:129
int id
Definition: fti.h:190
FTIT_DataDiffHash * dataDiffHash
Definition: fti.h:203
int ioMode
Definition: fti.h:433
double meanIterTime
Definition: fti.h:375
int dcpBlockSize
Definition: fti.h:416
int number
Definition: fti.h:502
long * fs
Definition: fti.h:351
int headRankNode
Definition: fti.h:462
unsigned int ckptNext
Definition: fti.h:385
struct FTIT_execution FTIT_execution
Execution metadata.
int groupID
Definition: fti.h:459
int ckptTag
Definition: fti.h:427
unsigned int ckptCnt
Definition: fti.h:382
struct FTIFF_metaInfo FTIFF_metaInfo
Meta Information about file.
int FTI_Init(char *configFile, MPI_Comm globalComm)
Initializes FTI.
Definition: api.c:101
Definition: fti.h:319
bool hasDcp
Definition: fti.h:483
long ckptSize
Definition: fti.h:153
bool update
Definition: fti.h:201
int metaAlloc
Definition: fti.h:392
int saveLastCkpt
Definition: fti.h:418
FTIT_type FTI_SHRT
Definition: api.c:66
void * request
Definition: fti.h:239
Definition: fti.h:150
MPI_Comm groupComm
Definition: fti.h:402
long * maxFs
Definition: fti.h:350
int * exists
Definition: fti.h:349
int headRank
Definition: fti.h:461
int sectorID
Definition: fti.h:457
union FTIT_double FTIT_double
Double mapped as two integers to allow bit-wise operations.
int test
Definition: fti.h:431
long dataSize
Definition: fti.h:159
Definition: fti.h:476
char * ckptFile
Definition: fti.h:353
struct FTIT_typeField FTIT_typeField
Holds info about field in complex type.
long * varSize
Definition: fti.h:357
unsigned int syncIter
Definition: fti.h:378
int FTI_DefineDataset(int id, int rank, int *dimLength, char *name, FTIT_H5Group *h5group)
Defines the dataset.
Definition: api.c:729
int groupSize
Definition: fti.h:456
unsigned char * md5hash
Definition: fti.h:172
struct FTIT_topology FTIT_topology
Topology metadata.
int FTI_InitComplexType(FTIT_type *newType, FTIT_complexType *typeDefinition, int length, size_t size, char *name, FTIT_H5Group *h5group)
It initializes a complex data type.
Definition: api.c:269
int stageTag
Definition: fti.h:428
int amIaHead
Definition: fti.h:460
MPI_Comm globalComm
Definition: fti.h:401
struct FTIFF_db * next
Definition: fti.h:221
int verbosity
Definition: fti.h:419
unsigned short blockSize
Definition: fti.h:173
long count
Definition: fti.h:333
FTIT_type FTI_SFLT
Definition: api.c:80
struct FTIFF_dbvar FTIFF_dbvar
Information about protected variable in datablock.
union FTIT_float FTIT_float
Float mapped as integer to allow bit-wise operations.
Definition: fti.h:448
Definition: fti.h:214
bool isValid
Definition: fti.h:176
bool isDcp
Definition: fti.h:482
#define MD5_DIGEST_STRING_LENGTH
Definition: fti.h:97
int id
Definition: fti.h:277
bool stagingEnabled
Definition: fti.h:412
int childrenNo
Definition: fti.h:279
int FTI_GetStageStatus(int ID)
Returns status of staging request.
Definition: api.c:467
struct FTIFF_db * previous
Definition: fti.h:220
Definition: fti.h:131
char checksum[MD5_DIGEST_STRING_LENGTH]
Definition: fti.h:151
void FTI_AddComplexField(FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int rank, int *dimLength, int id, char *name)
It adds a complex field (one with a rank and dimension lengths) in complex data type.
Definition: api.c:381
unsigned int ckptLast
Definition: fti.h:386
int syncIterMax
Definition: fti.h:379
bool dirty
Definition: fti.h:175
int myRank
Definition: fti.h:451
int index
Definition: fti.h:500
FTIT_type FTI_ULNG
Definition: api.c:78
long dbsize
Definition: fti.h:216
Definition: fti.h:127
FTIT_complexType * structure
Definition: fti.h:294
int FTI_Status()
It returns the current status of the recovery flag.
Definition: api.c:183
double value
Definition: fti.h:249
int isInline
Definition: fti.h:485
int position
Definition: fti.h:501
unsigned int minuteCnt
Definition: fti.h:380
char * currentL4CkptFile
Definition: fti.h:354
FTIT_type FTI_DBLE
Definition: api.c:82
FTIT_type FTI_LDBE
Definition: api.c:84
int FTI_Checkpoint(int id, int level)
It takes the checkpoint and triggers the post-ckpt. work.
Definition: api.c:994
float value
Definition: fti.h:262
long ckptSize
Definition: fti.h:387
int wasLastOffline
Definition: fti.h:372
int typeID
Definition: fti.h:307
Definition: fti.h:261
int transferSize
Definition: fti.h:421
struct FTIT_complexType FTIT_complexType
Type that consists of other FTI types.
Definition: fti.h:272
int id
Definition: fti.h:292