Fault Tolerance Interface
interface.h
Go to the documentation of this file.
1 
39 #ifndef _FTI_INTERFACE_H
40 #define _FTI_INTERFACE_H
41 
42 #include "fti.h"
43 #include "ftiff.h"
44 
45 #include "../deps/iniparser/iniparser.h"
46 #include "../deps/iniparser/dictionary.h"
47 
48 #include "../deps/jerasure/include/galois.h"
49 #include "../deps/jerasure/include/jerasure.h"
50 
51 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed
52 # include <sion.h>
53 #endif
54 
55 #ifdef ENABLE_HDF5
56 #include "hdf5.h"
57 #include "hdf5_hl.h"
58 #endif
59 
60 #include "stage.h"
61 
62 #include <stdint.h>
63 #include "../deps/md5/md5.h"
64 
65 #define CHUNK_SIZE 131072
67 #include <fcntl.h>
68 #include <sys/mman.h>
69 #include <sys/types.h>
70 #include <sys/stat.h>
71 #include <string.h>
72 #include <stdlib.h>
73 #include <stdio.h>
74 #include <unistd.h>
75 #include <time.h>
76 #include <errno.h>
77 #include <math.h>
78 #include <limits.h>
79 #include <inttypes.h>
80 #include <dirent.h>
81 #include <stdbool.h>
82 #include <stdint.h>
83 #include <time.h>
84 #include <libgen.h>
85 
86 #ifdef LUSTRE
87 # include "lustreapi.h"
88 #endif
89 
90 /*---------------------------------------------------------------------------
91  Defines
92  ---------------------------------------------------------------------------*/
93 
/**
 * Typed malloc helper: requests room for `num` objects of `type` from malloc()
 * and yields the result cast to `type *`.
 *
 * The whole expansion is parenthesized so that the result behaves as a single
 * expression when combined with postfix operators (e.g. `talloc(int, n)[0]`).
 * The memory is uninitialized; the result is whatever malloc() returns, so it
 * can be NULL. The product `sizeof(type) * (num)` is not checked for overflow.
 */
#define talloc(type, num) ((type *)malloc(sizeof(type) * (num)))
96 
97 extern int FTI_filemetastructsize;
98 extern int FTI_dbstructsize;
99 extern int FTI_dbvarstructsize;
101 /*---------------------------------------------------------------------------
102  FTI private functions
103  ---------------------------------------------------------------------------*/
104 void FTI_PrintMeta(FTIT_execution* FTI_Exec, FTIT_topology* FTI_Topo);
105 int FTI_FloatBitFlip(float *target, int bit);
106 int FTI_DoubleBitFlip(double *target, int bit);
107 void FTI_Print(char *msg, int priority);
108 
109 int FTI_UpdateIterTime(FTIT_execution* FTI_Exec);
110 int FTI_WriteCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
111  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
112  FTIT_dataset* FTI_Data);
113 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed
114 int FTI_WriteSionlib(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
115  FTIT_topology* FTI_Topo,FTIT_dataset* FTI_Data);
116 #endif
117 int FTI_WriteMPI(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
118  FTIT_topology* FTI_Topo,FTIT_dataset* FTI_Data);
119 int FTI_WritePar(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
120  FTIT_topology* FTI_Topo,FTIT_dataset* FTI_Data);
121 int FTI_WritePosix(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
122  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
123  FTIT_dataset* FTI_Data);
124 int FTI_PostCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
125  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
126 int FTI_Listen(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
127  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt)
128  __attribute__((noreturn));
130  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
132  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int source);
133 
134 int FTI_UpdateConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
135  int restart);
136 int FTI_ReadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
137  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
138  FTIT_injection *FTI_Inje);
139 int FTI_TestConfig(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo,
140  FTIT_checkpoint* FTI_Ckpt, FTIT_execution* FTI_Exec);
141 int FTI_TestDirectories(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo);
142 int FTI_CreateDirs(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
143  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
144 int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
145  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
146  FTIT_injection *FTI_Inje);
147 
148 #ifdef ENABLE_HDF5
149  int FTI_WriteHDF5(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
150  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
151  FTIT_dataset* FTI_Data);
152  int FTI_RecoverHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt,
153  FTIT_dataset* FTI_Data);
154  int FTI_RecoverVarHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt,
155  FTIT_dataset* FTI_Data, int id);
156 #endif
157 
158 int FTI_GetChecksums(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
159  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
160  char* checksum, char* ptnerChecksum, char* rsChecksum);
162  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
163  int rank, char* checksum);
164 int FTI_LoadTmpMeta(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
165  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
166 int FTI_LoadMeta(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
167  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
168 int FTI_WriteMetadata(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
169  FTIT_topology* FTI_Topo, long* fs, long mfs, char* fnl,
170  char* checksums, int* allVarIDs, long* allVarSizes);
171 int FTI_CreateMetadata(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
172  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
173  FTIT_dataset* FTI_Data);
175  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt );
177  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt );
179  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt );
180 
181 int FTI_Local(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
182  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
183 int FTI_Ptner(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
184  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
185 int FTI_RSenc(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
186  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
187 int FTI_Flush(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
188  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int level);
189 int FTI_FlushPosix(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
190  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int level);
191 int FTI_FlushMPI(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
192  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int level);
193 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed
194 int FTI_FlushSionlib(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
195  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int level);
196 #endif
197 int FTI_Decode(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
198  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int *erased);
199 int FTI_RecoverL1(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
200  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
201 int FTI_RecoverL2(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
202  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
203 int FTI_RecoverL3(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
204  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
205 int FTI_RecoverL4(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
206  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
207 int FTI_RecoverL4Posix(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
208  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
209 int FTI_RecoverL4Mpi(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
210  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
211 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed
213  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
214 #endif
215 int FTI_CheckFile(char *fn, long fs, char* checksum);
216 int FTI_CheckErasures(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
217  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
218  int *erased);
219 int FTI_RecoverFiles(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
220  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt);
221 
222 int FTI_Checksum(FTIT_execution* FTI_Exec, FTIT_dataset* FTI_Data,
223  FTIT_configuration* FTI_Conf, char* checksum);
224 int FTI_VerifyChecksum(char* fileName, char* checksumToCmp);
225 int FTI_Try(int result, char* message);
226 void FTI_MallocMeta(FTIT_execution* FTI_Exec, FTIT_topology* FTI_Topo);
227 void FTI_FreeMeta(FTIT_execution* FTI_Exec);
228 void FTI_FreeTypesAndGroups(FTIT_execution* FTI_Exec);
229 #ifdef ENABLE_HDF5
230  void FTI_CreateComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type);
231  void FTI_CloseComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type);
232  void FTI_CreateGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group);
233  void FTI_OpenGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group);
234  void FTI_CloseGroup(FTIT_H5Group* ftiGroup, FTIT_H5Group** FTI_Group);
235 #endif
237 int FTI_InitBasicTypes(FTIT_dataset* FTI_Data);
238 int FTI_InitExecVars(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
239  FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
240  FTIT_injection* FTI_Inje);
241 int FTI_RmDir(char path[FTI_BUFS], int flag);
242 int FTI_Clean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo,
243  FTIT_checkpoint* FTI_Ckpt, int level);
244 
245 int FTI_SaveTopo(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, char *nameList);
246 int FTI_ReorderNodes(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo,
247  int *nodeList, char *nameList);
248 int FTI_BuildNodeList(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
249  FTIT_topology* FTI_Topo, int *nodeList, char *nameList);
250 int FTI_CreateComms(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
251  FTIT_topology* FTI_Topo, int *userProcList,
252  int *distProcList, int* nodeList);
253 int FTI_Topology(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
254  FTIT_topology* FTI_Topo);
255 int FTI_ArchiveL4Ckpt( FTIT_configuration* FTI_Conf, FTIT_execution *FTI_Exec, FTIT_checkpoint *FTI_Ckpt,
256  FTIT_topology *FTI_Topo );
257 void FTI_PrintStatus( FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int ID, int source );
258 
259 #endif
260 
261 // DIFFERENTIAL CHECKPOINTING
262 
263 #ifdef FTI_NOZLIB
264 extern const uint32_t crc32_tab[];
265 
266 static inline uint32_t crc32_raw(const void *buf, size_t size, uint32_t crc)
267 {
268  const uint8_t *p = (const uint8_t *)buf;
269 
270  while (size--)
271  crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
272  return (crc);
273 }
274 
/**
 * Computes a checksum over `size` bytes at `buf` by running crc32_raw() with a
 * starting value of ~0U and XOR-ing the result with ~0U.
 *
 * @param buf   Start of the data to process.
 * @param size  Number of bytes to process.
 * @return      The resulting 32-bit value.
 */
static inline uint32_t crc32(const void *buf, size_t size)
{
    return crc32_raw(buf, size, ~0U) ^ ~0U;
}
282 #endif
283 
284 typedef uintptr_t FTI_ADDRVAL;
285 typedef void* FTI_ADDRPTR;
287 int FTI_FinalizeDcp( FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec );
288 int FTI_InitDcp(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, FTIT_dataset* FTI_Data);
289 int FTI_ReceiveDataChunk(FTI_ADDRVAL* buffer_offset, FTI_ADDRVAL* buffer_size, FTIFF_dbvar* dbvar, FTIT_dataset* FTI_Data);
290 long FTI_CalcNumHashes( long chunkSize );
291 int FTI_InitBlockHashArray( FTIFF_dbvar* dbvar );
294 int FTI_GetDcpMode();
296 int FTI_HashCmp( long hashIdx, FTIFF_dbvar* dbvar );
297 int FTI_UpdateDcpChanges(FTIT_dataset* FTI_Data, FTIT_execution* FTI_Exec);
298 
int FTI_HashCmp(long hashIdx, FTIFF_dbvar *dbvar)
Checks if data block is dirty, clean or invalid.
Definition: diff-checkpoint.c:497
int FTI_SaveTopo(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, char *nameList)
It writes the topology in a file for recovery.
Definition: topo.c:55
int FTI_TestDirectories(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo)
It tests that the directories given are correct.
Definition: conf.c:411
int FTI_FloatBitFlip(float *target, int bit)
It corrupts a bit of the given float.
Definition: api.c:886
int FTI_InitBlockHashArray(FTIFF_dbvar *dbvar)
Initializes a new hash meta data structure for data chunk.
Definition: diff-checkpoint.c:242
Definition: fti.h:411
int FTI_RecoverL1(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It checks that all L1 ckpt. files are present.
Definition: postreco.c:456
int FTI_WriteCkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Creates or updates checkpoint meta data.
Definition: meta.c:564
Definition: fti.h:498
Definition: fti.h:189
int FTI_WriteMPI(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using MPI I/O.
Definition: checkpoint.c:636
int FTI_GetChecksums(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, char *checksum, char *ptnerChecksum, char *rsChecksum)
It gets the checksums from metadata.
Definition: meta.c:59
void FTI_PrintStatus(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int ID, int source)
int FTI_FlushMPI(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using MPI-I/O.
Definition: postckpt.c:793
int FTI_FlushPosix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using POSIX.
Definition: postckpt.c:689
int FTI_RecoverL4(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS.
Definition: postreco.c:853
int FTI_BuildNodeList(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
It builds the list of nodes in the current execution.
Definition: topo.c:239
Definition: fti.h:330
int FTI_RecoverL2(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L2 ckpt. files using the partner copy.
Definition: postreco.c:623
int FTI_InitGroupsAndTypes(FTIT_execution *FTI_Exec)
It mallocs memory for the metadata.
Definition: tools.c:398
int FTI_WriteRSedChecksum(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int rank, char *checksum)
It writes the RSed file checksum to metadata.
Definition: meta.c:121
int FTI_UpdateIterTime(FTIT_execution *FTI_Exec)
It updates the local and global mean iteration time.
Definition: checkpoint.c:59
int FTI_dbstructsize
Definition: tools.c:43
Definition: fti.h:365
int FTI_CheckErasures(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
It detects all the erasures for a particular level.
Definition: recover.c:101
int FTI_WriteCkpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
It writes the checkpoint data in the target file.
Definition: checkpoint.c:122
int FTI_Checksum(FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data, FTIT_configuration *FTI_Conf, char *checksum)
It calculates checksum of the checkpoint file.
Definition: tools.c:209
int FTI_InitExecVars(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
Init of the static variables.
Definition: tools.c:56
int FTI_Ptner(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It copies ckpt. files into the partner node.
Definition: postckpt.c:194
int FTI_WriteSionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using SIONlib.
Definition: checkpoint.c:749
Definition: fti.h:291
void FTI_MallocMeta(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
It mallocs memory for the metadata.
Definition: tools.c:330
int FTI_FinalizeDcp(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec)
Finalizes dCP.
Definition: diff-checkpoint.c:114
#define FTI_BUFS
Definition: fti.h:33
int FTI_RecoverL4Mpi(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using MPI-I/O.
Definition: postreco.c:1022
void FTI_Print(char *msg, int priority)
Prints FTI messages.
Definition: api.c:1575
int FTI_LoadConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
It reads and tests the configuration given.
Definition: conf.c:550
Definition: fti.h:276
int FTI_RecoverL4Posix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using POSIX.
Definition: postreco.c:890
int FTI_InitDcp(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data)
Initializes dCP.
Definition: diff-checkpoint.c:159
int FTI_RecoverL3(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L3 ckpt. files ordering the RS decoding algorithm.
Definition: postreco.c:778
int FTI_RSenc(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It performs RS encoding with the ckpt. files into the group.
Definition: postckpt.c:256
int FTI_UpdateDcpChanges(FTIT_dataset *FTI_Data, FTIT_execution *FTI_Exec)
Updates data chunk hash meta data.
Definition: diff-checkpoint.c:550
int FTI_RecoverFiles(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It decides which action to take depending on the restart level.
Definition: recover.c:170
int FTI_ReorderNodes(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
It reorders the nodes following the previous topology.
Definition: topo.c:129
int FTI_LoadMeta(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It gets the metadata to recover the data after a failure.
Definition: meta.c:289
long FTI_CalcNumHashes(long chunkSize)
Computes number of hashblocks for chunk size.
Definition: diff-checkpoint.c:472
int FTI_WritePar(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
int FTI_Local(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It returns FTI_SCES.
Definition: postckpt.c:54
int FTI_RmDir(char path[FTI_BUFS], int flag)
It erases a directory and all its files.
Definition: tools.c:706
int FTI_Clean(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It erases the previous checkpoints and their metadata.
Definition: tools.c:766
int FTI_ReadConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
It reads the configuration given in the configuration file.
Definition: conf.c:118
int FTI_RecoverL4Sionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using SIONlib.
Definition: postreco.c:1149
int FTI_WritePosix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using POSIX.
Definition: checkpoint.c:563
int FTI_TestConfig(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_execution *FTI_Exec)
It tests that the configuration given is correct.
Definition: conf.c:255
int FTI_dbvarstructsize
Definition: tools.c:44
int FTI_CollapseBlockHashArray(FTIFF_dbvar *dbvar)
Shrinks an existing hash meta data structure for data chunk.
Definition: diff-checkpoint.c:295
int FTI_HandleCkptRequest(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Handles checkpoint requests from application ranks (if head).
Definition: checkpoint.c:460
void FTI_FreeMeta(FTIT_execution *FTI_Exec)
It frees memory for the metadata.
Definition: tools.c:370
Header file for the FTI library.
Definition: fti.h:476
int FTI_CreateMetadata(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
It writes the metadata to recover the data after a failure.
Definition: meta.c:769
int FTI_HandleStageRequest(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int source)
This function asynchronously stages the local file to the PFS.
Definition: stage.c:797
void FTI_FreeTypesAndGroups(FTIT_execution *FTI_Exec)
It frees memory for the types.
Definition: tools.c:430
int FTI_ArchiveL4Ckpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_checkpoint *FTI_Ckpt, FTIT_topology *FTI_Topo)
It moves the level 4 ckpt. to the archive folder.
Definition: postckpt.c:603
int FTI_UpdateConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, int restart)
Sets the exec. ID and failure parameters in the conf. file.
Definition: conf.c:58
Definition: fti.h:448
void FTI_PrintMeta(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
int FTI_DoubleBitFlip(double *target, int bit)
It corrupts a bit of the given double.
Definition: api.c:911
int FTI_VerifyChecksum(char *fileName, char *checksumToCmp)
It compares checksum of the checkpoint file.
Definition: tools.c:247
int FTI_Flush(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS.
Definition: postckpt.c:528
Header file for the FTI File Format (FTI-FF).
int FTI_LoadL4CkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Loads relevant data from checkpoint meta data.
Definition: meta.c:429
int FTI_filemetastructsize
Definition: tools.c:42
int FTI_Try(int result, char *message)
It receives the return code of a function and prints a message.
Definition: tools.c:304
int FTI_WriteMetadata(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, long *fs, long mfs, char *fnl, char *checksums, int *allVarIDs, long *allVarSizes)
It writes the metadata to recover the data after a failure.
Definition: meta.c:660
int FTI_Decode(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
It recovers a set of ckpt. files using RS decoding.
Definition: postreco.c:55
int FTI_InitBasicTypes(FTIT_dataset *FTI_Data)
It creates the basic datatypes and the dataset array.
Definition: tools.c:671
int FTI_PostCkpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Decides which action to start depending on the ckpt. level.
Definition: checkpoint.c:267
int FTI_CreateComms(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *userProcList, int *distProcList, int *nodeList)
It builds the list of nodes in the current execution.
Definition: topo.c:318
unsigned short dcpBLK_t
unsigned short (0 - 65535).
Definition: ftiff.h:81
int FTI_ExpandBlockHashArray(FTIFF_dbvar *dbvar)
Expands an existing hash meta data structure for data chunk.
Definition: diff-checkpoint.c:387
void * FTI_ADDRPTR
Definition: interface.h:285
int FTI_GetDcpMode()
Returns the dCP mode.
Definition: diff-checkpoint.c:224
int FTI_Listen(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) __attribute__((noreturn))
It listens for checkpoint notifications.
Definition: checkpoint.c:380
dcpBLK_t FTI_GetDiffBlockSize()
Returns the dCP block size.
Definition: diff-checkpoint.c:214
uintptr_t FTI_ADDRVAL
Definition: interface.h:284
int FTI_LoadCkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Loads relevant data from checkpoint meta data.
Definition: meta.c:486
int FTI_CheckFile(char *fn, long fs, char *checksum)
It checks if a file exists and that its size is 'correct'.
Definition: recover.c:54
int FTI_ReceiveDataChunk(FTI_ADDRVAL *buffer_offset, FTI_ADDRVAL *buffer_size, FTIFF_dbvar *dbvar, FTIT_dataset *FTI_Data)
Returns pointer and size of buffer to write during checkpoint.
Definition: diff-checkpoint.c:622
int FTI_CreateDirs(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It creates the directories required for current execution.
Definition: conf.c:466
int FTI_LoadTmpMeta(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It gets the temporary metadata.
Definition: meta.c:205
int FTI_Topology(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
It builds and saves the topology of the current execution.
Definition: topo.c:376
int FTI_FlushSionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using SIONlib.
Definition: postckpt.c:958