39 #ifndef _FTI_INTERFACE_H 40 #define _FTI_INTERFACE_H 45 #include "../deps/iniparser/iniparser.h" 46 #include "../deps/iniparser/dictionary.h" 48 #include "../deps/jerasure/include/galois.h" 49 #include "../deps/jerasure/include/jerasure.h" 51 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed 63 #include "../deps/md5/md5.h" 65 #define CHUNK_SIZE 131072 69 #include <sys/types.h> 87 # include "lustreapi.h" 95 #define talloc(type, num) (type *)malloc(sizeof(type) * (num)) 113 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed 128 __attribute__((noreturn));
160 char* checksum,
char* ptnerChecksum,
char* rsChecksum);
163 int rank,
char* checksum);
170 char* checksums,
int* allVarIDs,
long* allVarSizes);
193 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed 211 #ifdef ENABLE_SIONLIB // --> If SIONlib is installed 225 int FTI_Try(
int result,
char* message);
247 int *nodeList,
char *nameList);
252 int *distProcList,
int* nodeList);
264 extern const uint32_t crc32_tab[];
266 static inline uint32_t crc32_raw(
const void *buf,
size_t size, uint32_t crc)
268 const uint8_t *p = (
const uint8_t *)buf;
271 crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
275 static inline uint32_t crc32(
const void *buf,
size_t size)
279 crc = crc32_raw(buf, size, ~0U);
int FTI_HashCmp(long hashIdx, FTIFF_dbvar *dbvar)
Checks if data block is dirty, clean or invalid.
Definition: diff-checkpoint.c:497
int FTI_SaveTopo(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, char *nameList)
It writes the topology in a file for recovery.
Definition: topo.c:55
int FTI_TestDirectories(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo)
It tests that the given directories are correct.
Definition: conf.c:411
int FTI_FloatBitFlip(float *target, int bit)
It corrupts a bit of the given float.
Definition: api.c:886
int FTI_InitBlockHashArray(FTIFF_dbvar *dbvar)
Initializes a new hash meta data structure for data chunk.
Definition: diff-checkpoint.c:242
int FTI_RecoverL1(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It checks that all L1 ckpt. files are present.
Definition: postreco.c:456
int FTI_WriteCkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Creates or updates checkpoint meta data.
Definition: meta.c:564
int FTI_WriteMPI(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using MPI I/O.
Definition: checkpoint.c:636
int FTI_GetChecksums(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, char *checksum, char *ptnerChecksum, char *rsChecksum)
It gets the checksums from metadata.
Definition: meta.c:59
void FTI_PrintStatus(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int ID, int source)
int FTI_FlushMPI(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using MPI-I/O.
Definition: postckpt.c:793
int FTI_FlushPosix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using POSIX.
Definition: postckpt.c:689
int FTI_RecoverL4(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS.
Definition: postreco.c:853
int FTI_BuildNodeList(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
It builds the list of nodes in the current execution.
Definition: topo.c:239
int FTI_RecoverL2(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L2 ckpt. files using the partner copy.
Definition: postreco.c:623
int FTI_InitGroupsAndTypes(FTIT_execution *FTI_Exec)
It mallocs memory for the metadata.
Definition: tools.c:398
int FTI_WriteRSedChecksum(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int rank, char *checksum)
It writes the RSed file checksum to metadata.
Definition: meta.c:121
int FTI_UpdateIterTime(FTIT_execution *FTI_Exec)
It updates the local and global mean iteration time.
Definition: checkpoint.c:59
int FTI_dbstructsize
Definition: tools.c:43
int FTI_CheckErasures(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
It detects all the erasures for a particular level.
Definition: recover.c:101
int FTI_WriteCkpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
It writes the checkpoint data in the target file.
Definition: checkpoint.c:122
int FTI_Checksum(FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data, FTIT_configuration *FTI_Conf, char *checksum)
It calculates checksum of the checkpoint file.
Definition: tools.c:209
int FTI_InitExecVars(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
Init of the static variables.
Definition: tools.c:56
int FTI_Ptner(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It copies ckpt. files into the partner node.
Definition: postckpt.c:194
int FTI_WriteSionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using SIONlib.
Definition: checkpoint.c:749
void FTI_MallocMeta(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
It mallocs memory for the metadata.
Definition: tools.c:330
int FTI_FinalizeDcp(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec)
Finalizes dCP.
Definition: diff-checkpoint.c:114
#define FTI_BUFS
Definition: fti.h:33
int FTI_RecoverL4Mpi(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using MPI-I/O.
Definition: postreco.c:1022
void FTI_Print(char *msg, int priority)
Prints FTI messages.
Definition: api.c:1575
int FTI_LoadConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
It reads and tests the configuration given.
Definition: conf.c:550
int FTI_RecoverL4Posix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using POSIX.
Definition: postreco.c:890
int FTI_InitDcp(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data)
Initializes dCP.
Definition: diff-checkpoint.c:159
int FTI_RecoverL3(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L3 ckpt. files ordering the RS decoding algorithm.
Definition: postreco.c:778
int FTI_RSenc(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It performs RS encoding with the ckpt. files into the group.
Definition: postckpt.c:256
int FTI_UpdateDcpChanges(FTIT_dataset *FTI_Data, FTIT_execution *FTI_Exec)
Updates data chunk hash meta data.
Definition: diff-checkpoint.c:550
int FTI_RecoverFiles(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It decides which action to take depending on the restart level.
Definition: recover.c:170
int FTI_ReorderNodes(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
It reorders the nodes following the previous topology.
Definition: topo.c:129
int FTI_LoadMeta(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It gets the metadata to recover the data after a failure.
Definition: meta.c:289
long FTI_CalcNumHashes(long chunkSize)
Computes number of hashblocks for chunk size.
Definition: diff-checkpoint.c:472
int FTI_WritePar(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
int FTI_Local(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It returns FTI_SCES.
Definition: postckpt.c:54
int FTI_RmDir(char path[FTI_BUFS], int flag)
It erases a directory and all its files.
Definition: tools.c:706
int FTI_Clean(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It erases the previous checkpoints and their metadata.
Definition: tools.c:766
int FTI_ReadConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
It reads the configuration given in the configuration file.
Definition: conf.c:118
int FTI_RecoverL4Sionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It recovers L4 ckpt. files from the PFS using SIONlib.
Definition: postreco.c:1149
int FTI_WritePosix(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
Writes ckpt to PFS using POSIX.
Definition: checkpoint.c:563
int FTI_TestConfig(FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_execution *FTI_Exec)
It tests that the configuration given is correct.
Definition: conf.c:255
int FTI_dbvarstructsize
Definition: tools.c:44
int FTI_CollapseBlockHashArray(FTIFF_dbvar *dbvar)
Shrinks an existing hash meta data structure for data chunk.
Definition: diff-checkpoint.c:295
int FTI_HandleCkptRequest(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Handles checkpoint requests from application ranks (if head).
Definition: checkpoint.c:460
void FTI_FreeMeta(FTIT_execution *FTI_Exec)
It frees memory for the metadata.
Definition: tools.c:370
Header file for the FTI library.
int FTI_CreateMetadata(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
It writes the metadata to recover the data after a failure.
Definition: meta.c:769
int FTI_HandleStageRequest(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int source)
This function asynchronously stages the local file to the PFS.
Definition: stage.c:797
void FTI_FreeTypesAndGroups(FTIT_execution *FTI_Exec)
It frees memory for the types.
Definition: tools.c:430
int FTI_ArchiveL4Ckpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_checkpoint *FTI_Ckpt, FTIT_topology *FTI_Topo)
It moves the level 4 ckpt. to the archive folder.
Definition: postckpt.c:603
int FTI_UpdateConf(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, int restart)
Sets the exec. ID and failure parameters in the conf. file.
Definition: conf.c:58
void FTI_PrintMeta(FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
int FTI_DoubleBitFlip(double *target, int bit)
It corrupts a bit of the given double.
Definition: api.c:911
int FTI_VerifyChecksum(char *fileName, char *checksumToCmp)
It compares checksum of the checkpoint file.
Definition: tools.c:247
int FTI_Flush(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS.
Definition: postckpt.c:528
Header file for the FTI File Format (FTI-FF).
int FTI_LoadL4CkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Loads relevant data from checkpoint meta data.
Definition: meta.c:429
int FTI_filemetastructsize
Definition: tools.c:42
int FTI_Try(int result, char *message)
It receives the return code of a function and prints a message.
Definition: tools.c:304
int FTI_WriteMetadata(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, long *fs, long mfs, char *fnl, char *checksums, int *allVarIDs, long *allVarSizes)
It writes the metadata to recover the data after a failure.
Definition: meta.c:660
int FTI_Decode(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
It recovers a set of ckpt. files using RS decoding.
Definition: postreco.c:55
int FTI_InitBasicTypes(FTIT_dataset *FTI_Data)
It creates the basic datatypes and the dataset array.
Definition: tools.c:671
int FTI_PostCkpt(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Decides which action to start depending on the ckpt. level.
Definition: checkpoint.c:267
int FTI_CreateComms(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *userProcList, int *distProcList, int *nodeList)
It builds the list of nodes in the current execution.
Definition: topo.c:318
unsigned short dcpBLK_t
unsigned short (0 - 65535).
Definition: ftiff.h:81
int FTI_ExpandBlockHashArray(FTIFF_dbvar *dbvar)
Expands an existing hash meta data structure for data chunk.
Definition: diff-checkpoint.c:387
void * FTI_ADDRPTR
Definition: interface.h:285
int FTI_GetDcpMode()
Returns the dCP mode.
Definition: diff-checkpoint.c:224
int FTI_Listen(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) __attribute__((noreturn))
It listens for checkpoint notifications.
Definition: checkpoint.c:380
dcpBLK_t FTI_GetDiffBlockSize()
Returns the dCP block size.
Definition: diff-checkpoint.c:214
uintptr_t FTI_ADDRVAL
Definition: interface.h:284
int FTI_LoadCkptMetaData(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
Loads relevant data from checkpoint meta data.
Definition: meta.c:486
int FTI_CheckFile(char *fn, long fs, char *checksum)
It checks if a file exists and that its size is 'correct'.
Definition: recover.c:54
int FTI_ReceiveDataChunk(FTI_ADDRVAL *buffer_offset, FTI_ADDRVAL *buffer_size, FTIFF_dbvar *dbvar, FTIT_dataset *FTI_Data)
Returns pointer and size of buffer to write during checkpoint.
Definition: diff-checkpoint.c:622
int FTI_CreateDirs(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It creates the directories required for current execution.
Definition: conf.c:466
int FTI_LoadTmpMeta(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
It gets the temporary metadata.
Definition: meta.c:205
int FTI_Topology(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
It builds and saves the topology of the current execution.
Definition: topo.c:376
int FTI_FlushSionlib(FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
It flushes the local ckpt. files into the PFS using SIONlib.
Definition: postckpt.c:958