Fault Tolerance Interface
|
Header file for the FTI library private functions. More...
#include "fti.h"
#include "ftiff.h"
#include "../deps/iniparser/iniparser.h"
#include "../deps/iniparser/dictionary.h"
#include "../deps/jerasure/include/galois.h"
#include "../deps/jerasure/include/jerasure.h"
#include <sion.h>
#include "stage.h"
#include <stdint.h>
#include "../deps/md5/md5.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <math.h>
#include <limits.h>
#include <inttypes.h>
#include <dirent.h>
#include <stdbool.h>
#include <libgen.h>
Go to the source code of this file.
Macros | |
#define | CHUNK_SIZE 131072 |
#define | talloc(type, num) (type *)malloc(sizeof(type) * (num)) |
Typedefs | |
typedef uintptr_t | FTI_ADDRVAL |
typedef void * | FTI_ADDRPTR |
Functions | |
void | FTI_PrintMeta (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo) |
int | FTI_FloatBitFlip (float *target, int bit) |
It corrupts a bit of the given float. More... | |
int | FTI_DoubleBitFlip (double *target, int bit) |
It corrupts a bit of the given double. More... | |
void | FTI_Print (char *msg, int priority) |
Prints FTI messages. More... | |
int | FTI_UpdateIterTime (FTIT_execution *FTI_Exec) |
It updates the local and global mean iteration time. More... | |
int | FTI_WriteCkpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data) |
It writes the checkpoint data in the target file. More... | |
int | FTI_WriteSionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data) |
Writes ckpt to PFS using SIONlib. More... | |
int | FTI_WriteMPI (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data) |
Writes ckpt to PFS using MPI I/O. More... | |
int | FTI_WritePar (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data) |
int | FTI_WritePosix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data) |
Writes ckpt to PFS using POSIX. More... | |
int | FTI_PostCkpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
Decides which action to start depending on the ckpt. level. More... | |
int | FTI_Listen (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) __attribute__((noreturn)) |
It listens for checkpoint notifications. More... | |
int | FTI_HandleCkptRequest (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
handles checkpoint requests from application ranks (if head). More... | |
int | FTI_HandleStageRequest (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int source) |
This function asynchronously stages the local file to the PFS. More... | |
int | FTI_UpdateConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, int restart) |
Sets the exec. ID and failure parameters in the conf. file. More... | |
int | FTI_ReadConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje) |
It reads the configuration given in the configuration file. More... | |
int | FTI_TestConfig (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_execution *FTI_Exec) |
It tests that the configuration given is correct. More... | |
int | FTI_TestDirectories (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo) |
It tests that the directories given are correct. More... | |
int | FTI_CreateDirs (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It creates the directories required for current execution. More... | |
int | FTI_LoadConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje) |
It reads and tests the configuration given. More... | |
int | FTI_GetChecksums (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, char *checksum, char *ptnerChecksum, char *rsChecksum) |
It gets the checksums from metadata. More... | |
int | FTI_WriteRSedChecksum (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int rank, char *checksum) |
It writes the RSed file checksum to metadata. More... | |
int | FTI_LoadTmpMeta (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It gets the temporary metadata. More... | |
int | FTI_LoadMeta (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It gets the metadata to recover the data after a failure. More... | |
int | FTI_WriteMetadata (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, long *fs, long mfs, char *fnl, char *checksums, int *allVarIDs, long *allVarSizes) |
It writes the metadata to recover the data after a failure. More... | |
int | FTI_CreateMetadata (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data) |
It writes the metadata to recover the data after a failure. More... | |
int | FTI_WriteCkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
Creates or updates checkpoint meta data. More... | |
int | FTI_LoadCkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
Loads relevant data from checkpoint meta data. More... | |
int | FTI_LoadL4CkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
Loads relevant data from checkpoint meta data. More... | |
int | FTI_Local (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It returns FTI_SCES. More... | |
int | FTI_Ptner (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It copies ckpt. files in to the partner node. More... | |
int | FTI_RSenc (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It performs RS encoding with the ckpt. files in to the group. More... | |
int | FTI_Flush (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level) |
It flushes the local ckpt. files in to the PFS. More... | |
int | FTI_FlushPosix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level) |
It flushes the local ckpt. files in to the PFS using POSIX. More... | |
int | FTI_FlushMPI (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level) |
It flushes the local ckpt. files in to the PFS using MPI-I/O. More... | |
int | FTI_FlushSionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level) |
It flushes the local ckpt. files in to the PFS using SIONlib. More... | |
int | FTI_Decode (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased) |
It recovers a set of ckpt. files using RS decoding. More... | |
int | FTI_RecoverL1 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It checks that all L1 ckpt. files are present. More... | |
int | FTI_RecoverL2 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L2 ckpt. files using the partner copy. More... | |
int | FTI_RecoverL3 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L3 ckpt. files ordering the RS decoding algorithm. More... | |
int | FTI_RecoverL4 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L4 ckpt. files from the PFS. More... | |
int | FTI_RecoverL4Posix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L4 ckpt. files from the PFS using POSIX. More... | |
int | FTI_RecoverL4Mpi (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L4 ckpt. files from the PFS using MPI-I/O. More... | |
int | FTI_RecoverL4Sionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It recovers L4 ckpt. files from the PFS using SIONlib. More... | |
int | FTI_CheckFile (char *fn, long fs, char *checksum) |
It checks if a file exists and that its size is 'correct'. More... | |
int | FTI_CheckErasures (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased) |
It detects all the erasures for a particular level. More... | |
int | FTI_RecoverFiles (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) |
It decides which action to take depending on the restart level. More... | |
int | FTI_Checksum (FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data, FTIT_configuration *FTI_Conf, char *checksum) |
It calculates checksum of the checkpoint file. More... | |
int | FTI_VerifyChecksum (char *fileName, char *checksumToCmp) |
It compares checksum of the checkpoint file. More... | |
int | FTI_Try (int result, char *message) |
It receives the return code of a function and prints a message. More... | |
void | FTI_MallocMeta (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo) |
It mallocs memory for the metadata. More... | |
void | FTI_FreeMeta (FTIT_execution *FTI_Exec) |
It frees memory for the metadata. More... | |
void | FTI_FreeTypesAndGroups (FTIT_execution *FTI_Exec) |
It frees memory for the types. More... | |
int | FTI_InitGroupsAndTypes (FTIT_execution *FTI_Exec) |
It mallocs memory for the metadata. More... | |
int | FTI_InitBasicTypes (FTIT_dataset *FTI_Data) |
It creates the basic datatypes and the dataset array. More... | |
int | FTI_InitExecVars (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje) |
Init of the static variables. More... | |
int | FTI_RmDir (char path[FTI_BUFS], int flag) |
It erases a directory and all its files. More... | |
int | FTI_Clean (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level) |
It erases the previous checkpoints and their metadata. More... | |
int | FTI_SaveTopo (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, char *nameList) |
It writes the topology in a file for recovery. More... | |
int | FTI_ReorderNodes (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, int *nodeList, char *nameList) |
It reorders the nodes following the previous topology. More... | |
int | FTI_BuildNodeList (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *nodeList, char *nameList) |
It builds the list of nodes in the current execution. More... | |
int | FTI_CreateComms (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *userProcList, int *distProcList, int *nodeList) |
It builds the list of nodes in the current execution. More... | |
int | FTI_Topology (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo) |
It builds and saves the topology of the current execution. More... | |
int | FTI_ArchiveL4Ckpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_checkpoint *FTI_Ckpt, FTIT_topology *FTI_Topo) |
It moves the level 4 ckpt. to the archive folder. More... | |
void | FTI_PrintStatus (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int ID, int source) |
int | FTI_FinalizeDcp (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec) |
Finalizes dCP. More... | |
int | FTI_InitDcp (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data) |
Initializes dCP. More... | |
int | FTI_ReceiveDataChunk (FTI_ADDRVAL *buffer_offset, FTI_ADDRVAL *buffer_size, FTIFF_dbvar *dbvar, FTIT_dataset *FTI_Data) |
Returns pointer and size of buffer to write during checkpoint. More... | |
long | FTI_CalcNumHashes (long chunkSize) |
Computes number of hashblocks for chunk size. More... | |
int | FTI_InitBlockHashArray (FTIFF_dbvar *dbvar) |
Initializes a new hash meta data structure for data chunk. More... | |
int | FTI_ExpandBlockHashArray (FTIFF_dbvar *dbvar) |
Expands an existing hash meta data structure for data chunk. More... | |
int | FTI_CollapseBlockHashArray (FTIFF_dbvar *dbvar) |
Shrinks an existing hash meta data structure for data chunk. More... | |
int | FTI_GetDcpMode () |
Returns the dCP mode. More... | |
dcpBLK_t | FTI_GetDiffBlockSize () |
Returns the dCP block size. More... | |
int | FTI_HashCmp (long hashIdx, FTIFF_dbvar *dbvar) |
Checks if data block is dirty, clean or invalid. More... | |
int | FTI_UpdateDcpChanges (FTIT_dataset *FTI_Data, FTIT_execution *FTI_Exec) |
Updates data chunk hash meta data. More... | |
Variables | |
int | FTI_filemetastructsize |
int | FTI_dbstructsize |
int | FTI_dbvarstructsize |
Header file for the FTI library private functions.
Copyright (c) 2017 Leonardo A. Bautista-Gomez All rights reserved
FTI - A multi-level checkpointing library for C/C++/Fortran applications
Revision 1.0 : Fault Tolerance Interface (FTI)
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CHUNK_SIZE 131072 |
MD5 algorithm chunk size.
#define talloc | ( | type, | |
num | |||
) | (type *)malloc(sizeof(type) * (num)) |
Malloc macro.
typedef void* FTI_ADDRPTR |
void ptr type
typedef uintptr_t FTI_ADDRVAL |
for ptr manipulation
int FTI_ArchiveL4Ckpt | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_topology * | FTI_Topo | ||
) |
It moves the level 4 ckpt. to the archive folder.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function is called if keepL4Ckpt is enabled in the configuration file. It moves the old level 4 ckpt file to the archive folder before the l4 folder in the global directory is deleted.
int FTI_BuildNodeList | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
int * | nodeList, | ||
char * | nameList | ||
) |
It builds the list of nodes in the current execution.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
nodeList | The list of the nodes to fill. |
nameList | The list of the node names to fill. |
This function makes all the processes detect which node they are located on and distributes the information globally to create a uniform mapping structure between processes and nodes.
long FTI_CalcNumHashes | ( | long | chunkSize | ) |
Computes number of hashblocks for chunk size.
chunkSize | chunk size of data chunk |
This function computes the number of hash blocks according to the set dCP block size corresponding to chunkSize.
int FTI_CheckErasures | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int * | erased | ||
) |
It detects all the erasures for a particular level.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
erased | The array of erasures to fill. |
This function detects all the erasures for L1, L2 and L3. It returns the results in the erased array. The search for erasures is done at the three levels independently of the current recovery level.
int FTI_CheckFile | ( | char * | fn, |
long | fs, | ||
char * | checksum | ||
) |
It checks if a file exists and that its size is 'correct'.
fn | The ckpt. file name to check. |
fs | The ckpt. file size to check. |
checksum | The file checksum to check. |
This function checks whether a file exists or not and if its size is the expected one.
int FTI_Checksum | ( | FTIT_execution * | FTI_Exec, |
FTIT_dataset * | FTI_Data, | ||
FTIT_configuration * | FTI_Conf, | ||
char * | checksum | ||
) |
It calculates checksum of the checkpoint file.
FTI_Exec | Execution metadata. |
FTI_Data | Dataset metadata. |
checksum | Checksum that is calculated. |
This function calculates checksum of the checkpoint file based on MD5 algorithm and saves it in checksum.
int FTI_Clean | ( | FTIT_configuration * | FTI_Conf, |
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | level | ||
) |
It erases the previous checkpoints and their metadata.
FTI_Conf | Configuration metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
level | Level of cleaning. |
This function erases previous checkpoint depending on the level of the current checkpoint. Level 5 means complete clean up. Level 6 means clean up local nodes but keep last checkpoint data and metadata in the PFS.
int FTI_CollapseBlockHashArray | ( | FTIFF_dbvar * | dbvar | ) |
Shrinks an existing hash meta data structure for data chunk.
dbvar | Data chunk metadata. |
This function re-allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.
It also updates the other members of the 'dataDiffHash' structure.
int FTI_CreateComms | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
int * | userProcList, | ||
int * | distProcList, | ||
int * | nodeList | ||
) |
It builds the list of nodes in the current execution.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
userProcList | The list of the app. processes. |
distProcList | The list of the distributed processes. |
nodeList | The list of the nodes to fill. |
This function makes all the processes detect which node they are located on and distributes the information globally to create a uniform mapping structure between processes and nodes.
int FTI_CreateDirs | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It creates the directories required for current execution.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function creates the temporary metadata, local and global directories required for the current execution.
int FTI_CreateMetadata | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_dataset * | FTI_Data | ||
) |
It writes the metadata to recover the data after a failure.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Data | Dataset metadata. |
This function gathers information about the checkpoint files in the group (name and sizes), and creates the metadata file used to recover in case of failure.
int FTI_Decode | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int * | erased | ||
) |
It recovers a set of ckpt. files using RS decoding.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
erased | The array of erasures. |
This function tries to recover the L3 ckpt. files missing using the RS decoding.
int FTI_DoubleBitFlip | ( | double * | target, |
int | bit | ||
) |
It corrupts a bit of the given double.
target | Pointer to the double to corrupt. |
bit | Position of the bit to corrupt. |
This function flips the bit of the target double.
int FTI_ExpandBlockHashArray | ( | FTIFF_dbvar * | dbvar | ) |
Expands an existing hash meta data structure for data chunk.
dbvar | Data chunk metadata. |
This function re-allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.
It also updates the other members of the 'dataDiffHash' structure.
int FTI_FinalizeDcp | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec | ||
) |
Finalizes dCP.
Function Definitions
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
This function deallocates structures used for dCP and exposes the status dcp disabled to FTI. It is also called for failures during dCP creation.
int FTI_FloatBitFlip | ( | float * | target, |
int | bit | ||
) |
It corrupts a bit of the given float.
target | Pointer to the float to corrupt. |
bit | Position of the bit to corrupt. |
This function flips the bit of the target float.
int FTI_Flush | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | level | ||
) |
It flushes the local ckpt. files in to the PFS.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
level | The level from which ckpt. files are flushed. |
This function flushes the local checkpoint files in to the PFS.
FTI_Flush is either executed by application processes during FTI_Finalize or by the heads during FTI_PostCkpt.
int FTI_FlushMPI | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | level | ||
) |
It flushes the local ckpt. files in to the PFS using MPI-I/O.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
level | The level from which ckpt. files are flushed. |
This function flushes the local checkpoint files in to the PFS.
int FTI_FlushPosix | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | level | ||
) |
It flushes the local ckpt. files in to the PFS using POSIX.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
level | The level from which ckpt. files are flushed. |
This function flushes the local checkpoint files in to the PFS.
int FTI_FlushSionlib | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | level | ||
) |
It flushes the local ckpt. files in to the PFS using SIONlib.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
level | The level from which ckpt. files are flushed. |
This function flushes the local checkpoint files in to the PFS.
void FTI_FreeMeta | ( | FTIT_execution * | FTI_Exec | ) |
It frees memory for the metadata.
FTI_Exec | Execution metadata. |
This function frees the memory used for the metadata storage.
void FTI_FreeTypesAndGroups | ( | FTIT_execution * | FTI_Exec | ) |
It frees memory for the types.
FTI_Exec | Execution metadata. |
This function frees the memory used for the type storage.
int FTI_GetChecksums | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
char * | checksum, | ||
char * | ptnerChecksum, | ||
char * | rsChecksum | ||
) |
It gets the checksums from metadata.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
checksum | Pointer to fill the checkpoint checksum. |
ptnerChecksum | Pointer to fill the ptner file checksum. |
rsChecksum | Pointer to fill the RS file checksum. |
This function reads the metadata file created during checkpointing and recovers the checkpoint checksum. If there is no RS file, rsChecksum string length is 0.
int FTI_GetDcpMode | ( | ) |
Returns the dCP mode.
dcpBLK_t FTI_GetDiffBlockSize | ( | ) |
Returns the dCP block size.
int FTI_HandleCkptRequest | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
handles checkpoint requests from application ranks (if head).
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
int FTI_HandleStageRequest | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | source | ||
) |
This function asynchronously stages the local file to the PFS.
string | 'lpath', absolute path of local file. |
string | 'rpath', absolute path of remote file. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Conf | Configuration metadata. |
integer | 'source', application rank of stage request. |
int FTI_HashCmp | ( | long | hashIdx, |
FTIFF_dbvar * | dbvar | ||
) |
Checks if data block is dirty, clean or invalid.
hashIdx | index for hash meta data in data chunk meta data. |
dbvar | Data chunk meta data. |
This function checks if data block corresponding to the hash meta data element is clean, dirty or invalid.
It returns -1 if hashIdx is out of range.
int FTI_InitBasicTypes | ( | FTIT_dataset * | FTI_Data | ) |
It creates the basic datatypes and the dataset array.
FTI_Data | Dataset metadata. |
This function creates the basic data types using FTIT_Type.
int FTI_InitBlockHashArray | ( | FTIFF_dbvar * | dbvar | ) |
Initializes a new hash meta data structure for data chunk.
dbvar | Data chunk metadata. |
This function allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.
It also initializes the other members of the 'dataDiffHash' structure.
int FTI_InitDcp | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_dataset * | FTI_Data | ||
) |
Initializes dCP.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Data | Dataset metadata. |
This function looks for environment variables set for the dCP mode and dCP block size and overwrites, if found, the values from the configuration file.
It also initializes the file local variables 'dcpEnabled', 'DCP_MODE' and 'DCP_BLOCK_SIZE'.
int FTI_InitExecVars | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_injection * | FTI_Inje | ||
) |
Init of the static variables.
This function initializes all static variables to zero.
int FTI_InitGroupsAndTypes | ( | FTIT_execution * | FTI_Exec | ) |
It mallocs memory for the metadata.
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
This function mallocs the memory used for the metadata storage.
int FTI_Listen | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It listens for checkpoint notifications.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
int FTI_LoadCkptMetaData | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
Loads relevant data from checkpoint meta data.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
int FTI_LoadConf | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_injection * | FTI_Inje | ||
) |
It reads and tests the configuration given.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Inje | Type to describe failure injections in FTI. |
This function reads the configuration file. It then tests that the configuration parameters are correct (including directories).
int FTI_LoadL4CkptMetaData | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
Loads relevant data from checkpoint meta data.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
int FTI_LoadMeta | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It gets the metadata to recover the data after a failure.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function reads the metadata file created during checkpointing and recovers the checkpoint file name, file size, partner file size and the size of the largest file in the group (for padding if necessary during decoding).
int FTI_LoadTmpMeta | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It gets the temporary metadata.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function reads the temporary metadata file created during checkpointing and recovers the checkpoint file name, file size, partner file size and the size of the largest file in the group (for padding if necessary during decoding).
int FTI_Local | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It returns FTI_SCES.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function just returns FTI_SCES to have homogeneous code.
void FTI_MallocMeta | ( | FTIT_execution * | FTI_Exec, |
FTIT_topology * | FTI_Topo | ||
) |
It mallocs memory for the metadata.
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
This function mallocs the memory used for the metadata storage.
int FTI_PostCkpt | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
Decides which action to start depending on the ckpt. level.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function launches the required action depending on the ckpt. level. It does that for each group (application process in the node) if executed by the head, or only locally if executed by an application process. The parameter pr determines if the for loops have 1 or number of App. procs. iterations. The group parameter helps determine the groupID in both cases.
void FTI_Print | ( | char * | msg, |
int | priority | ||
) |
Prints FTI messages.
msg | Message to print. |
priority | Priority of the message to be printed. |
This function prints messages depending on their priority and the verbosity level set by the user. DEBUG messages are printed by all processes with their rank. INFO messages are printed by one process. ERROR messages are printed with errno.
void FTI_PrintMeta | ( | FTIT_execution * | FTI_Exec, |
FTIT_topology * | FTI_Topo | ||
) |
void FTI_PrintStatus | ( | FTIT_execution * | FTI_Exec, |
FTIT_topology * | FTI_Topo, | ||
int | ID, | ||
int | source | ||
) |
int FTI_Ptner | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It copies ckpt. files into the partner node.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function copies the checkpoint files into the partner node. It follows a ring, where the ring size is the group size given in the FTI configuration file.
int FTI_ReadConf | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_injection * | FTI_Inje | ||
) |
It reads the configuration given in the configuration file.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Inje | Type to describe failure injections in FTI. |
This function reads the configuration given in the FTI configuration file and sets other required parameters.
int FTI_ReceiveDataChunk | ( | FTI_ADDRVAL * | buffer_addr, |
FTI_ADDRVAL * | buffer_size, | ||
FTIFF_dbvar * | dbvar, | ||
FTIT_dataset * | FTI_Data | ||
) |
Returns pointer and size of buffer to write during checkpoint.
buffer_addr | Pointer to buffer. |
buffer_size | Size of buffer. |
dbvar | Data chunk meta data. |
FTI_Data | Dataset metadata. |
This function is called repeatedly for each data chunk. If it returns 1, 'buffer_addr' holds the pointer to a memory region inside the data chunk and 'buffer_size' holds the size of the region. For dCP disabled, this region is the whole data chunk. For dCP enabled, the function returns a pointer to contiguous dirty regions until no further dirty regions are found in which case 0 is returned.
int FTI_RecoverFiles | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It decides which action to take depending on the restart level.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function launches the required action depending on the recovery level. The recovery level is detected from the checkpoint ID of the last checkpoint taken.
int FTI_RecoverL1 | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It checks that all L1 ckpt. files are present.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function detects all the erasures for L1. If there is at least one, L1 is not considered as recoverable.
int FTI_RecoverL2 | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L2 ckpt. files using the partner copy.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the L2 ckpt. files missing using the partner copy. If a ckpt. file and its copy are both missing, then we consider this checkpoint unavailable.
int FTI_RecoverL3 | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L3 ckpt. files ordering the RS decoding algorithm.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the L3 ckpt. files missing using the RS decoding. If too many files are missing in the group, then we consider this checkpoint unavailable.
int FTI_RecoverL4 | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L4 ckpt. files from the PFS.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.
int FTI_RecoverL4Mpi | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L4 ckpt. files from the PFS using MPI-I/O.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.
int FTI_RecoverL4Posix | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L4 ckpt. files from the PFS using POSIX.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.
int FTI_RecoverL4Sionlib | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It recovers L4 ckpt. files from the PFS using SIONlib.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.
int FTI_ReorderNodes | ( | FTIT_configuration * | FTI_Conf, |
FTIT_topology * | FTI_Topo, | ||
int * | nodeList, | ||
char * | nameList | ||
) |
It reorders the nodes following the previous topology.
FTI_Conf | Configuration metadata. |
FTI_Topo | Topology metadata. |
nodeList | The list of the nodes. |
nameList | The list of the node names. |
This function writes the topology of the system (List of nodes and their ID) in a topology file that will be read during recovery to detect which nodes (and therefore checkpoint files) are missing in the new topology.
int FTI_RmDir | ( | char | path[FTI_BUFS], |
int | flag | ||
) |
It erases a directory and all its files.
path | Path to the directory we want to erase. |
flag | Set to 1 to activate. |
This function erases a directory and all its files. It focusses on the checkpoint directories created by FTI so it does NOT handle recursive erasing if the given directory includes other directories.
int FTI_RSenc | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
It performs RS encoding with the ckpt. files in the group.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
This function performs the Reed-Solomon encoding for a given group. The checkpoint files are padded to the size of the largest checkpoint file in the group, plus the extra space needed to be a multiple of the block size.
int FTI_SaveTopo | ( | FTIT_configuration * | FTI_Conf, |
FTIT_topology * | FTI_Topo, | ||
char * | nameList | ||
) |
It writes the topology in a file for recovery.
FTI_Conf | Configuration metadata. |
FTI_Topo | Topology metadata. |
nameList | The list of the node names. |
This function writes the topology of the system (List of nodes and their ID) in a topology file that will be read during recovery to detect which nodes (and therefore checkpoint files) are missing in the new topology.
int FTI_TestConfig | ( | FTIT_configuration * | FTI_Conf, |
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_execution * | FTI_Exec | ||
) |
It tests that the configuration given is correct.
FTI_Conf | Configuration metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Exec | Execution metadata. |
This function tests the FTI configuration to make sure that all parameter values are correct.
int FTI_TestDirectories | ( | FTIT_configuration * | FTI_Conf, |
FTIT_topology * | FTI_Topo | ||
) |
It tests that the directories given are correct.
FTI_Conf | Configuration metadata. |
FTI_Topo | Topology metadata. |
This function tests that the directories given in the FTI configuration are correct.
int FTI_Topology | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo | ||
) |
It builds and saves the topology of the current execution.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
This function builds the topology of the system, detects and replaces missing nodes in case of recovery and creates the communicators required for FTI to work. It stores all required information in FTI_Topo.
int FTI_Try | ( | int | result, |
char * | message | ||
) |
It receives the return code of a function and prints a message.
result | Result to check. |
message | Message to print. |
This function checks the result from a function and then decides to print the message either as a debug message or as a warning.
int FTI_UpdateConf | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
int | restart | ||
) |
Sets the exec. ID and failure parameters in the conf. file.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
restart | Value to set in the conf. file (0 or 1). |
This function sets the execution ID and failure parameters in the configuration file. This is to avoid forcing the user to change these values manually in case of recovery needed. In this way, relaunching the execution in the same way as the initial time will make FTI detect that it is a restart. It also allows setting the failure parameter back to 0 at the end of a successful execution.
int FTI_UpdateDcpChanges | ( | FTIT_dataset * | FTI_Data, |
FTIT_execution * | FTI_Exec | ||
) |
Updates data chunk hash meta data.
FTI_Exec | Execution metadata. |
FTI_Data | Dataset metadata. |
This function updates the hashes of data blocks that were identified as dirty and initializes the hashes for data blocks that are invalid.
int FTI_UpdateIterTime | ( | FTIT_execution * | FTI_Exec | ) |
It updates the local and global mean iteration time.
FTI_Exec | Execution metadata. |
This function updates the local and global mean iteration time. It also recomputes the checkpoint interval in iterations and corrects the next checkpointing iteration based on the observed mean iteration duration.
int FTI_VerifyChecksum | ( | char * | fileName, |
char * | checksumToCmp | ||
) |
It compares checksum of the checkpoint file.
fileName | Filename of the checkpoint. |
checksumToCmp | Checksum to compare. |
This function calculates checksum of the checkpoint file based on MD5 algorithm. It compares calculated hash value with the one saved in the file.
int FTI_WriteCkpt | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_dataset * | FTI_Data | ||
) |
It writes the checkpoint data in the target file.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Data | Dataset metadata. |
This function checks whether the checkpoint needs to be local or remote, opens the target file, writes the checkpoint data dataset by dataset, and finally flushes and closes the checkpoint file.
int FTI_WriteCkptMetaData | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt | ||
) |
Creates or updates checkpoint meta data.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
Writes checkpoint meta data in checkpoint meta data file.
int FTI_WriteMetadata | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
long * | fs, | ||
long | mfs, | ||
char * | fnl, | ||
char * | checksums, | ||
int * | allVarIDs, | ||
long * | allVarSizes | ||
) |
It writes the metadata to recover the data after a failure.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
fs | Pointer to the list of checkpoint sizes. |
mfs | The maximum checkpoint file size. |
fnl | Pointer to the list of checkpoint names. |
checksums | Checksums array. |
allVarIDs | IDs of vars from all processes in group. |
allVarSizes | Sizes of vars from all processes in group. |
This function should be executed only by one process per group. It writes the metadata file used to recover in case of failure.
int FTI_WriteMPI | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_dataset * | FTI_Data | ||
) |
Writes ckpt to PFS using MPI I/O.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Data | Dataset metadata. |
This function takes into account that in MPI-IO the count parameters of both MPI_Type_contiguous and MPI_File_write_at are of integer type. The ckpt data is therefore split into chunks of at most (MAX_INT-1)/2 elements to form contiguous data types. Experience has shown that sizes greater than that may lead to problems.
int FTI_WritePar | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_dataset * | FTI_Data | ||
) |
int FTI_WritePosix | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
FTIT_dataset * | FTI_Data | ||
) |
Writes ckpt to PFS using POSIX.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
FTI_Data | Dataset metadata. |
int FTI_WriteRSedChecksum | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_checkpoint * | FTI_Ckpt, | ||
int | rank, | ||
char * | checksum | ||
) |
It writes the RSed file checksum to metadata.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Ckpt | Checkpoint metadata. |
rank | global rank of the process |
checksum | Pointer to the checksum. |
This function should be executed only by one process per group. It writes the RSed checksum to the metadata file.
int FTI_WriteSionlib | ( | FTIT_configuration * | FTI_Conf, |
FTIT_execution * | FTI_Exec, | ||
FTIT_topology * | FTI_Topo, | ||
FTIT_dataset * | FTI_Data | ||
) |
Writes ckpt to PFS using SIONlib.
FTI_Conf | Configuration metadata. |
FTI_Exec | Execution metadata. |
FTI_Topo | Topology metadata. |
FTI_Data | Dataset metadata. |
int FTI_dbvarstructsize |
size of FTIFF_dbvar in file
size of FTIFF_db struct in file
int FTI_filemetastructsize |
size of FTIFF_metaInfo in file
size of FTIFF_db struct in file