Fault Tolerance Interface
interface.h File Reference

Header file for the FTI library private functions. More...

#include "fti.h"
#include "ftiff.h"
#include "../deps/iniparser/iniparser.h"
#include "../deps/iniparser/dictionary.h"
#include "../deps/jerasure/include/galois.h"
#include "../deps/jerasure/include/jerasure.h"
#include <sion.h>
#include "stage.h"
#include <stdint.h>
#include "../deps/md5/md5.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <math.h>
#include <limits.h>
#include <inttypes.h>
#include <dirent.h>
#include <stdbool.h>
#include <libgen.h>
Include dependency graph for interface.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define CHUNK_SIZE   131072
 
#define talloc(type, num)   (type *)malloc(sizeof(type) * (num))
 

Typedefs

typedef uintptr_t FTI_ADDRVAL
 
typedef void * FTI_ADDRPTR
 

Functions

void FTI_PrintMeta (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
 
int FTI_FloatBitFlip (float *target, int bit)
 It corrupts a bit of the given float. More...
 
int FTI_DoubleBitFlip (double *target, int bit)
 It corrupts a bit of the given double. More...
 
void FTI_Print (char *msg, int priority)
 Prints FTI messages. More...
 
int FTI_UpdateIterTime (FTIT_execution *FTI_Exec)
 It updates the local and global mean iteration time. More...
 
int FTI_WriteCkpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
 It writes the checkpoint data in the target file. More...
 
int FTI_WriteSionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
 Writes ckpt to PFS using SIONlib. More...
 
int FTI_WriteMPI (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
 Writes ckpt to PFS using MPI I/O. More...
 
int FTI_WritePar (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_dataset *FTI_Data)
 
int FTI_WritePosix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
 Writes ckpt to PFS using POSIX. More...
 
int FTI_PostCkpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 Decides which action to start depending on the ckpt. level. More...
 
int FTI_Listen (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt) __attribute__((noreturn))
 It listens for checkpoint notifications. More...
 
int FTI_HandleCkptRequest (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 handles checkpoint requests from application ranks (if head). More...
 
int FTI_HandleStageRequest (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int source)
 This function asynchronously stages the local file to the PFS. More...
 
int FTI_UpdateConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, int restart)
 Sets the exec. ID and failure parameters in the conf. file. More...
 
int FTI_ReadConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
 It reads the configuration given in the configuration file. More...
 
int FTI_TestConfig (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_execution *FTI_Exec)
 It tests that the configuration given is correct. More...
 
int FTI_TestDirectories (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo)
 It tests that the directories given are correct. More...
 
int FTI_CreateDirs (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It creates the directories required for current execution. More...
 
int FTI_LoadConf (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
 It reads and tests the configuration given. More...
 
int FTI_GetChecksums (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, char *checksum, char *ptnerChecksum, char *rsChecksum)
 It gets the checksums from metadata. More...
 
int FTI_WriteRSedChecksum (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int rank, char *checksum)
 It writes the RSed file checksum to metadata. More...
 
int FTI_LoadTmpMeta (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It gets the temporary metadata. More...
 
int FTI_LoadMeta (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It gets the metadata to recover the data after a failure. More...
 
int FTI_WriteMetadata (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, long *fs, long mfs, char *fnl, char *checksums, int *allVarIDs, long *allVarSizes)
 It writes the metadata to recover the data after a failure. More...
 
int FTI_CreateMetadata (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_dataset *FTI_Data)
 It writes the metadata to recover the data after a failure. More...
 
int FTI_WriteCkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 Creates or updates checkpoint meta data. More...
 
int FTI_LoadCkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 Loads relevant data from checkpoint meta data. More...
 
int FTI_LoadL4CkptMetaData (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 Loads relevant data from checkpoint meta data. More...
 
int FTI_Local (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It returns FTI_SCES. More...
 
int FTI_Ptner (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It copies ckpt. files in to the partner node. More...
 
int FTI_RSenc (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It performs RS encoding with the ckpt. files in to the group. More...
 
int FTI_Flush (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
 It flushes the local ckpt. files in to the PFS. More...
 
int FTI_FlushPosix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
 It flushes the local ckpt. files in to the PFS using POSIX. More...
 
int FTI_FlushMPI (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
 It flushes the local ckpt. files in to the PFS using MPI-I/O. More...
 
int FTI_FlushSionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
 It flushes the local ckpt. files in to the PFS using SIONlib. More...
 
int FTI_Decode (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
 It recovers a set of ckpt. files using RS decoding. More...
 
int FTI_RecoverL1 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It checks that all L1 ckpt. files are present. More...
 
int FTI_RecoverL2 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L2 ckpt. files using the partner copy. More...
 
int FTI_RecoverL3 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L3 ckpt. files ordering the RS decoding algorithm. More...
 
int FTI_RecoverL4 (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L4 ckpt. files from the PFS. More...
 
int FTI_RecoverL4Posix (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L4 ckpt. files from the PFS using POSIX. More...
 
int FTI_RecoverL4Mpi (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L4 ckpt. files from the PFS using MPI-I/O. More...
 
int FTI_RecoverL4Sionlib (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It recovers L4 ckpt. files from the PFS using SIONlib. More...
 
int FTI_CheckFile (char *fn, long fs, char *checksum)
 It checks if a file exists and that its size is 'correct'. More...
 
int FTI_CheckErasures (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int *erased)
 It detects all the erasures for a particular level. More...
 
int FTI_RecoverFiles (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt)
 It decides which action to take depending on the restart level. More...
 
int FTI_Checksum (FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data, FTIT_configuration *FTI_Conf, char *checksum)
 It calculates checksum of the checkpoint file. More...
 
int FTI_VerifyChecksum (char *fileName, char *checksumToCmp)
 It compares checksum of the checkpoint file. More...
 
int FTI_Try (int result, char *message)
 It receives the return code of a function and prints a message. More...
 
void FTI_MallocMeta (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
 It mallocs memory for the metadata. More...
 
void FTI_FreeMeta (FTIT_execution *FTI_Exec)
 It frees memory for the metadata. More...
 
void FTI_FreeTypesAndGroups (FTIT_execution *FTI_Exec)
 It frees memory for the types. More...
 
int FTI_InitGroupsAndTypes (FTIT_execution *FTI_Exec)
 It mallocs memory for the metadata. More...
 
int FTI_InitBasicTypes (FTIT_dataset *FTI_Data)
 It creates the basic datatypes and the dataset array. More...
 
int FTI_InitExecVars (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, FTIT_injection *FTI_Inje)
 Init of the static variables. More...
 
int FTI_RmDir (char path[FTI_BUFS], int flag)
 It erases a directory and all its files. More...
 
int FTI_Clean (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, FTIT_checkpoint *FTI_Ckpt, int level)
 It erases the previous checkpoints and their metadata. More...
 
int FTI_SaveTopo (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, char *nameList)
 It writes the topology in a file for recovery. More...
 
int FTI_ReorderNodes (FTIT_configuration *FTI_Conf, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
 It reorders the nodes following the previous topology. More...
 
int FTI_BuildNodeList (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *nodeList, char *nameList)
 It builds the list of nodes in the current execution. More...
 
int FTI_CreateComms (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int *userProcList, int *distProcList, int *nodeList)
 It builds the list of nodes in the current execution. More...
 
int FTI_Topology (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo)
 It builds and saves the topology of the current execution. More...
 
int FTI_ArchiveL4Ckpt (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_checkpoint *FTI_Ckpt, FTIT_topology *FTI_Topo)
 It moves the level 4 ckpt. to the archive folder. More...
 
void FTI_PrintStatus (FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int ID, int source)
 
int FTI_FinalizeDcp (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec)
 Finalizes dCP. More...
 
int FTI_InitDcp (FTIT_configuration *FTI_Conf, FTIT_execution *FTI_Exec, FTIT_dataset *FTI_Data)
 Initializes dCP. More...
 
int FTI_ReceiveDataChunk (FTI_ADDRVAL *buffer_offset, FTI_ADDRVAL *buffer_size, FTIFF_dbvar *dbvar, FTIT_dataset *FTI_Data)
 Returns pointer and size of buffer to write during checkpoint. More...
 
long FTI_CalcNumHashes (long chunkSize)
 Computes number of hashblocks for chunk size. More...
 
int FTI_InitBlockHashArray (FTIFF_dbvar *dbvar)
 Initializes a new hash meta data structure for data chunk. More...
 
int FTI_ExpandBlockHashArray (FTIFF_dbvar *dbvar)
 Expands an existing hash meta data structure for data chunk. More...
 
int FTI_CollapseBlockHashArray (FTIFF_dbvar *dbvar)
 Shrinks an existing hash meta data structure for data chunk. More...
 
int FTI_GetDcpMode ()
 Returns the dCP mode. More...
 
dcpBLK_t FTI_GetDiffBlockSize ()
 Returns the dCP block size. More...
 
int FTI_HashCmp (long hashIdx, FTIFF_dbvar *dbvar)
 Checks if data block is dirty, clean or invalid. More...
 
int FTI_UpdateDcpChanges (FTIT_dataset *FTI_Data, FTIT_execution *FTI_Exec)
 Updates data chunk hash meta data. More...
 

Variables

int FTI_filemetastructsize
 
int FTI_dbstructsize
 
int FTI_dbvarstructsize
 

Detailed Description

Header file for the FTI library private functions.

Copyright (c) 2017 Leonardo A. Bautista-Gomez All rights reserved

FTI - A multi-level checkpointing library for C/C++/Fortran applications

Revision 1.0 : Fault Tolerance Interface (FTI)

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Date
October, 2017

Macro Definition Documentation

#define CHUNK_SIZE   131072

MD5 algorithm chunk size.

#define talloc (   type,
  num 
)    (type *)malloc(sizeof(type) * (num))

Malloc macro.

Typedef Documentation

typedef void* FTI_ADDRPTR

void ptr type

typedef uintptr_t FTI_ADDRVAL

for ptr manipulation

Function Documentation

int FTI_ArchiveL4Ckpt ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_checkpoint FTI_Ckpt,
FTIT_topology FTI_Topo 
)

It moves the level 4 ckpt. to the archive folder.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function is called if keepL4Ckpt is enabled in the configuration file. It moves the old level 4 ckpt file to the archive folder before the l4 folder in the global directory is deleted.

Here is the call graph for this function:

int FTI_BuildNodeList ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
int *  nodeList,
char *  nameList 
)

It builds the list of nodes in the current execution.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
nodeListThe list of the nodes to fill.
nameListThe list of the node names to fill.
Returns
integer FTI_SCES if successful.

This function makes all processes detect which node they are located on and distributes the information globally to create a uniform mapping structure between processes and nodes.

Here is the call graph for this function:

long FTI_CalcNumHashes ( long  chunkSize)

Computes number of hashblocks for chunk size.

Parameters
chunkSizechunk size of data chunk
Returns
long Number of hash blocks for the given chunk size.

This function computes the number of hash blocks according to the set dCP block size corresponding to chunkSize.

int FTI_CheckErasures ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int *  erased 
)

It detects all the erasures for a particular level.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
erasedThe array of erasures to fill.
Returns
integer FTI_SCES if successful.

This function detects all the erasures for L1, L2 and L3. It returns the results in the erased array. The search for erasures is done at the three levels independently of the current recovery level.

Here is the call graph for this function:

int FTI_CheckFile ( char *  fn,
long  fs,
char *  checksum 
)

It checks if a file exists and that its size is 'correct'.

Parameters
fnThe ckpt. file name to check.
fsThe ckpt. file size to check.
checksumThe file checksum to check.
Returns
integer 0 if file exists, 1 if not or wrong size.

This function checks whether a file exists or not and if its size is the expected one.

Here is the call graph for this function:

int FTI_Checksum ( FTIT_execution FTI_Exec,
FTIT_dataset FTI_Data,
FTIT_configuration FTI_Conf,
char *  checksum 
)

It calculates checksum of the checkpoint file.

Parameters
FTI_ExecExecution metadata.
FTI_DataDataset metadata.
FTI_ConfConfiguration metadata.
checksumChecksum that is calculated.
Returns
integer FTI_SCES if successful.

This function calculates checksum of the checkpoint file based on MD5 algorithm and saves it in checksum.

int FTI_Clean ( FTIT_configuration FTI_Conf,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  level 
)

It erases the previous checkpoints and their metadata.

Parameters
FTI_ConfConfiguration metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
levelLevel of cleaning.
Returns
integer FTI_SCES if successful.

This function erases previous checkpoint depending on the level of the current checkpoint. Level 5 means complete clean up. Level 6 means clean up local nodes but keep last checkpoint data and metadata in the PFS.

Here is the call graph for this function:

int FTI_CollapseBlockHashArray ( FTIFF_dbvar dbvar)

Shrinks an existing hash meta data structure for data chunk.

Parameters
dbvar Data chunk metadata.
Returns
integer FTI_SCES if successful.

This function re-allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.

It also updates the other members of the 'dataDiffHash' structure.

Here is the call graph for this function:

int FTI_CreateComms ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
int *  userProcList,
int *  distProcList,
int *  nodeList 
)

It builds the list of nodes in the current execution.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
userProcListThe list of the app. processes.
distProcListThe list of the distributed processes.
nodeListThe list of the nodes to fill.
Returns
integer FTI_SCES if successful.

This function makes all processes detect which node they are located on and distributes the information globally to create a uniform mapping structure between processes and nodes.

int FTI_CreateDirs ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It creates the directories required for current execution.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function creates the temporary metadata, local and global directories required for the current execution.

Here is the call graph for this function:

int FTI_CreateMetadata ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_dataset FTI_Data 
)

It writes the metadata to recover the data after a failure.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function gathers information about the checkpoint files in the group (name and sizes), and creates the metadata file used to recover in case of failure.

Here is the call graph for this function:

int FTI_Decode ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int *  erased 
)

It recovers a set of ckpt. files using RS decoding.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
erasedThe array of erasures.
Returns
integer FTI_SCES if successful.

This function tries to recover the L3 ckpt. files missing using the RS decoding.

Here is the call graph for this function:

int FTI_DoubleBitFlip ( double *  target,
int  bit 
)

It corrupts a bit of the given double.

Parameters
targetPointer to the double to corrupt.
bitPosition of the bit to corrupt.
Returns
integer FTI_SCES if successful.

This function flips the bit of the target double.

Here is the call graph for this function:

int FTI_ExpandBlockHashArray ( FTIFF_dbvar dbvar)

Expands an existing hash meta data structure for data chunk.

Parameters
dbvar Data chunk metadata.
Returns
integer FTI_SCES if successful.

This function re-allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.

It also updates the other members of the 'dataDiffHash' structure.

Here is the call graph for this function:

int FTI_FinalizeDcp ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec 
)

Finalizes dCP.

Function Definitions

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
Returns
integer FTI_SCES if successful.

This function deallocates structures used for dCP and exposes the status dcp disabled to FTI. It is also called for failures during dCP creation.

int FTI_FloatBitFlip ( float *  target,
int  bit 
)

It corrupts a bit of the given float.

Parameters
targetPointer to the float to corrupt.
bitPosition of the bit to corrupt.
Returns
integer FTI_SCES if successful.

This function flips the bit of the target float.

int FTI_Flush ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  level 
)

It flushes the local ckpt. files in to the PFS.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
levelThe level from which ckpt. files are flushed.
Returns
integer FTI_SCES if successful.

This function flushes the local checkpoint files in to the PFS.

FTI_Flush is either executed by application processes during FTI_Finalize or by the heads during FTI_PostCkpt.

Here is the call graph for this function:

int FTI_FlushMPI ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  level 
)

It flushes the local ckpt. files in to the PFS using MPI-I/O.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
levelThe level from which ckpt. files are flushed.
Returns
integer FTI_SCES if successful.

This function flushes the local checkpoint files in to the PFS.

Here is the call graph for this function:

int FTI_FlushPosix ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  level 
)

It flushes the local ckpt. files in to the PFS using POSIX.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
levelThe level from which ckpt. files are flushed.
Returns
integer FTI_SCES if successful.

This function flushes the local checkpoint files in to the PFS.

Here is the call graph for this function:

int FTI_FlushSionlib ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  level 
)

It flushes the local ckpt. files in to the PFS using SIONlib.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
levelThe level from which ckpt. files are flushed.
Returns
integer FTI_SCES if successful.

This function flushes the local checkpoint files in to the PFS.

Here is the call graph for this function:

void FTI_FreeMeta ( FTIT_execution FTI_Exec)

It frees memory for the metadata.

Parameters
FTI_ExecExecution metadata.

This function frees the memory used for the metadata storage.

void FTI_FreeTypesAndGroups ( FTIT_execution FTI_Exec)

It frees memory for the types.

Parameters
FTI_ExecExecution metadata.

This function frees the memory used for the type storage.

Here is the call graph for this function:

int FTI_GetChecksums ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
char *  checksum,
char *  ptnerChecksum,
char *  rsChecksum 
)

It gets the checksums from metadata.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
checksumPointer to fill the checkpoint checksum.
ptnerChecksumPointer to fill the ptner file checksum.
rsChecksumPointer to fill the RS file checksum.
Returns
integer FTI_SCES if successful.

This function reads the metadata file created during checkpointing and recovers the checkpoint checksum. If there is no RS file, rsChecksum string length is 0.

Here is the call graph for this function:

int FTI_GetDcpMode ( )

Returns the dCP mode.

dcpBLK_t FTI_GetDiffBlockSize ( )

Returns the dCP block size.

int FTI_HandleCkptRequest ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

handles checkpoint requests from application ranks (if head).

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

Here is the call graph for this function:

int FTI_HandleStageRequest ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  source 
)

This function asynchronously stages the local file to the PFS.

Parameters
string'lpath', absolute path of local file.
string'rpath', absolute path of remote file.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_ConfConfiguration metadata.
integer'source', application rank of stage request.
Returns
'FTI_SCES' on success, 'FTI_NSCS' else.
int FTI_HashCmp ( long  hashIdx,
FTIFF_dbvar dbvar 
)

Checks if data block is dirty, clean or invalid.

Parameters
hashIdxindex for hash meta data in data chunk meta data.
dbvarData chunk meta data.
Returns
integer 0 if data block is clean.
integer 1 if data block is dirty or invalid.
integer -1 if hashIdx not in range.

This function checks if data block corresponding to the hash meta data element is clean, dirty or invalid.

It returns -1 if hashIdx is out of range.

int FTI_InitBasicTypes ( FTIT_dataset FTI_Data)

It creates the basic datatypes and the dataset array.

Parameters
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function creates the basic data types using FTIT_Type.

Here is the call graph for this function:

int FTI_InitBlockHashArray ( FTIFF_dbvar dbvar)

Initializes a new hash meta data structure for data chunk.

Parameters
dbvar Data chunk metadata.
Returns
integer FTI_SCES if successful.

This function allocates memory for the 'dataDiffHash' member of the 'FTIFF_dbvar' structure and if dCP mode is MD5 also for the MD5 digest array placed in the member 'md5hash' of the 'dataDiffHash' structure.

It also initializes the other members of the 'dataDiffHash' structure.

Here is the call graph for this function:

int FTI_InitDcp ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_dataset FTI_Data 
)

Initializes dCP.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function looks for environment variables set for the dCP mode and dCP block size and overwrites, if found, the values from the configuration file.

It also initializes the file local variables 'dcpEnabled', 'DCP_MODE' and 'DCP_BLOCK_SIZE'.

Here is the call graph for this function:

int FTI_InitExecVars ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_injection FTI_Inje 
)

Init of the static variables.

Returns
integer FTI_SCES if successful.

This function initializes all static variables to zero.

int FTI_InitGroupsAndTypes ( FTIT_execution FTI_Exec)

It mallocs memory for the metadata.

Parameters
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.

This function mallocs the memory used for the metadata storage.

int FTI_Listen ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It listens for checkpoint notifications.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function listens for notifications from the application processes and takes the required actions after notification. This function is only executed by the head of the nodes and is complementary to the FTI_Checkpoint function in terms of communications.

Here is the call graph for this function:

int FTI_LoadCkptMetaData ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

Loads relevant data from checkpoint meta data.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.

Here is the call graph for this function:

int FTI_LoadConf ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_injection FTI_Inje 
)

It reads and tests the configuration given.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_InjeType to describe failure injections in FTI.
Returns
integer FTI_SCES if successful.

This function reads the configuration file. It then tests that the configuration parameters are correct (including directories).

Here is the call graph for this function:

int FTI_LoadL4CkptMetaData ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

Loads relevant data from checkpoint meta data.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.

Here is the call graph for this function:

int FTI_LoadMeta ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It gets the metadata to recover the data after a failure.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function reads the metadata file created during checkpointing and recovers the checkpoint file name, file size, partner file size and the size of the largest file in the group (for padding if necessary during decoding).

Here is the call graph for this function:

int FTI_LoadTmpMeta ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It gets the temporary metadata.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function reads the temporary metadata file created during checkpointing and recovers the checkpoint file name, file size, partner file size and the size of the largest file in the group (for padding if necessary during decoding).

Here is the call graph for this function:

int FTI_Local ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It returns FTI_SCES.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES.

This function just returns FTI_SCES to have homogeneous code.

Here is the call graph for this function:

void FTI_MallocMeta ( FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo 
)

It mallocs memory for the metadata.

Parameters
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.

This function mallocs the memory used for the metadata storage.

int FTI_PostCkpt ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

Decides which action to start depending on the ckpt. level.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function launches the required action depending on the ckpt. level. It does that for each group (application process in the node) if executed by the head, or only locally if executed by an application process. The parameter pr determines if the for loops have 1 or number of App. procs. iterations. The group parameter helps determine the groupID in both cases.

Here is the call graph for this function:

void FTI_Print ( char *  msg,
int  priority 
)

Prints FTI messages.

Parameters
msgMessage to print.
priorityPriority of the message to be printed.
Returns
void

This function prints messages depending on their priority and the verbosity level set by the user. DEBUG messages are printed by all processes with their rank. INFO messages are printed by one process. ERROR messages are printed with errno.

void FTI_PrintMeta ( FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo 
)
void FTI_PrintStatus ( FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
int  ID,
int  source 
)
int FTI_Ptner ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It copies ckpt. files into the partner node.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function copies the checkpoint files into the partner node. It follows a ring, where the ring size is the group size given in the FTI configuration file.

Here is the call graph for this function:

int FTI_ReadConf ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_injection FTI_Inje 
)

It reads the configuration given in the configuration file.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_InjeType to describe failure injections in FTI.
Returns
integer FTI_SCES if successful.

This function reads the configuration given in the FTI configuration file and sets other required parameters.

Here is the call graph for this function:

int FTI_ReceiveDataChunk ( FTI_ADDRVAL buffer_addr,
FTI_ADDRVAL buffer_size,
FTIFF_dbvar dbvar,
FTIT_dataset FTI_Data 
)

Returns pointer and size of buffer to write during checkpoint.

Parameters
buffer_addrPointer to buffer.
buffer_sizeSize of buffer.
dbvarData chunk meta data.
FTI_DataDataset metadata.
Returns
integer 1 if buffer holds data to write.
integer 0 if nothing to write.

This function is called repeatedly for each data chunk. If it returns 1, 'buffer_addr' holds the pointer to a memory region inside the data chunk and 'buffer_size' holds the size of the region. For dCP disabled, this region is the whole data chunk. For dCP enabled, the function returns a pointer to contiguous dirty regions until no further dirty regions are found in which case 0 is returned.

Here is the call graph for this function:

int FTI_RecoverFiles ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It decides which action to take depending on the restart level.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function launches the required action depending on the recovery level. The recovery level is detected from the checkpoint ID of the last checkpoint taken.

Here is the call graph for this function:

int FTI_RecoverL1 ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It checks that all L1 ckpt. files are present.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function detects all the erasures for L1. If there is at least one, L1 is not considered as recoverable.

Here is the call graph for this function:

int FTI_RecoverL2 ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L2 ckpt. files using the partner copy.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the L2 ckpt. files missing using the partner copy. If a ckpt. file and its copy are both missing, then we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_RecoverL3 ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L3 ckpt. files ordering the RS decoding algorithm.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the L3 ckpt. files missing using the RS decoding. If too many files are missing in the group, then we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_RecoverL4 ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L4 ckpt. files from the PFS.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_RecoverL4Mpi ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L4 ckpt. files from the PFS using MPI-I/O.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_RecoverL4Posix ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L4 ckpt. files from the PFS using POSIX.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_RecoverL4Sionlib ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It recovers L4 ckpt. files from the PFS using SIONlib.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function tries to recover the ckpt. files using the L4 ckpt. files stored in the PFS. If at least one ckpt. file is missing in the PFS, we consider this checkpoint unavailable.

Here is the call graph for this function:

int FTI_ReorderNodes ( FTIT_configuration FTI_Conf,
FTIT_topology FTI_Topo,
int *  nodeList,
char *  nameList 
)

It reorders the nodes following the previous topology.

Parameters
FTI_ConfConfiguration metadata.
FTI_TopoTopology metadata.
nodeListThe list of the nodes.
nameListThe list of the node names.
Returns
integer FTI_SCES if successful.

This function writes the topology of the system (List of nodes and their ID) in a topology file that will be read during recovery to detect which nodes (and therefore checkpoint files) are missing in the new topology.

Here is the call graph for this function:

int FTI_RmDir ( char  path[FTI_BUFS],
int  flag 
)

It erases a directory and all its files.

Parameters
pathPath to the directory we want to erase.
flagSet to 1 to activate.
Returns
integer FTI_SCES if successful.

This function erases a directory and all its files. It focusses on the checkpoint directories created by FTI so it does NOT handle recursive erasing if the given directory includes other directories.

Here is the call graph for this function:

int FTI_RSenc ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

It performs RS encoding with the ckpt. files in the group.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
Returns
integer FTI_SCES if successful.

This function performs the Reed-Solomon encoding for a given group. The checkpoint files are padded to the maximum size of the largest checkpoint file in the group +- the extra space to be a multiple of block size.

Here is the call graph for this function:

int FTI_SaveTopo ( FTIT_configuration FTI_Conf,
FTIT_topology FTI_Topo,
char *  nameList 
)

It writes the topology in a file for recovery.

Parameters
FTI_ConfConfiguration metadata.
FTI_TopoTopology metadata.
nameListThe list of the node names.
Returns
integer FTI_SCES if successful.

This function writes the topology of the system (List of nodes and their ID) in a topology file that will be read during recovery to detect which nodes (and therefore checkpoint files) are missing in the new topology.

Here is the call graph for this function:

int FTI_TestConfig ( FTIT_configuration FTI_Conf,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_execution FTI_Exec 
)

It tests that the configuration given is correct.

Parameters
FTI_ConfConfiguration metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_ExecExecution metadata.
Returns
integer FTI_SCES if successful.

This function tests the FTI configuration to make sure that all parameter values are correct.

Here is the call graph for this function:

int FTI_TestDirectories ( FTIT_configuration FTI_Conf,
FTIT_topology FTI_Topo 
)

It tests that the directories given are correct.

Parameters
FTI_ConfConfiguration metadata.
FTI_TopoTopology metadata.
Returns
integer FTI_SCES if successful.

This function tests that the directories given in the FTI configuration are correct.

Here is the call graph for this function:

int FTI_Topology ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo 
)

It builds and saves the topology of the current execution.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
Returns
integer FTI_SCES if successful.

This function builds the topology of the system, detects and replaces missing nodes in case of recovery and creates the communicators required for FTI to work. It stores all required information in FTI_Topo.

Here is the call graph for this function:

int FTI_Try ( int  result,
char *  message 
)

It receives the return code of a function and prints a message.

Parameters
resultResult to check.
messageMessage to print.
Returns
integer The same result as passed in parameter.

This function checks the result from a function and then decides to print the message either as a debug message or as a warning.

Here is the call graph for this function:

int FTI_UpdateConf ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
int  restart 
)

Sets the exec. ID and failure parameters in the conf. file.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
restartValue to set in the conf. file (0 or 1).
Returns
integer FTI_SCES if successful.

This function sets the execution ID and failure parameters in the configuration file. This is to avoid forcing the user to change these values manually in case of recovery needed. In this way, relaunching the execution in the same way as the initial time will make FTI detect that it is a restart. It also allows setting the failure parameter back to 0 at the end of a successful execution.

Here is the call graph for this function:

int FTI_UpdateDcpChanges ( FTIT_dataset FTI_Data,
FTIT_execution FTI_Exec 
)

Updates data chunk hash meta data.

Parameters
FTI_ExecExecution metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function updates the hashes of data blocks that were identified as dirty and initializes the hashes for data blocks that are invalid.

int FTI_UpdateIterTime ( FTIT_execution FTI_Exec)

It updates the local and global mean iteration time.

Parameters
FTI_ExecExecution metadata.
Returns
integer FTI_SCES if successful.

This function updates the local and global mean iteration time. It also recomputes the checkpoint interval in iterations and corrects the next checkpointing iteration based on the observed mean iteration duration.

Here is the call graph for this function:

int FTI_VerifyChecksum ( char *  fileName,
char *  checksumToCmp 
)

It compares checksum of the checkpoint file.

Parameters
fileNameFilename of the checkpoint.
checksumToCmpChecksum to compare.
Returns
integer FTI_SCES if successful.

This function calculates checksum of the checkpoint file based on MD5 algorithm. It compares calculated hash value with the one saved in the file.

Here is the call graph for this function:

int FTI_WriteCkpt ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_dataset FTI_Data 
)

It writes the checkpoint data in the target file.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function checks whether the checkpoint needs to be local or remote, opens the target file and writes the checkpoint data dataset by dataset; it finally flushes and closes the checkpoint file.

Here is the call graph for this function:

int FTI_WriteCkptMetaData ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt 
)

Creates or updates checkpoint meta data.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.

Writes checkpoint meta data in checkpoint meta data file.

  • timestamp
  • level
  • number of processes participating in the checkpoint
  • I/O mode
  • dCP enabled/disabled

Here is the call graph for this function:

int FTI_WriteMetadata ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
long *  fs,
long  mfs,
char *  fnl,
char *  checksums,
int *  allVarIDs,
long *  allVarSizes 
)

It writes the metadata to recover the data after a failure.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
fsPointer to the list of checkpoint sizes.
mfsThe maximum checkpoint file size.
fnlPointer to the list of checkpoint names.
checksumsChecksums array.
allVarIDsIDs of vars from all processes in group.
allVarSizesSizes of vars from all processes in group.
Returns
integer FTI_SCES if successful.

This function should be executed only by one process per group. It writes the metadata file used to recover in case of failure.

Here is the call graph for this function:

int FTI_WriteMPI ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_dataset FTI_Data 
)

Writes ckpt to PFS using MPI I/O.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

This function takes into account that in MPI-IO the count parameters of both MPI_Type_contiguous and MPI_File_write_at are integer types. The ckpt data is split into chunks of at most (MAX_INT-1)/2 elements to form contiguous data types. Experience has shown that sizes greater than that may lead to problems.

Here is the call graph for this function:

int FTI_WritePar ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_dataset FTI_Data 
)
int FTI_WritePosix ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
FTIT_dataset FTI_Data 
)

Writes ckpt to PFS using POSIX.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

Here is the call graph for this function:

int FTI_WriteRSedChecksum ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_checkpoint FTI_Ckpt,
int  rank,
char *  checksum 
)

It writes the RSed file checksum to metadata.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_CkptCheckpoint metadata.
rankglobal rank of the process
checksumPointer to the checksum.
Returns
integer FTI_SCES if successful.

This function should be executed only by one process per group. It writes the RSed checksum to the metadata file.

Here is the call graph for this function:

int FTI_WriteSionlib ( FTIT_configuration FTI_Conf,
FTIT_execution FTI_Exec,
FTIT_topology FTI_Topo,
FTIT_dataset FTI_Data 
)

Writes ckpt to PFS using SIONlib.

Parameters
FTI_ConfConfiguration metadata.
FTI_ExecExecution metadata.
FTI_TopoTopology metadata.
FTI_DataDataset metadata.
Returns
integer FTI_SCES if successful.

Here is the call graph for this function:

Variable Documentation

int FTI_dbstructsize

size of FTIFF_db in file

size of FTIFF_db struct in file

int FTI_dbvarstructsize

size of FTIFF_dbvar in file

size of FTIFF_dbvar struct in file

int FTI_filemetastructsize

size of FTIFF_metaInfo in file

size of FTIFF_metaInfo struct in file