Fault Tolerance Interface
fti.h File Reference

Header file for the FTI library. More...

#include <mpi.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
Include dependency graph for fti.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  FTIFF_metaInfo
 
struct  FTIT_DataDiffHash
 
struct  FTIFF_dbvar
 
struct  FTIFF_db
 
struct  FTIT_StageInfo
 
union  FTIT_double
 
union  FTIT_float
 
struct  FTIT_H5Group
 
struct  FTIT_type
 
struct  FTIT_typeField
 
struct  FTIT_complexType
 
struct  FTIT_dataset
 
struct  FTIT_metadata
 
struct  FTIT_execution
 
struct  FTIT_configuration
 
struct  FTIT_topology
 
struct  FTIT_checkpoint
 
struct  FTIT_injection
 

Macros

#define RED   "\x1B[31m"
 
#define ORG   "\x1B[38;5;202m"
 
#define GRN   "\x1B[32m"
 
#define BLU   "\x1B[34m"
 
#define RESET   "\x1B[0m"
 
#define FTI_BUFS   256
 
#define FTI_WORD   16
 
#define FTI_DONE   1
 
#define FTI_SCES   0
 
#define FTI_NSCS   -1
 
#define FTI_NREC   -2
 
#define FTI_EROR   4
 
#define FTI_WARN   3
 
#define FTI_IDCP   5
 
#define FTI_INFO   2
 
#define FTI_DBUG   1
 
#define FTI_BASE   990
 
#define FTI_CKTW   991
 
#define FTI_XORW   992
 
#define FTI_RSEW   993
 
#define FTI_PFSW   994
 
#define FTI_ENDW   995
 
#define FTI_REJW   996
 
#define FTI_IO_POSIX   1001
 
#define FTI_IO_MPI   1002
 
#define FTI_IO_FTIFF   1003
 
#define FTI_SI_FAIL   0x4
 
#define FTI_SI_SCES   0x3
 
#define FTI_SI_ACTV   0x2
 
#define FTI_SI_PEND   0x1
 
#define FTI_SI_NINI   0x0
 
#define FTI_SI_MAX_NUM   (512L*1024L)
 
#define MD5_DIGEST_LENGTH   16
 
#define MD5_DIGEST_STRING_LENGTH   33
 
#define FTI_IO_SIONLIB   1004
 
#define FTI_IO_HDF5   1005
 
#define FTI_DCP_MODE_OFFSET   2000
 
#define FTI_DCP_MODE_MD5   2001
 
#define FTI_DCP_MODE_CRC32   2002
 

Typedefs

typedef uintptr_t FTI_ADDRVAL
 
typedef void * FTI_ADDRPTR
 
typedef struct FTIFF_metaInfo FTIFF_metaInfo
 Meta Information about file. More...
 
typedef struct FTIT_DataDiffHash FTIT_DataDiffHash
 dCP information about data block. More...
 
typedef struct FTIFF_dbvar FTIFF_dbvar
 Information about protected variable in datablock. More...
 
typedef struct FTIFF_db FTIFF_db
 Information about current datablock. More...
 
typedef struct FTIT_StageInfo FTIT_StageInfo
 Staging meta info. More...
 
typedef union FTIT_double FTIT_double
 Double mapped as two integers to allow bit-wise operations. More...
 
typedef union FTIT_float FTIT_float
 Float mapped as integer to allow bit-wise operations. More...
 
typedef struct FTIT_complexType FTIT_complexType
 Type that consists of other FTI types. More...
 
typedef struct FTIT_H5Group FTIT_H5Group
 
typedef struct FTIT_type FTIT_type
 Type recognized by FTI. More...
 
typedef struct FTIT_typeField FTIT_typeField
 Holds info about field in complex type. More...
 
typedef struct FTIT_dataset FTIT_dataset
 Dataset metadata. More...
 
typedef struct FTIT_metadata FTIT_metadata
 Metadata for restart. More...
 
typedef struct FTIT_execution FTIT_execution
 Execution metadata. More...
 
typedef struct FTIT_configuration FTIT_configuration
 Configuration metadata. More...
 
typedef struct FTIT_topology FTIT_topology
 Topology metadata. More...
 
typedef struct FTIT_checkpoint FTIT_checkpoint
 Checkpoint metadata. More...
 
typedef struct FTIT_injection FTIT_injection
 Type to describe failure injections in FTI. More...
 

Enumerations

enum  FTIT_level {
  FTI_L1 = 1, FTI_L2, FTI_L3, FTI_L4,
  FTI_L1_DCP, FTI_L2_DCP, FTI_L3_DCP, FTI_L4_DCP,
  FTI_MIN_LEVEL_ID = FTI_L1, FTI_MAX_LEVEL_ID = FTI_L4_DCP
}
 holds the level id. More...
 

Functions

int FTI_Init (char *configFile, MPI_Comm globalComm)
 Initializes FTI. More...
 
int FTI_Status ()
 It returns the current status of the recovery flag. More...
 
int FTI_InitType (FTIT_type *type, int size)
 It initializes a data type. More...
 
int FTI_InitComplexType (FTIT_type *newType, FTIT_complexType *typeDefinition, int length, size_t size, char *name, FTIT_H5Group *h5group)
 It initializes a complex data type. More...
 
void FTI_AddSimpleField (FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int id, char *name)
 It adds a simple field in complex data type. More...
 
void FTI_AddComplexField (FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int rank, int *dimLength, int id, char *name)
 It adds a complex field in complex data type. More...
 
int FTI_InitGroup (FTIT_H5Group *h5group, char *name, FTIT_H5Group *parent)
 It initializes a HDF5 group. More...
 
int FTI_RenameGroup (FTIT_H5Group *h5group, char *name)
 Renames a HDF5 group. More...
 
int FTI_Protect (int id, void *ptr, long count, FTIT_type type)
 It sets/resets the pointer and type to a protected variable. More...
 
int FTI_DefineDataset (int id, int rank, int *dimLength, char *name, FTIT_H5Group *h5group)
 Defines the dataset. More...
 
long FTI_GetStoredSize (int id)
 Returns size saved in metadata of variable. More...
 
void * FTI_Realloc (int id, void *ptr)
 Reallocates dataset to last checkpoint size. More...
 
int FTI_BitFlip (int datasetID)
 Bit-flip injection following the injection instructions. More...
 
int FTI_Checkpoint (int id, int level)
 It takes the checkpoint and triggers the post-ckpt. work. More...
 
int FTI_GetStageDir (char *stageDir, int maxLen)
 Places the FTI staging directory path into 'stageDir'. More...
 
int FTI_GetStageStatus (int ID)
 Returns status of staging request. More...
 
int FTI_SendFile (char *lpath, char *rpath)
 Copies file asynchronously from 'lpath' to 'rpath'. More...
 
int FTI_Recover ()
 It loads the checkpoint data. More...
 
int FTI_Snapshot ()
 Takes an FTI snapshot or recovers the data if it is a restart. More...
 
int FTI_Finalize ()
 It closes FTI properly on the application processes. More...
 
int FTI_RecoverVar (int id)
 During the restart, recovers the given variable. More...
 

Variables

MPI_Comm FTI_COMM_WORLD
 
FTIT_type FTI_CHAR
 
FTIT_type FTI_SHRT
 
FTIT_type FTI_INTG
 
FTIT_type FTI_LONG
 
FTIT_type FTI_UCHR
 
FTIT_type FTI_USHT
 
FTIT_type FTI_UINT
 
FTIT_type FTI_ULNG
 
FTIT_type FTI_SFLT
 
FTIT_type FTI_DBLE
 
FTIT_type FTI_LDBE
 

Detailed Description

Header file for the FTI library.

Author
Leonardo A. Bautista Gomez (leoba.nosp@m.go@g.nosp@m.mail..nosp@m.com)
Date
July, 2013

Macro Definition Documentation

#define BLU   "\x1B[34m"

Define BLUE color for FTI output.

#define FTI_BASE   990

Token for checkpoint Baseline.

#define FTI_BUFS   256

Standard size of buffer and max node size.

#define FTI_CKTW   991

Token for checkpoint Level 1.

#define FTI_DBUG   1

Verbosity level to print debug messages.

#define FTI_DCP_MODE_CRC32   2002
#define FTI_DCP_MODE_MD5   2001
#define FTI_DCP_MODE_OFFSET   2000
#define FTI_DONE   1

Token returned when FTI performs a checkpoint.

#define FTI_ENDW   995

Token for end of the execution.

#define FTI_EROR   4

Verbosity level to print only errors.

#define FTI_IDCP   5

Verbosity level to print main information.

#define FTI_INFO   2

Verbosity level to print main information messages.

#define FTI_IO_FTIFF   1003

Token for IO mode FTI-FF.

#define FTI_IO_HDF5   1005

Token for IO mode HDF5.

#define FTI_IO_MPI   1002

Token for IO mode MPI.

#define FTI_IO_POSIX   1001

Token for IO mode Posix.

#define FTI_IO_SIONLIB   1004

Token for IO mode SIONlib.

#define FTI_NREC   -2

Token returned if recovery fails.

#define FTI_NSCS   -1

Token returned if a FTI function fails.

#define FTI_PFSW   994

Token for checkpoint Level 4.

#define FTI_REJW   996

Token to reject checkpoint.

#define FTI_RSEW   993

Token for checkpoint Level 3.

#define FTI_SCES   0

Token returned if a FTI function succeeds.

#define FTI_SI_ACTV   0x2

status 'active' for stage requests

#define FTI_SI_FAIL   0x4

status 'failed' for stage requests

#define FTI_SI_MAX_NUM   (512L*1024L)

Maximum amount of concurrent active staging requests

Note
leads to 2.5MB for the application processes as minimum memory allocated
#define FTI_SI_NINI   0x0

status 'not initialized' for stage requests

#define FTI_SI_PEND   0x1

status 'pending' for stage requests

#define FTI_SI_SCES   0x3

status 'succeed' for stage requests

#define FTI_WARN   3

Verbosity level to print only warning and errors.

#define FTI_WORD   16

Word size used during RS encoding.

#define FTI_XORW   992

Token for checkpoint Level 2.

#define GRN   "\x1B[32m"

Define GREEN color for FTI output.

#define MD5_DIGEST_LENGTH   16

MD5-hash: unsigned char digest length.

#define MD5_DIGEST_STRING_LENGTH   33

MD5-hash: hex converted char digest length.

#define ORG   "\x1B[38;5;202m"

Define ORANGE color for FTI output.

#define RED   "\x1B[31m"

Define RED color for FTI output.

#define RESET   "\x1B[0m"

Define color RESET for FTI output.

Typedef Documentation

typedef void* FTI_ADDRPTR

void ptr type

typedef uintptr_t FTI_ADDRVAL

for ptr manipulation

Information about current datablock.

(For FTI-FF only) Keeps information about the current datablock in file

Information about protected variable in datablock.

(For FTI-FF only) Keeps information about the chunk of the protected variable with id stored in the current datablock. 'idx' is the index for the array element of 'FTIT_dataset* FTI_Data', that contains variable with 'id'.

Meta Information about file.

(For FTI-FF only) Keeps information about the file. 'checksum' is the hash of the file excluding the file meta data. 'myHash' is the hash of the file meta data.

Checkpoint metadata.

This type stores all the checkpoint metadata.

Type that consists of other FTI types.

This type allows creating complex datatypes.

Configuration metadata.

This type stores the general configuration metadata.

dCP information about data block.

Holds information for each data block relevant for the dCP mechanism. This structure is a member of FTIFF_dbvar. It is stored as an array with n elements, where n corresponds to the number of data blocks into which the data chunk is partitioned (depending on the dCP block size).

Dataset metadata.

This type stores the metadata related with a dataset.

Double mapped as two integers to allow bit-wise operations.

Double mapped as integer and byte array to allow bit-wise operators so that we can inject failures on it.

Execution metadata.

This type stores all the dynamic metadata related to the current execution

Float mapped as integer to allow bit-wise operations.

Float mapped as integer and byte array to allow bit-wise operators so that we can inject failures on it.

typedef struct FTIT_H5Group FTIT_H5Group

Type to describe failure injections in FTI.

This type allows users to describe a SDC failure injection model.

Metadata for restart.

This type stores all the metadata necessary for the restart.

Staging meta info.

The request pointer is void in order to allow the structure to keep the head rank staging info if used by a head process or the application rank staging info otherwise. The cast is performed via the macros 'FTI_SI_HPTR( ptr )' for the head processes and 'FTI_SI_APTR( ptr )' for the application processes.

Topology metadata.

This type stores the topology metadata.

Type recognized by FTI.

This type allows handling data structures.

Holds info about field in complex type.

This type simplifies creating complex datatypes.

Enumeration Type Documentation

enum FTIT_level

holds the level id.

Enumerator
FTI_L1 
FTI_L2 
FTI_L3 
FTI_L4 
FTI_L1_DCP 
FTI_L2_DCP 
FTI_L3_DCP 
FTI_L4_DCP 
FTI_MIN_LEVEL_ID 
FTI_MAX_LEVEL_ID 

Function Documentation

void FTI_AddComplexField ( FTIT_complexType *  typeDefinition,
FTIT_type *  ftiType,
size_t  offset,
int  rank,
int *  dimLength,
int  id,
char *  name 
)

It adds a complex field in complex data type.

Parameters
typeDefinition: Structure definition of the complex data type.
ftiType: Type of the field
offset: Offset of the field (use offsetof)
rank: Rank of the array
dimLength: Dimension length for each rank
id: Id of the field (start with 0)
name: Name of the field (put NULL if want default)
Returns
integer FTI_SCES if successful.

This function adds a field to the complex datatype. Use offsetof macro to set offset. First ID must be 0, next one must be +1. If name is NULL FTI will set "T${id}" name.

void FTI_AddSimpleField ( FTIT_complexType *  typeDefinition,
FTIT_type *  ftiType,
size_t  offset,
int  id,
char *  name 
)

It adds a simple field in complex data type.

Parameters
typeDefinition: Structure definition of the complex data type.
ftiType: Type of the field
offset: Offset of the field (use offsetof)
id: Id of the field (start with 0)
name: Name of the field (put NULL if want default)
Returns
integer FTI_SCES if successful.

This function adds a field to the complex datatype. Use offsetof macro to set offset. First ID must be 0, next one must be +1. If name is NULL FTI will set "T${id}" name. Sets rank and dimLength to 1.

int FTI_BitFlip ( int  datasetID)

Bit-flip injection following the injection instructions.

Parameters
datasetID: ID of the dataset where to inject.
Returns
integer FTI_SCES if successful.

This function injects the given number of bit-flips, at the given frequency and in the given location (rank, dataset, bit position).

Here is the call graph for this function:

int FTI_Checkpoint ( int  id,
int  level 
)

It takes the checkpoint and triggers the post-ckpt. work.

Parameters
id: Checkpoint ID.
level: Checkpoint level.
Returns
integer FTI_SCES if successful.

This function starts by blocking on a receive if the previous ckpt. was offline. Then, it updates the ckpt. information. It writes down the ckpt. data, creates the metadata and the post-processing work. This function is complementary with the FTI_Listen function in terms of communications.

Here is the call graph for this function:

int FTI_DefineDataset ( int  id,
int  rank,
int *  dimLength,
char *  name,
FTIT_H5Group *  h5group 
)

Defines the dataset.

Parameters
id: ID for searches and update.
rank: Rank of the array
dimLength: Dimension length for each rank
name: Name of the dataset in HDF5 file.
h5group: Group of the dataset. If NULL then "/"
Returns
integer FTI_SCES if successful.

This function gives FTI all information needed by HDF5 to correctly save the dataset in the checkpoint file.

Here is the call graph for this function:

int FTI_Finalize ( )

It closes FTI properly on the application processes.

Returns
integer FTI_SCES if successful.

This function notifies the FTI processes that the execution is over, frees some data structures and it closes. If this function is not called on the application processes the FTI processes will never finish (deadlock).

Here is the call graph for this function:

int FTI_GetStageDir ( char *  stageDir,
int  maxLen 
)

Places the FTI staging directory path into 'stageDir'.

Parameters
stageDir: pointer to allocated memory region.
maxLen: size of allocated memory region in bytes.
Returns
integer FTI_SCES if successful, FTI_NSCS else.

This function places the FTI staging directory path in 'stageDir'. If the allocation size is not sufficient, no action is performed and FTI_NSCS is returned.

Here is the call graph for this function:

int FTI_GetStageStatus ( int  ID)

Returns status of staging request.

Parameters
ID: ID of staging request.
Returns
integer Status of staging request on success, FTI_NSCS else.

This function returns the status of the staging request corresponding to ID. The ID is returned by the function 'FTI_SendFile'. The status may be one of the five possible statuses:

  • FTI_SI_FAIL - Stage request failed
  • FTI_SI_SCES - Stage request succeeded
  • FTI_SI_ACTV - Stage request is currently being processed
  • FTI_SI_PEND - Stage request is pending
  • FTI_SI_NINI - There is no stage request with this ID
Note
If the status is FTI_SI_NINI, the ID is either invalid or the request was finished (succeeded or failed). In the latter case, 'FTI_GetStageStatus' returns FTI_SI_FAIL or FTI_SI_SCES and frees the stage request resources. On the subsequent call it will then return FTI_SI_NINI.

Here is the call graph for this function:

long FTI_GetStoredSize ( int  id)

Returns size saved in metadata of variable.

Parameters
id: Variable ID.
Returns
long Returns size of variable or 0 if size not saved.

This function returns the size of the variable with the given ID that is saved in the metadata. This may differ from the size of the variable currently in the program. If this function is called during recovery, it returns the size from the metadata file; if it is called after a checkpoint, it returns the size saved in the temporary metadata. If there is no size saved in the metadata, it returns 0.

Here is the call graph for this function:

int FTI_Init ( char *  configFile,
MPI_Comm  globalComm 
)

Initializes FTI.

Parameters
configFile: FTI configuration file.
globalComm: Main MPI communicator of the application.
Returns
integer FTI_SCES if successful.

This function initializes the FTI context and prepares the heads to wait for checkpoints. FTI processes should never get out of this function. In case of a restart, checkpoint files should be recovered and in place at the end of this function.

Here is the call graph for this function:

int FTI_InitComplexType ( FTIT_type *  newType,
FTIT_complexType *  typeDefinition,
int  length,
size_t  size,
char *  name,
FTIT_H5Group *  h5group 
)

It initializes a complex data type.

Parameters
newType: The data type to be initialized.
typeDefinition: Structure definition of the new type.
length: Number of fields in structure
size: Size of the structure.
name: Name of the structure.
h5group: Group of the type.
Returns
integer FTI_SCES if successful.

This function initializes a simple data type. New type can only consist of fields of flat FTI types (no arrays). Type definition must include:

  • length => number of fields in the new type
  • field[].type => types of the field in the new type
  • field[].name => name of the field in the new type
  • field[].rank => number of dimensions of the field
  • field[].dimLength[] => length of each dimension of the field

Here is the call graph for this function:

int FTI_InitGroup ( FTIT_H5Group *  h5group,
char *  name,
FTIT_H5Group *  parent 
)

It initializes a HDF5 group.

Parameters
h5group: H5 group that we want to initialize
name: Name of the H5 group
parent: Parent H5 group
Returns
integer FTI_SCES if successful.

Initializes a group defined by the user. If parent is NULL, the parent will be set to the root group.

int FTI_InitType ( FTIT_type *  type,
int  size 
)

It initializes a data type.

Parameters
type: The data type to be initialized.
size: The size of the data type to be initialized.
Returns
integer FTI_SCES if successful.

This function initializes a data type. The only information needed is the size of the data type; the rest is a black box for FTI. Types are saved as byte arrays in the case of the HDF5 format.

int FTI_Protect ( int  id,
void *  ptr,
long  count,
FTIT_type  type 
)

It sets/resets the pointer and type to a protected variable.

Parameters
id: ID for searches and update.
ptr: Pointer to the data structure.
count: Number of elements in the data structure.
type: Type of elements in the data structure.
Returns
integer FTI_SCES if successful.

This function stores a pointer to a data structure, its size, its ID, its number of elements and the type of the elements. This list of structures is the data that will be stored during a checkpoint and loaded during a recovery. It resets the pointer to a data structure, its size, its number of elements and the type of the elements if the dataset was already previously registered.

Here is the call graph for this function:

void* FTI_Realloc ( int  id,
void *  ptr 
)

Reallocates dataset to last checkpoint size.

Parameters
id: Variable ID.
ptr: Pointer to the variable.
Returns
ptr Pointer if successful, NULL otherwise.

This function loads the checkpoint data size from the metadata file, reallocates memory and updates data size information.

Here is the call graph for this function:

int FTI_Recover ( )

It loads the checkpoint data.

Returns
integer FTI_SCES if successful.

This function loads the checkpoint data from the checkpoint file and it updates some basic checkpoint information.

Here is the call graph for this function:

int FTI_RecoverVar ( int  id)

During the restart, recovers the given variable.

Parameters
id: Variable to recover
Returns
int FTI_SCES if successful.

During a restart process, this function recovers the variable specified by the given id. No effect during a regular execution. The variable must have already been protected, otherwise, FTI_NSCS is returned. Improvements to be done:

  • Open checkpoint file at FTI_Init, close it at FTI_Snapshot
  • Maintain a variable accumulating the offset as variables are protected during the restart to avoid doing the loop to calculate the offset in the checkpoint file.

Here is the call graph for this function:

int FTI_RenameGroup ( FTIT_H5Group *  h5group,
char *  name 
)

Renames a HDF5 group.

Parameters
h5group: H5 group that we want to rename
name: New name of the H5 group
Returns
integer FTI_SCES if successful.

This function renames HDF5 group defined by user.

int FTI_SendFile ( char *  lpath,
char *  rpath 
)

Copies file asynchronously from 'lpath' to 'rpath'.

Parameters
lpath: absolute path of the local file.
rpath: absolute path of the remote file.
Returns
integer Request handle (ID) on success, FTI_NSCS else.

This function may be used to copy a file local on the nodes via the FTI head process asynchronously to the PFS. The file will not be removed after successful transfer, however, if stored in the directory returned by 'FTI_GetStageDir' it will be removed during 'FTI_Finalize'.

If staging is enabled but no head process, the staging will be performed synchronously (i.e. by the calling rank).

Here is the call graph for this function:

int FTI_Snapshot ( )

Takes an FTI snapshot or recovers the data if it is a restart.

Returns
integer FTI_SCES if successful.

This function loads the checkpoint data from the checkpoint file in case of restart. Otherwise, it checks if the current iteration requires checkpointing, if it does it checks which checkpoint level, write the data in the files and it communicates with the head of the node to inform that a checkpoint has been taken. Checkpoint ID and counters are updated.

Here is the call graph for this function:

int FTI_Status ( )

It returns the current status of the recovery flag.

Returns
integer FTI_Exec.reco.

This function returns the current status of the recovery flag.

Variable Documentation

FTIT_type FTI_CHAR

FTI data type for chars.

MPI_Comm FTI_COMM_WORLD

MPI communicator that splits the global one into app and FTI parts.

FTIT_type FTI_DBLE

FTI data type for double floating point.

FTIT_type FTI_INTG

FTI data type for integers.

FTIT_type FTI_LDBE

FTI data type for long double floating point.

FTIT_type FTI_LONG

FTI data type for long integers.

FTIT_type FTI_SFLT

FTI data type for single floating point.

FTIT_type FTI_SHRT

FTI data type for short integers.

FTIT_type FTI_UCHR

FTI data type for unsigned chars.

FTIT_type FTI_UINT

FTI data type for unsigned integers.

FTIT_type FTI_ULNG

FTI data type for unsigned long integers.

FTIT_type FTI_USHT

FTI data type for unsigned short integers.