Fault Tolerance Interface
|
Header file for the FTI library. More...
#include <mpi.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
Go to the source code of this file.
Classes | |
struct | FTIFF_metaInfo |
struct | FTIT_DataDiffHash |
struct | FTIFF_dbvar |
struct | FTIFF_db |
struct | FTIT_StageInfo |
union | FTIT_double |
union | FTIT_float |
struct | FTIT_H5Group |
struct | FTIT_type |
struct | FTIT_typeField |
struct | FTIT_complexType |
struct | FTIT_dataset |
struct | FTIT_metadata |
struct | FTIT_execution |
struct | FTIT_configuration |
struct | FTIT_topology |
struct | FTIT_checkpoint |
struct | FTIT_injection |
Macros | |
#define | RED "\x1B[31m" |
#define | ORG "\x1B[38;5;202m" |
#define | GRN "\x1B[32m" |
#define | BLU "\x1B[34m" |
#define | RESET "\x1B[0m" |
#define | FTI_BUFS 256 |
#define | FTI_WORD 16 |
#define | FTI_DONE 1 |
#define | FTI_SCES 0 |
#define | FTI_NSCS -1 |
#define | FTI_NREC -2 |
#define | FTI_EROR 4 |
#define | FTI_WARN 3 |
#define | FTI_IDCP 5 |
#define | FTI_INFO 2 |
#define | FTI_DBUG 1 |
#define | FTI_BASE 990 |
#define | FTI_CKTW 991 |
#define | FTI_XORW 992 |
#define | FTI_RSEW 993 |
#define | FTI_PFSW 994 |
#define | FTI_ENDW 995 |
#define | FTI_REJW 996 |
#define | FTI_IO_POSIX 1001 |
#define | FTI_IO_MPI 1002 |
#define | FTI_IO_FTIFF 1003 |
#define | FTI_SI_FAIL 0x4 |
#define | FTI_SI_SCES 0x3 |
#define | FTI_SI_ACTV 0x2 |
#define | FTI_SI_PEND 0x1 |
#define | FTI_SI_NINI 0x0 |
#define | FTI_SI_MAX_NUM (512L*1024L) |
#define | MD5_DIGEST_LENGTH 16 |
#define | MD5_DIGEST_STRING_LENGTH 33 |
#define | FTI_IO_SIONLIB 1004 |
#define | FTI_IO_HDF5 1005 |
#define | FTI_DCP_MODE_OFFSET 2000 |
#define | FTI_DCP_MODE_MD5 2001 |
#define | FTI_DCP_MODE_CRC32 2002 |
Typedefs | |
typedef uintptr_t | FTI_ADDRVAL |
typedef void * | FTI_ADDRPTR |
typedef struct FTIFF_metaInfo | FTIFF_metaInfo |
Meta Information about file. More... | |
typedef struct FTIT_DataDiffHash | FTIT_DataDiffHash |
dCP information about data block. More... | |
typedef struct FTIFF_dbvar | FTIFF_dbvar |
Information about protected variable in datablock. More... | |
typedef struct FTIFF_db | FTIFF_db |
Information about current datablock. More... | |
typedef struct FTIT_StageInfo | FTIT_StageInfo |
Staging meta info. More... | |
typedef union FTIT_double | FTIT_double |
Double mapped as two integers to allow bit-wise operations. More... | |
typedef union FTIT_float | FTIT_float |
Float mapped as integer to allow bit-wise operations. More... | |
typedef struct FTIT_complexType | FTIT_complexType |
Type that consists of other FTI types. More... | |
typedef struct FTIT_H5Group | FTIT_H5Group |
typedef struct FTIT_type | FTIT_type |
Type recognized by FTI. More... | |
typedef struct FTIT_typeField | FTIT_typeField |
Holds info about field in complex type. More... | |
typedef struct FTIT_dataset | FTIT_dataset |
Dataset metadata. More... | |
typedef struct FTIT_metadata | FTIT_metadata |
Metadata for restart. More... | |
typedef struct FTIT_execution | FTIT_execution |
Execution metadata. More... | |
typedef struct FTIT_configuration | FTIT_configuration |
Configuration metadata. More... | |
typedef struct FTIT_topology | FTIT_topology |
Topology metadata. More... | |
typedef struct FTIT_checkpoint | FTIT_checkpoint |
Checkpoint metadata. More... | |
typedef struct FTIT_injection | FTIT_injection |
Type to describe failure injections in FTI. More... | |
Enumerations | |
enum | FTIT_level { FTI_L1 = 1, FTI_L2, FTI_L3, FTI_L4, FTI_L1_DCP, FTI_L2_DCP, FTI_L3_DCP, FTI_L4_DCP, FTI_MIN_LEVEL_ID = FTI_L1, FTI_MAX_LEVEL_ID = FTI_L4_DCP } |
holds the level id. More... | |
Functions | |
int | FTI_Init (char *configFile, MPI_Comm globalComm) |
Initializes FTI. More... | |
int | FTI_Status () |
It returns the current status of the recovery flag. More... | |
int | FTI_InitType (FTIT_type *type, int size) |
It initializes a data type. More... | |
int | FTI_InitComplexType (FTIT_type *newType, FTIT_complexType *typeDefinition, int length, size_t size, char *name, FTIT_H5Group *h5group) |
It initializes a complex data type. More... | |
void | FTI_AddSimpleField (FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int id, char *name) |
It adds a simple field in complex data type. More... | |
void | FTI_AddComplexField (FTIT_complexType *typeDefinition, FTIT_type *ftiType, size_t offset, int rank, int *dimLength, int id, char *name) |
It adds a complex field in complex data type. More... | |
int | FTI_InitGroup (FTIT_H5Group *h5group, char *name, FTIT_H5Group *parent) |
It initializes an HDF5 group. More... | |
int | FTI_RenameGroup (FTIT_H5Group *h5group, char *name) |
Renames an HDF5 group. More... | |
int | FTI_Protect (int id, void *ptr, long count, FTIT_type type) |
It sets/resets the pointer and type to a protected variable. More... | |
int | FTI_DefineDataset (int id, int rank, int *dimLength, char *name, FTIT_H5Group *h5group) |
Defines the dataset. More... | |
long | FTI_GetStoredSize (int id) |
Returns size saved in metadata of variable. More... | |
void * | FTI_Realloc (int id, void *ptr) |
Reallocates dataset to last checkpoint size. More... | |
int | FTI_BitFlip (int datasetID) |
Bit-flip injection following the injection instructions. More... | |
int | FTI_Checkpoint (int id, int level) |
It takes the checkpoint and triggers the post-ckpt. work. More... | |
int | FTI_GetStageDir (char *stageDir, int maxLen) |
Places the FTI staging directory path into 'stageDir'. More... | |
int | FTI_GetStageStatus (int ID) |
Returns status of staging request. More... | |
int | FTI_SendFile (char *lpath, char *rpath) |
Copies file asynchronously from 'lpath' to 'rpath'. More... | |
int | FTI_Recover () |
It loads the checkpoint data. More... | |
int | FTI_Snapshot () |
Takes an FTI snapshot or recovers the data if it is a restart. More... | |
int | FTI_Finalize () |
It closes FTI properly on the application processes. More... | |
int | FTI_RecoverVar (int id) |
During the restart, recovers the given variable. More... | |
Header file for the FTI library.
#define BLU "\x1B[34m" |
Define BLUE color for FTI output.
#define FTI_BASE 990 |
Token for checkpoint Baseline.
#define FTI_BUFS 256 |
Standard size of buffer and max node size.
#define FTI_CKTW 991 |
Token for checkpoint Level 1.
#define FTI_DBUG 1 |
Verbosity level to print debug messages.
#define FTI_DCP_MODE_CRC32 2002 |
#define FTI_DCP_MODE_MD5 2001 |
#define FTI_DCP_MODE_OFFSET 2000 |
#define FTI_DONE 1 |
Token returned when FTI performs a checkpoint.
#define FTI_ENDW 995 |
Token for end of the execution.
#define FTI_EROR 4 |
Verbosity level to print only errors.
#define FTI_IDCP 5 |
Verbosity level to print main information.
#define FTI_INFO 2 |
Verbosity level to print main information.
#define FTI_IO_FTIFF 1003 |
Token for IO mode FTI-FF.
#define FTI_IO_HDF5 1005 |
Token for IO mode HDF5.
#define FTI_IO_MPI 1002 |
Token for IO mode MPI.
#define FTI_IO_POSIX 1001 |
Token for IO mode Posix.
#define FTI_IO_SIONLIB 1004 |
Token for IO mode SIONlib.
#define FTI_NREC -2 |
Token returned if recovery fails.
#define FTI_NSCS -1 |
Token returned if a FTI function fails.
#define FTI_PFSW 994 |
Token for checkpoint Level 4.
#define FTI_REJW 996 |
Token to reject checkpoint.
#define FTI_RSEW 993 |
Token for checkpoint Level 3.
#define FTI_SCES 0 |
Token returned if a FTI function succeeds.
#define FTI_SI_ACTV 0x2 |
status 'active' for stage requests
#define FTI_SI_FAIL 0x4 |
status 'failed' for stage requests
#define FTI_SI_MAX_NUM (512L*1024L) |
Maximum amount of concurrent active staging requests
#define FTI_SI_NINI 0x0 |
status 'not initialized' for stage requests
#define FTI_SI_PEND 0x1 |
status 'pending' for stage requests
#define FTI_SI_SCES 0x3 |
status 'succeed' for stage requests
#define FTI_WARN 3 |
Verbosity level to print only warning and errors.
#define FTI_WORD 16 |
Word size used during RS encoding.
#define FTI_XORW 992 |
Token for checkpoint Level 2.
#define GRN "\x1B[32m" |
Define GREEN color for FTI output.
#define MD5_DIGEST_LENGTH 16 |
MD5-hash: unsigned char digest length.
#define MD5_DIGEST_STRING_LENGTH 33 |
MD5-hash: hex converted char digest length.
#define ORG "\x1B[38;5;202m" |
Define ORANGE color for FTI output.
#define RED "\x1B[31m" |
Define RED color for FTI output.
#define RESET "\x1B[0m" |
Define color RESET for FTI output.
typedef void* FTI_ADDRPTR |
void ptr type
typedef uintptr_t FTI_ADDRVAL |
for ptr manipulation
Information about current datablock.
(For FTI-FF only) Keeps information about the current datablock in file
Information about protected variable in datablock.
(For FTI-FF only) Keeps information about the chunk of the protected variable with id stored in the current datablock. 'idx' is the index for the array element of 'FTIT_dataset* FTI_Data', that contains variable with 'id'.
Meta Information about file.
(For FTI-FF only) Keeps information about the file. 'checksum' is the hash of the file excluding the file meta data. 'myHash' is the hash of the file meta data.
Checkpoint metadata.
This type stores all the checkpoint metadata.
typedef struct FTIT_complexType FTIT_complexType |
Type that consists of other FTI types.
This type allows creating complex datatypes.
Configuration metadata.
This type stores the general configuration metadata.
dCP information about data block.
Holds information for each data block relevant for the dCP mechanism. This structure is a member of FTIFF_dbvar. It is stored as an array with n elements, where n corresponds to the number of data blocks into which the data chunk is partitioned (depending on the dCP block size).
Dataset metadata.
This type stores the metadata related with a dataset.
Double mapped as two integers to allow bit-wise operations.
Double mapped as integer and byte array to allow bit-wise operators so that we can inject failures on it.
Execution metadata.
This type stores all the dynamic metadata related to the current execution
Float mapped as integer to allow bit-wise operations.
Float mapped as integer and byte array to allow bit-wise operators so that we can inject failures on it.
typedef struct FTIT_H5Group FTIT_H5Group |
Type to describe failure injections in FTI.
This type allows users to describe a SDC failure injection model.
Metadata for restart.
This type stores all the metadata necessary for the restart.
Staging meta info.
The request pointer is void in order to allow the structure to keep the head rank staging info if used by a head process or the application rank staging info otherwise. The cast is performed via the macros 'FTI_SI_HPTR( ptr )' for the head processes and 'FTI_SI_APTR( ptr )' for the application processes.
Topology metadata.
This type stores the topology metadata.
Holds info about field in complex type.
This type simplifies creating complex datatypes.
enum FTIT_level |
void FTI_AddComplexField | ( | FTIT_complexType * | typeDefinition, |
FTIT_type * | ftiType, | ||
size_t | offset, | ||
int | rank, | ||
int * | dimLength, | ||
int | id, | ||
char * | name | ||
) |
It adds a complex field in complex data type.
typeDefinition | Structure definition of the complex data type. |
ftiType | Type of the field |
offset | Offset of the field (use offsetof) |
rank | Rank of the array |
dimLength | Dimension length for each rank |
id | Id of the field (start with 0) |
name | Name of the field (put NULL if want default) |
This function adds a field to the complex datatype. Use offsetof macro to set offset. First ID must be 0, next one must be +1. If name is NULL FTI will set "T${id}" name.
void FTI_AddSimpleField | ( | FTIT_complexType * | typeDefinition, |
FTIT_type * | ftiType, | ||
size_t | offset, | ||
int | id, | ||
char * | name | ||
) |
It adds a simple field in complex data type.
typeDefinition | Structure definition of the complex data type. |
ftiType | Type of the field |
offset | Offset of the field (use offsetof) |
id | Id of the field (start with 0) |
name | Name of the field (put NULL if want default) |
This function adds a field to the complex datatype. Use offsetof macro to set offset. First ID must be 0, next one must be +1. If name is NULL FTI will set "T${id}" name. Sets rank and dimLength to 1.
int FTI_BitFlip | ( | int | datasetID | ) |
Bit-flip injection following the injection instructions.
datasetID | ID of the dataset where to inject. |
This function injects the given number of bit-flips, at the given frequency and in the given location (rank, dataset, bit position).
int FTI_Checkpoint | ( | int | id, |
int | level | ||
) |
It takes the checkpoint and triggers the post-ckpt. work.
id | Checkpoint ID. |
level | Checkpoint level. |
This function starts by blocking on a receive if the previous ckpt. was offline. Then, it updates the ckpt. information. It writes down the ckpt. data, creates the metadata and the post-processing work. This function is complementary with the FTI_Listen function in terms of communications.
int FTI_DefineDataset | ( | int | id, |
int | rank, | ||
int * | dimLength, | ||
char * | name, | ||
FTIT_H5Group * | h5group | ||
) |
Defines the dataset.
id | ID for searches and update. |
rank | Rank of the array |
dimLength | Dimension length for each rank |
name | Name of the dataset in HDF5 file. |
h5group | Group of the dataset. If Null then "/" |
This function gives FTI all information needed by HDF5 to correctly save the dataset in the checkpoint file.
int FTI_Finalize | ( | ) |
It closes FTI properly on the application processes.
This function notifies the FTI processes that the execution is over, frees some data structures and it closes. If this function is not called on the application processes the FTI processes will never finish (deadlock).
int FTI_GetStageDir | ( | char * | stageDir, |
int | maxLen | ||
) |
Places the FTI staging directory path into 'stageDir'.
stageDir | pointer to allocated memory region. |
maxLen | size of allocated memory region in bytes. |
This function places the FTI staging directory path in 'stageDir'. If the allocation size is not sufficient, no action is performed and FTI_NSCS is returned.
int FTI_GetStageStatus | ( | int | ID | ) |
Returns status of staging request.
ID | ID of staging request. |
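This function returns the status of the staging request corresponding to ID. The ID is returned by the function 'FTI_SendFile'. The status may be one of the five possible statuses: FTI_SI_FAIL (failed), FTI_SI_SCES (succeeded), FTI_SI_ACTV (active), FTI_SI_PEND (pending) or FTI_SI_NINI (not initialized).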
long FTI_GetStoredSize | ( | int | id | ) |
Returns size saved in metadata of variable.
id | Variable ID. |
This function returns the size of the variable with the given ID that is saved in the metadata. This may be different from the size of the variable in the program. If this function is called during recovery, it returns the size from the metadata file; if it is called after a checkpoint, it returns the size saved in the temporary metadata. If there is no size saved in the metadata, it returns 0.
int FTI_Init | ( | char * | configFile, |
MPI_Comm | globalComm | ||
) |
Initializes FTI.
configFile | FTI configuration file. |
globalComm | Main MPI communicator of the application. |
This function initializes the FTI context and prepares the heads to wait for checkpoints. FTI processes should never get out of this function. In case of a restart, checkpoint files should be recovered and in place at the end of this function.
int FTI_InitComplexType | ( | FTIT_type * | newType, |
FTIT_complexType * | typeDefinition, | ||
int | length, | ||
size_t | size, | ||
char * | name, | ||
FTIT_H5Group * | h5group | ||
) |
It initializes a complex data type.
newType | The data type to be initialized. |
typeDefinition | Structure definition of the new type. |
length | Number of fields in structure |
size | Size of the structure. |
name | Name of the structure. |
h5group | Group of the type. |
This function initializes a complex data type. The new type can only consist of fields of flat FTI types (no arrays). Type definition must include:
int FTI_InitGroup | ( | FTIT_H5Group * | h5group, |
char * | name, | ||
FTIT_H5Group * | parent | ||
) |
It initializes an HDF5 group.
h5group | H5 group that we want to initialize |
name | Name of the H5 group |
parent | Parent H5 group |
Initializes a group defined by the user. If parent is NULL, the parent will be set to the root group.
int FTI_InitType | ( | FTIT_type * | type, |
int | size | ||
) |
It initializes a data type.
type | The data type to be initialized. |
size | The size of the data type to be initialized. |
This function initializes a data type. The only information needed is the size of the data type; the rest is a black box for FTI. Types are saved as byte arrays in case of HDF5 format.
int FTI_Protect | ( | int | id, |
void * | ptr, | ||
long | count, | ||
FTIT_type | type | ||
) |
It sets/resets the pointer and type to a protected variable.
id | ID for searches and update. |
ptr | Pointer to the data structure. |
count | Number of elements in the data structure. |
type | Type of elements in the data structure. |
This function stores a pointer to a data structure, its size, its ID, its number of elements and the type of the elements. This list of structures is the data that will be stored during a checkpoint and loaded during a recovery. It resets the pointer to a data structure, its size, its number of elements and the type of the elements if the dataset was already previously registered.
void* FTI_Realloc | ( | int | id, |
void * | ptr | ||
) |
Reallocates dataset to last checkpoint size.
id | Variable ID. |
ptr | Pointer to the variable. |
int FTI_Recover | ( | ) |
It loads the checkpoint data.
This function loads the checkpoint data from the checkpoint file and it updates some basic checkpoint information.
int FTI_RecoverVar | ( | int | id | ) |
During the restart, recovers the given variable.
id | Variable to recover |
During a restart process, this function recovers the variable specified by the given id. No effect during a regular execution. The variable must have already been protected, otherwise, FTI_NSCS is returned. Improvements to be done:
int FTI_RenameGroup | ( | FTIT_H5Group * | h5group, |
char * | name | ||
) |
Renames an HDF5 group.
h5group | H5 group that we want to rename |
name | New name of the H5 group |
This function renames an HDF5 group defined by the user.
int FTI_SendFile | ( | char * | lpath, |
char * | rpath | ||
) |
Copies file asynchronously from 'lpath' to 'rpath'.
lpath | absolute path local file. |
rpath | absolute path remote file. |
This function may be used to copy a file local on the nodes via the FTI head process asynchronously to the PFS. The file will not be removed after successful transfer, however, if stored in the directory returned by 'FTI_GetStageDir' it will be removed during 'FTI_Finalize'.
int FTI_Snapshot | ( | ) |
Takes an FTI snapshot or recovers the data if it is a restart.
This function loads the checkpoint data from the checkpoint file in case of restart. Otherwise, it checks if the current iteration requires checkpointing; if it does, it checks which checkpoint level applies, writes the data in the files and communicates with the head of the node to inform it that a checkpoint has been taken. Checkpoint ID and counters are updated.
int FTI_Status | ( | ) |
It returns the current status of the recovery flag.
This function returns the current status of the recovery flag.
MPI_Comm FTI_COMM_WORLD |
MPI communicator that splits the global one into an application part and an FTI part.