Fault Tolerance Interface
Default Configuration File
1 ############## FTI CONFIGURATION FILE ###############
2 
3 # *****************************************************************
4 # *** Here are the main parameters you should provide to FTI ******
5 # *****************************************************************
6 [Basic]
7 
8 # Set to 1 if you want to dedicate 1 MPI rank per node to FTI
9 # Set to 0 if you want ALL ckpt. post-processing to be done inline
10 Head = 0
11 
12 # The number of processes launched per node (Same for every node)
13 # including FTI-dedicated process.
14 Node_size = 2
15 
16 # LOCAL directory where the local checkpoints will be stored
17 # This directory MUST exist and have write access
18 Ckpt_dir = ./Local #/path/to/local/storage/
19 
20 # GLOBAL directory where the global checkpoints will be stored
21 # This directory MUST exist and have write access
22 Glbl_dir = ./Global #/path/to/global/storage/
23 
24 # GLOBAL directory where the FTI metadata will be stored
25 # This directory MUST exist and have write access
26 Meta_dir = ./Meta #/home/username/.fti
27 
28 # Interval in minutes between Level 1 ckpts (Local write)
29 Ckpt_L1 = 3
30 
31 # Interval in minutes between Level 2 ckpts (Partner copy)
32 Ckpt_L2 = 5
33 
34 # Interval in minutes between Level 3 ckpts (Reed-Solomon)
35 Ckpt_L3 = 7
36 
37 # Interval in minutes between Level 4 ckpts (PFS write)
38 Ckpt_L4 = 11
39 
40 # dCP interval in minutes for level 4 checkpoints
41 # dCP - differential checkpointing
42 # This setting requires ckpt_io = 3 (FTI-FF) and Enable_dCP = 1
43 Dcp_L4 = 0
44 
45 # 1 if Level 2 ckpt is inline (synchronous) 0 if not (asynchronous)
46 Inline_L2 = 1
47 
48 # 1 if Level 3 ckpt is inline (synchronous) 0 if not (asynchronous)
49 Inline_L3 = 1
50 
51 # 1 if Level 4 ckpt is inline (synchronous) 0 if not (asynchronous)
52 Inline_L4 = 1
53 
54 # Set to 1 if you want to save the last checkpoint taken before finalize
55 # Set to 0 if you want to erase all checkpoints after finalize
56 keep_last_ckpt = 0
57 
58 # When enabled (set to 1), all level 4 checkpoints of the execution will be kept in 'Glbl_dir/l4_archive'
59 keep_l4_ckpt = 0
60 
61 # The size of the encoding groups (Something between 4 and 16)
62 # The total number of nodes MUST be multiple of this parameter
63 Group_size = 4
64 
65 # Number of iterations between iteration length sync (0 => 512 iterations)
66 # If your app has iterations of varying length, set this value between 1 and 10
67 max_sync_intv = 0
68 
69 # Set to:
70 # 1 -> POSIX
71 # 2 -> MPI-IO
72 # 3 -> FTI-FF
73 # 4 -> SIONLib
74 # 5 -> HDF5.
75 ckpt_io = 1
76 
77 # Enable staging feature
78 Enable_Staging = 0
79 
80 # Enable differential checkpointing (dCP)
81 Enable_dCP = 0
82 
83 # Select dCP hashing algorithm:
84 # 1 -> MD5
85 # 2 -> CRC32
86 # The mode may also be set through the environment variable 'FTI_DCP_HASH_MODE=[0|1]'
87 # This will override the setting from the configuration file!
88 dCP_Mode = 0
89 
90 # Set hash-partition block size
91 # The partition block size, b, must be: 512 < b < USHRT_MAX (Bytes)
92 # b may also be set through the environment variable 'FTI_DCP_BLOCK_SIZE=b (in bytes)'
93 # This will override the setting from the configuration file!
94 dCP_Block_Size = 16384
95 
96 # The verbosity of FTI. (2 is recommended)
97 # 3 (Print only errors, silent mode)
98 # 2 (Print errors and a few important messages)
99 # 1 (Print debug messages, very verbose)
100 Verbosity = 2
101 
102 # *****************************************************************
103 # *** Change these parameters ONLY in case of restart ***********
104 # *****************************************************************
105 
106 [Restart]
107 
108 # Set this to 0 if you are launching this job for the first time
109 # Set this to 1 if you are recovering this job after a failure
110 Failure = 0
111 
112 # Set with the execution ID in case of restart after failure
113 # Set to NULL if normal execution
114 Exec_ID = XXXX-XX-XX_XX-XX-XX
115 
116 
117 # *****************************************************************
118 # *** Change these parameters to inject failures. ***********
119 # *****************************************************************
120 
121 [Injection]
122 
123 # Rank of the process that injects the failures
124 rank = 0
125 
126 # Total number of bit-flips to inject
127 number = 0
128 
129 # Bit position of the injection
130 position = 0
131 
132 # Injection frequency in seconds
133 frequency = 0
134 
135 
136 # *****************************************************************
137 # *** Change something here ONLY if you know what you are doing ***
138 # *****************************************************************
139 [Advanced]
140 
141 # The ckpt files are decomposed in blocks of size Block_size KB
142 Block_size = 1024
143 
144 # The ckpt files are transferred in chunks of size Transfer_size MB
145 # from local to PFS
146 Transfer_size = 16
147 
148 # The tags for MPI communications done within the FTI library
149 general_tag = 2612
150 ckpt_tag = 711
151 stage_tag = 406
152 final_tag = 3107
153 
154 # Set to 1 if you are running a local test on a single computer
155 Local_test = 1
156 
157 #This option only has an effect if -DENABLE_LUSTRE was passed to the CMake command.
158 #It sets the striping unit for the MPI-IO file.
159 lustre_striping_unit = 4194304
160 
161 #This option only has an effect if -DENABLE_LUSTRE was passed to the CMake command.
162 #It sets the striping factor for the MPI-IO file.
163 lustre_striping_factor = -1
164 
165 #This option only has an effect if -DENABLE_LUSTRE was passed to the CMake command.
166 #It sets the striping offset for the MPI-IO file.
167 lustre_striping_offset = -1