############## FTI CONFIGURATION FILE ###############

# *****************************************************************
# *** Here are the main parameters you should provide to FTI ******
# *****************************************************************

[Basic]
# Set to 1 if you want to dedicate 1 MPI rank per node to FTI
# Set to 0 if you want ALL ckpt. post-processing to be done inline
Head = 0

# The number of processes launched per node (same for every node),
# including the FTI-dedicated process.
Node_size = 2

# LOCAL directory where the local checkpoints will be stored
# This directory MUST exist and have write access
# Example: /path/to/local/storage/
Ckpt_dir = ./Local

# GLOBAL directory where the global checkpoints will be stored
# This directory MUST exist and have write access
# Example: /path/to/global/storage/
Glbl_dir = ./Global

# GLOBAL directory where the FTI metadata will be stored
# This directory MUST exist and have write access
# Example: /home/username/.fti
Meta_dir = ./Meta

# Interval in minutes between level 1 (L1) ckpts (local write)
Ckpt_L1 = 3

# Interval in minutes between level 2 (L2) ckpts (partner copy)
Ckpt_L2 = 5

# Interval in minutes between level 3 (L3) ckpts (Reed-Solomon)
Ckpt_L3 = 7

# Interval in minutes between level 4 (L4) ckpts (PFS write)
Ckpt_L4 = 11

# dCP (differential checkpointing) interval in minutes for level 4 checkpoints
# This setting requires ckpt_io = 3 (FTI-FF) and Enable_dCP = 1
Dcp_L4 = 0

# 1 if Level 2 ckpt is inline (synchronous), 0 if not (asynchronous)
Inline_L2 = 1

# 1 if Level 3 ckpt is inline (synchronous), 0 if not (asynchronous)
Inline_L3 = 1

# 1 if Level 4 ckpt is inline (synchronous), 0 if not (asynchronous)
Inline_L4 = 1

# Set to 1 if you want to save the last checkpoint taken before finalize
# Set to 0 if you want to erase all checkpoints after finalize
keep_last_ckpt = 0

# If enabled, all level 4 checkpoints of the execution will be kept
# in 'Glbl_dir/l4_archive'
keep_l4_ckpt = 0

# The size of the encoding groups (something between 4 and 16)
# The total number of nodes MUST be a multiple of this parameter
Group_size = 4

# Number of iterations between iteration length sync (0 => 512 iterations)
# If your app has iterations of varying length, set this value between 1 and 10
max_sync_intv = 0

# Checkpoint I/O mode. Set to:
# 1 -> POSIX
# 2 -> MPI-IO
# 3 -> FTI-FF
# 4 -> SIONLib
# 5 -> HDF5
ckpt_io = 1

# Enable staging feature
Enable_Staging = 0

# Enable differential checkpointing (dCP)
Enable_dCP = 0

# Select dCP hashing algorithm:
# 1 -> MD5
# 2 -> CRC32
# The mode may also be set by the environment variable 'FTI_DCP_HASH_MODE=[0|1]'
# This will overwrite the setting from the configuration file!
# NOTE(review): the values listed above (1, 2), the range given for the
# environment variable ([0|1]) and the value configured below (0) do not
# agree with each other; confirm the valid values against the FTI documentation.
dCP_Mode = 0

# Set hash-partition block size
# The partition block size, b, must be: 512 < b < USHRT_MAX (bytes)
# b may also be set by the environment variable 'FTI_DCP_BLOCK_SIZE=b' (in bytes)
# This will overwrite the setting from the configuration file!
dCP_Block_Size = 16384

# The verbosity of FTI (2 is recommended)
# 3 (Print only errors, silent mode)
# 2 (Print errors and a few important pieces of information)
# 1 (Print debug messages, very verbose)
Verbosity = 2

# *****************************************************************
# *** Change these parameters ONLY in case of restart ***********
# *****************************************************************

[Restart]
# Set this to 0 if you are launching this job for the first time
# Set this to 1 if you are recovering this job after a failure
Failure = 0

# Set to the execution ID in case of restart after failure
# Set to NULL if normal execution
Exec_ID = XXXX-XX-XX_XX-XX-XX

# *****************************************************************
# *** Change these parameters to inject failures. ***********
# *****************************************************************

[Injection]
# Rank of the process that injects the failures
rank = 0

# Total number of bit-flips to inject
number = 0

# Bit position of the injection
position = 0

# Injection frequency in seconds
frequency = 0

# *****************************************************************
# *** Change something here ONLY if you know what you are doing ***
# *****************************************************************

[Advanced]
# The ckpt files are decomposed in blocks of size Block_size KB
Block_size = 1024

# The ckpt files are transferred in chunks of size Transfer_size MB
# from local to PFS
Transfer_size = 16

# The tags for MPI communications done within the FTI library
general_tag = 2612
ckpt_tag = 711
stage_tag = 406
final_tag = 3107

# Set to 1 if you are running a local test on a single computer
Local_test = 1

# This option only has an effect if -DENABLE_LUSTRE was added to the CMake command.
# It sets the striping unit for the MPI-IO file.
lustre_striping_unit = 4194304

# This option only has an effect if -DENABLE_LUSTRE was added to the CMake command.
# It sets the striping factor for the MPI-IO file.
lustre_striping_factor = -1

# This option only has an effect if -DENABLE_LUSTRE was added to the CMake command.
# It sets the striping offset for the MPI-IO file.
lustre_striping_offset = -1