| 1 | == What is OpenCHK? |
| 2 | |
| 3 | The OpenCHK model is a pragma-based checkpointing model developed by the Programming Models group at the Barcelona Supercomputing Center. |
| 4 | |
| 5 | The aim of this model is to provide a generic and portable way to checkpoint and recover data in C/C++ and Fortran High Performance Computing applications. |
| 6 | |
| 7 | The prototype implementation of the model is based on two software components: |
| 8 | |
| 9 | * Mercurium source-to-source compiler |
| 10 | * Transparent Checkpoint Library (TCL) |
| 11 | |
| 12 | = Loading the OpenCHK module |
| 13 | |
| 14 | First, prepend the following paths to your "MODULEPATH" environment variable: |
| 15 | |
| 16 | |
| 17 | {{{ |
| 18 | modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/Core:$modulepath" |
| 19 | modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/Compiler/mpi/intel/2019.0.117-GCC-7.3.0:$modulepath" |
| 20 | modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/MPI/intel/2019.0.117-GCC-7.3.0/psmpi/5.2.1-1-mt:$modulepath" |
| 21 | export MODULEPATH="$modulepath:$MODULEPATH" |
| 22 | }}} |
| 23 | |
| 24 | |
| 25 | Once this is done, simply load the following modules: |
| 26 | |
| 27 | |
| 28 | {{{ |
| 29 | module load Intel/2019.0.117-GCC-7.3.0 |
| 30 | module load ParaStationMPI/5.2.1-1-mt |
| 31 | module load OpenCHK/1.0 |
| 32 | }}} |
| 33 | |
| 34 | = Documentation and User guide |
| 35 | |
| 36 | Manual: https://github.com/bsc-pm/OpenCHK-model |
| 37 | |
| 38 | = Quick Start Guide |
| 39 | |
| 40 | == Before the Execution |
| 41 | TCL provides support for three different backends: FTI, SCR and VeloC. The backend library must be chosen using the environment variable "TCL_BACKEND". \\ |
| 42 | NOTE: In the current installation, the only enabled backend is FTI. |
| 43 | |
| 44 | {{{ |
| 45 | export TCL_BACKEND=FTI |
| 46 | }}} |
| 47 | |
| 48 | |
| 49 | When using FTI as backend library, the user needs to provide an FTI configuration file using the environment variable "FTI_CONF_FILE". (see attachments: config.fti) |
| 50 | |
| 51 | {{{ |
| 52 | export FTI_CONF_FILE=config.fti |
| 53 | }}} |
| 54 | |
| 55 | |
| 56 | == OpenCHK syntax |
| 57 | * The ''init'' construct defines the initialization of a checkpoint context. No other construct may be used before a checkpoint context has been initialized. |
| 58 | * The ''shutdown'' construct defines the finalization of a checkpoint context. |
| 59 | * The ''store'' construct specifies that some variables and memory regions are going to be saved in a new checkpoint. |
| 60 | * The ''load'' construct specifies that some variables and memory regions are going to be updated with the values stored in a previous checkpoint (if any). |
| 61 | |
| 62 | == Example C/C++ |
| 63 | Compile the example: |
| 64 | |
| 65 | {{{ |
| 66 | mpicxx -cxx=mcxx --checkpoint -o test_scalar-cpp test_scalar.cpp |
| 67 | }}} |
| 68 | |
| 69 | Run the example: \\ |
| 70 | NOTE: Remember that you should have set TCL_BACKEND=FTI and a valid FTI_CONF_FILE. |
| 71 | |
| 72 | {{{ |
| 73 | mpirun -np 8 ./test_scalar-cpp [step_to_inject_error] |
| 74 | }}} |
| 75 | |
| 76 | |
| 77 | {{{#!C++ |
| 78 | #include <iostream> |
| 79 | #include <cassert> |
| 80 | #include <stdlib.h> |
| 81 | #include <mpi.h> |
| 82 | void error_handler(int err_code) |
| 83 | { |
| 84 | std::cout << "Error inside CheckpointLib. Error code: " << err_code << "." << std::endl; |
| 85 | exit(-1); |
| 86 | } |
| 87 | |
| 88 | // Checkpoint/restart demo: runs steps 0..9 and checkpoints each one; the optional argument is the step at which to simulate a crash. |
| 89 | int main(int argc, char **argv) { |
| 90 |     int err = MPI_Init(&argc, &argv); |
| 91 |     assert(err == MPI_SUCCESS); |
| 92 |     MPI_Comm comm = MPI_COMM_WORLD; |
| 93 |     // -1 never equals a step index, so no failure is injected unless a step is given on the command line. |
| 94 |     int inject_error = -1; |
| 95 |     if(argc == 2) { |
| 96 |         inject_error = std::atoi(argv[1]); |
| 97 |         std::cout << "Inject error at step " << inject_error << "." << std::endl; |
| 98 |     } |
| 99 |     // Open the checkpoint context on the communicator; the shutdown directive below closes it. |
| 100 | #pragma chk init comm(comm) |
| 101 |     { |
| 102 |         int data = 0, i = 0; |
| 103 |         bool restored = false; |
| 104 |         // If an earlier run left a checkpoint, load overwrites i and data with the saved values. |
| 105 | #pragma chk load(i, data) |
| 106 |         if(i != 0) { // NOTE: a checkpoint taken at step 0 is indistinguishable from a fresh start here |
| 107 |             std::cout << "Restored data from iteration " << i << ". data = " << data << "." << std::endl; |
| 108 |             restored = true; |
| 109 |         } |
| 110 |         for(; i < 10; i++) { // i already holds the start step: 0, or the restored iteration |
| 111 |             data = i; // value saved by the store directive below; its level clause cycles through 1..4 |
| 112 | #pragma chk store(i, data) kind(CHK_FULL) id(i) level((i%4)+1) if(1) handler(error_handler) |
| 113 |             if(i == inject_error && !restored) { // fail only on the first run, not after a restart |
| 114 |                 std::cout << "Injected error." << std::endl; |
| 115 |                 exit(-1); // deliberately skips shutdown and MPI_Finalize to mimic a crash |
| 116 |             } |
| 117 |             std::cout << "Completed step " << i << std::endl; |
| 118 |         } |
| 119 |     } |
| 120 | #pragma chk shutdown |
| 121 | |
| 122 |     MPI_Finalize(); |
| 123 | } |
| 124 | }}} |
| 125 | |
| 126 | == Example Fortran |
| 127 | Compile the example: |
| 128 | |
| 129 | {{{ |
| 130 | mpif90 -fc=ifort-mfc --checkpoint -o test_scalar-fortran test_scalar.f90 |
| 131 | }}} |
| 132 | |
| 133 | Run the example: \\ |
| 134 | NOTE: Remember that you should have set TCL_BACKEND=FTI and a valid FTI_CONF_FILE. |
| 135 | |
| 136 | {{{ |
| 137 | mpirun -np 8 ./test_scalar-fortran [step_to_inject_error] |
| 138 | }}} |
| 139 | |
| 140 | |
| 141 | {{{#!Fortran |
| 142 | PROGRAM T1 ! Checkpoint/restart demo: checkpoints every step; optional argument = step at which to simulate a crash |
| 143 |   include 'mpif.h' |
| 144 |   integer rank, size, ierror, tag, comm, status(MPI_STATUS_SIZE) |
| 145 |   integer :: actual_data = 0, restored_i = 0, restored = 0, inject_error = -1, i |
| 146 |   CHARACTER(len=32) :: arg |
| 147 |   ! The '::' separator above is required because the declaration carries initial values. |
| 148 |   call MPI_INIT(ierror) |
| 149 |   comm = MPI_COMM_WORLD |
| 150 | |
| 151 |   if (iargc() == 1) then ! inject_error stays -1 (matches no step) unless a step is given |
| 152 |     call getarg(1, arg) |
| 153 |     read (arg,'(I10)') inject_error |
| 154 |     print *, 'Inject error at step ', inject_error, '.' |
| 155 |   endif |
| 156 |   ! Open the checkpoint context, then restore restored_i and actual_data if an earlier run left a checkpoint. |
| 157 | !$chk init comm(comm) |
| 158 | !$chk load(restored_i, actual_data) |
| 159 |   if (restored_i .ne. 0) then ! NOTE: a checkpoint taken at step 0 is indistinguishable from a fresh start |
| 160 |     print *, 'Restored data from iteration ', restored_i , '. data = ', actual_data, '.' |
| 161 |     restored = 1 |
| 162 |   end if |
| 163 |   ! Resume from the restored step; each step is checkpointed before the optional injected failure. |
| 164 |   do i = restored_i, 10 |
| 165 |     actual_data = i |
| 166 | !$chk store(i, actual_data) kind(0) id(i) level(mod(i,4)+1) if(i .ge. 0) |
| 167 |     if (i .eq. inject_error .AND. restored .eq. 0) then ! fail only on the first run, not after a restart |
| 168 |       print *, 'Injected error' |
| 169 |       call EXIT(-1) ! deliberately skips shutdown and MPI_FINALIZE to mimic a crash |
| 170 |     end if |
| 171 |     print *, 'Completed step ', i |
| 172 |   end do |
| 173 | !$chk shutdown |
| 174 | |
| 175 |   call MPI_FINALIZE(ierror) |
| 176 | END PROGRAM |
| 177 | }}} |