Changes between Initial Version and Version 1 of Public/User_Guide/OpenCHK


Ignore:
Timestamp:
Jun 18, 2019, 11:20:54 AM (5 years ago)
Author:
Marcos Maronas Bravo
Comment:

Legend:

Unmodified
Added
Removed
Modified
  • Public/User_Guide/OpenCHK

    v1 v1  
     1== What is OpenCHK? 
     2
     3The OpenCHK model is a pragma-based checkpointing model developed by the Programming Models group at the Barcelona Supercomputing Center.
     4
     5The aim of this model is to provide a generic and portable way to checkpoint and recover data in C/C++ and Fortran High Performance Computing applications.
     6
     7The prototype implementation of the model is based on two software components:
     8
     9* Mercurium source-to-source compiler
     10* Transparent Checkpoint Library
     11
     12= Loading the OpenCHK module
     13 
     14First, prepend the following paths to your "MODULEPATH" environment variable:
     15
     16
     17{{{
     18modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/Core:$modulepath"
     19modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/Compiler/mpi/intel/2019.0.117-GCC-7.3.0:$modulepath"
     20modulepath="/usr/local/software/skylake/Stages/2018b/modules/all/MPI/intel/2019.0.117-GCC-7.3.0/psmpi/5.2.1-1-mt:$modulepath"
     21export MODULEPATH="$modulepath:$MODULEPATH"
     22}}}
     23
     24
     25Once this is done, simply load the following modules:
     26
     27
     28{{{
     29module load Intel/2019.0.117-GCC-7.3.0
     30module load ParaStationMPI/5.2.1-1-mt
     31module load OpenCHK/1.0
     32}}}
     33
     34= Documentation and User guide 
     35
     36Manual: https://github.com/bsc-pm/OpenCHK-model
     37
     38= Quick Start Guide
     39
     40== Before the Execution
     41TCL (the Transparent Checkpoint Library) supports three different backends: FTI, SCR and VeloC. The backend library must be selected with the environment variable "TCL_BACKEND". \\
     42NOTE: In the current installation, the only enabled backend is FTI.
     43
     44{{{
     45export TCL_BACKEND=FTI
     46}}}
     47
     48
     49When using FTI as backend library, the user needs to provide an FTI configuration file using the environment variable "FTI_CONF_FILE". (see attachments: config.fti)
     50
     51{{{
     52export FTI_CONF_FILE=config.fti
     53}}}
     54
     55
     56== OpenCHK syntax
     57* The ''init'' construct defines the initialization of a checkpoint context. No other construct may be used before a checkpoint context has been initialized. \\
     58* The ''shutdown'' construct defines the finalization of a checkpoint context.
     59* The ''store'' construct specifies that some variables and memory regions are going to be saved in a new checkpoint.
     60* The ''load'' construct specifies that some variables and memory regions are going to be updated with the values stored in a previous checkpoint (if any).
     61
     62== Example C/C++
     63Compile the example:
     64
     65{{{
     66mpicxx -cxx=mcxx  --checkpoint -o test_scalar-cpp test_scalar.cpp
     67}}}
     68
     69Run the example: \\
     70NOTE: Remember that you should have set TCL_BACKEND=FTI and a valid FTI_CONF_FILE.
     71
     72{{{
     73mpirun -np 8 ./test_scalar-cpp [step_to_inject_error]
     74}}}
     75
     76
     77{{{#!C++
     78#include <iostream>
     79#include <cassert>
     80#include <stdlib.h>
     81
     82void error_handler(int err_code)
     83{
     84    std::cout << "Error inside CheckpointLib. Error code: " << err_code << "." << std::endl;
     85    exit(-1);
     86}
     87
     88int main(int argc, char **argv)
     89{
     90    int err = MPI_Init(&argc, &argv);
     91    assert(err == MPI_SUCCESS);
     92    MPI_Comm comm = MPI_COMM_WORLD;
     93
     94    int inject_error = -1;
     95    if(argc == 2) {
     96        inject_error = std::atoi(argv[1]);
     97        std::cout << "Inject error at step " << inject_error << "." << std::endl;
     98    }
     99
     100    #pragma chk init comm(comm)
     101    {
     102        int data, i = 0;
     103        bool restored = false;
     104
     105        #pragma chk load(i, data)
     106        if(i != 0) {
     107            std::cout << "Restored data from iteration " << i << ". data = " << data << "." << std::endl;
     108            restored = true;
     109        }
     110        for(i; i < 10; i++) {
     111            data = i;
     112            #pragma chk store(i, data) kind(CHK_FULL) id(i) level((i%4)+1) if(1) handler(error_handler)
     113            if(i == inject_error && !restored) {
     114                std::cout << "Injected error." << std::endl;
     115                exit(-1);
     116            }
     117            std::cout << "Completed step " << i << std::endl;
     118        }
     119    }
     120    #pragma chk shutdown
     121
     122    MPI_Finalize();
     123}
     124}}}
     125
     126== Example Fortran
     127Compile the example:
     128
     129{{{
     130mpif90 -fc=ifort-mfc  --checkpoint -o test_scalar-fortran test_scalar.f90
     131}}}
     132
     133Run the example: \\
     134NOTE: Remember that you should have set TCL_BACKEND=FTI and a valid FTI_CONF_FILE.
     135
     136{{{
     137mpirun -np 8 ./test_scalar-fortran [step_to_inject_error]
     138}}}
     139
     140
     141{{{#!Fortran
     142PROGRAM T1
     143   include 'mpif.h'
     144   integer rank, size, ierror, tag, comm, status(MPI_STATUS_SIZE)
     145   integer actual_data = 0, restored_i = 0, restored = 0, inject_error = -1, i
     146    CHARACTER(len=32) :: arg
     147
     148   call MPI_INIT(ierror)
     149   comm = MPI_COMM_WORLD
     150
     151   if (iargc() == 1) then
     152       call getarg(1, arg)
     153       read (arg,'(I10)') inject_error
     154       print *, 'Inject error at step ', inject_error, '.'
     155   endif
     156
     157!$chk init comm(comm)
     158!$chk load(restored_i, actual_data)
     159   if (restored_i .ne. 0) then
     160       print *, 'Restored data from iteration ', restored_i , '. data = ', actual_data, '.'
     161       restored = 1
     162   end if
     163
     164   do i = restored_i, 10
     165       actual_data = i
     166!$chk store(i, actual_data) kind(0) id(i) level(mod(i,4)+1) if(i .ge. 0)
     167       if (i .eq. inject_error .AND. restored .eq. 0) then
     168           print *, 'Injected error'
     169           call EXIT(-1)
     170       end if
     171       print *, 'Completed step ', i
     172   end do
     173!$chk shutdown
     174
     175   call MPI_FINALIZE(ierror)
     176END PROGRAM
     177}}}