McGrattan, Kevin B. Dr. (Fed)
2017-09-15 21:39:32 UTC
I am using MPI_FILE_WRITE_AT to print out the timings of subroutines in a big Fortran code. I have noticed since upgrading to Open MPI 2.1.1 that sometimes the file to be written is corrupted. Each MPI process is supposed to write out a character string that is 159 characters in length, plus a line feed. Sometimes, I see
^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@
instead of the character string. I cannot reproduce the problem consistently. That is, sometimes the file is fine, and sometimes the records are corrupted randomly. The subroutine is included below. I added some MPI_BARRIERs hoping that this would prevent the file from being closed too early, but that did not help.
SUBROUTINE DUMP_TIMERS

! Write the timers of all MPI processes to the file FN_CPU ('file_cpu.csv') using MPI-IO.
!
! The file is made of fixed-length records of LINE_LENGTH characters followed by a line feed.
! Record 0 is the column header, written by process 0 only; record MYID+1 holds the timers of
! process MYID. The file view uses the record type as its elementary type, so the offsets passed
! to MPI_FILE_WRITE_AT count records, not bytes.
!
! Side effects: FN_CPU is set, and T_USED(1) is overwritten with the time spent in the main routine.
!
! The return code of every MPI file routine is checked and the first failure on this process is
! reported on standard error. The MPI standard makes MPI_ERRORS_RETURN the default error handler
! for files, so without these checks a failed open or write would go unnoticed. The sequence of
! MPI calls is the same on every process whether or not an error occurred.
!
! NOTE(review): the file is opened without being truncated or deleted first, so records left by an
! earlier run that used more MPI processes would remain at the end of the file - confirm whether
! that matters for the consumers of this file.

USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY: ERROR_UNIT

INTEGER, PARAMETER :: LINE_LENGTH=159   ! 5 + 14*11 characters, the width produced by the format used for LINE
CHARACTER, PARAMETER :: LF=ACHAR(10)
CHARACTER(LEN=LINE_LENGTH+1) :: LINE,HEAD
CHARACTER(LEN=MPI_MAX_ERROR_STRING) :: MESSAGE
INTEGER :: ERROR,RECORD,FH,FIRST_ERROR,MESSAGE_LENGTH

FIRST_ERROR = MPI_SUCCESS

CALL MPI_BARRIER(MPI_COMM_WORLD,ERROR)

FN_CPU = 'file_cpu.csv'

! One record = LINE_LENGTH characters + line feed

CALL MPI_TYPE_CONTIGUOUS(LINE_LENGTH+1,MPI_CHARACTER,RECORD,ERROR)
CALL MPI_TYPE_COMMIT(RECORD,ERROR)

CALL MPI_FILE_OPEN(MPI_COMM_WORLD,FN_CPU,IOR(MPI_MODE_WRONLY,MPI_MODE_CREATE),MPI_INFO_NULL,FH,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

! The data representation name is spelled in lower case, as in the MPI standard

CALL MPI_FILE_SET_VIEW(FH,0_MPI_OFFSET_KIND,RECORD,RECORD,'native',MPI_INFO_NULL,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

! T_USED(1) is the time spent in the main routine; i.e. the time not spent in some other routine

T_USED(1) = SECOND() - T_USED(1) - SUM(T_USED(2:N_TIMERS))

! The internal write pads LINE with blanks; its last character is then replaced by the line feed

WRITE(LINE,'(I5,14(",",ES10.3))') MYID,(T_USED(I),I=1,N_TIMERS),SUM(T_USED(1:N_TIMERS))
LINE(LINE_LENGTH+1:LINE_LENGTH+1) = LF

IF (MYID==0) THEN
   ! Character assignment pads the header with blanks up to the record length
   HEAD = 'Rank,MAIN,DIVG,MASS,VELO,PRES,WALL,DUMP,PART,RADI,FIRE,COMM,EVAC,HVAC,Total T_USED (s)'
   HEAD(LINE_LENGTH+1:LINE_LENGTH+1) = LF
   CALL MPI_FILE_WRITE_AT(FH,0_MPI_OFFSET_KIND,HEAD,1,RECORD,MPI_STATUS_IGNORE,ERROR)
   IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR
ENDIF

CALL MPI_FILE_WRITE_AT(FH,INT(MYID+1,MPI_OFFSET_KIND),LINE,1,RECORD,MPI_STATUS_IGNORE,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

CALL MPI_BARRIER(MPI_COMM_WORLD,ERROR)

CALL MPI_FILE_CLOSE(FH,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

CALL MPI_TYPE_FREE(RECORD,ERROR)

! Report the first MPI-IO failure seen by this process

IF (FIRST_ERROR/=MPI_SUCCESS) THEN
   MESSAGE = ' '
   CALL MPI_ERROR_STRING(FIRST_ERROR,MESSAGE,MESSAGE_LENGTH,ERROR)
   WRITE(ERROR_UNIT,'(A,I0,A,A,A,A)') 'ERROR: MPI process ',MYID,' failed to write ',TRIM(FN_CPU), &
                                      ': ',TRIM(MESSAGE)
ENDIF

END SUBROUTINE DUMP_TIMERS
^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@
Instead of the character string. I cannot reproduce the problem consistently. That is, sometimes the file is fine, and sometimes the records are corrupted randomly. The subroutine is included below. I added some MPI_BARRIERs hoping that this would prevent the file from being closed too early, but that did not help.
SUBROUTINE DUMP_TIMERS

! Write the timers of all MPI processes to the file FN_CPU ('file_cpu.csv') using MPI-IO.
!
! The file is made of fixed-length records of LINE_LENGTH characters followed by a line feed.
! Record 0 is the column header, written by process 0 only; record MYID+1 holds the timers of
! process MYID. The file view uses the record type as its elementary type, so the offsets passed
! to MPI_FILE_WRITE_AT count records, not bytes.
!
! Side effects: FN_CPU is set, and T_USED(1) is overwritten with the time spent in the main routine.
!
! The return code of every MPI file routine is checked and the first failure on this process is
! reported on standard error. The MPI standard makes MPI_ERRORS_RETURN the default error handler
! for files, so without these checks a failed open or write would go unnoticed. The sequence of
! MPI calls is the same on every process whether or not an error occurred.
!
! NOTE(review): the file is opened without being truncated or deleted first, so records left by an
! earlier run that used more MPI processes would remain at the end of the file - confirm whether
! that matters for the consumers of this file.

USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY: ERROR_UNIT

INTEGER, PARAMETER :: LINE_LENGTH=159   ! 5 + 14*11 characters, the width produced by the format used for LINE
CHARACTER, PARAMETER :: LF=ACHAR(10)
CHARACTER(LEN=LINE_LENGTH+1) :: LINE,HEAD
CHARACTER(LEN=MPI_MAX_ERROR_STRING) :: MESSAGE
INTEGER :: ERROR,RECORD,FH,FIRST_ERROR,MESSAGE_LENGTH

FIRST_ERROR = MPI_SUCCESS

CALL MPI_BARRIER(MPI_COMM_WORLD,ERROR)

FN_CPU = 'file_cpu.csv'

! One record = LINE_LENGTH characters + line feed

CALL MPI_TYPE_CONTIGUOUS(LINE_LENGTH+1,MPI_CHARACTER,RECORD,ERROR)
CALL MPI_TYPE_COMMIT(RECORD,ERROR)

CALL MPI_FILE_OPEN(MPI_COMM_WORLD,FN_CPU,IOR(MPI_MODE_WRONLY,MPI_MODE_CREATE),MPI_INFO_NULL,FH,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

! The data representation name is spelled in lower case, as in the MPI standard

CALL MPI_FILE_SET_VIEW(FH,0_MPI_OFFSET_KIND,RECORD,RECORD,'native',MPI_INFO_NULL,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

! T_USED(1) is the time spent in the main routine; i.e. the time not spent in some other routine

T_USED(1) = SECOND() - T_USED(1) - SUM(T_USED(2:N_TIMERS))

! The internal write pads LINE with blanks; its last character is then replaced by the line feed

WRITE(LINE,'(I5,14(",",ES10.3))') MYID,(T_USED(I),I=1,N_TIMERS),SUM(T_USED(1:N_TIMERS))
LINE(LINE_LENGTH+1:LINE_LENGTH+1) = LF

IF (MYID==0) THEN
   ! Character assignment pads the header with blanks up to the record length
   HEAD = 'Rank,MAIN,DIVG,MASS,VELO,PRES,WALL,DUMP,PART,RADI,FIRE,COMM,EVAC,HVAC,Total T_USED (s)'
   HEAD(LINE_LENGTH+1:LINE_LENGTH+1) = LF
   CALL MPI_FILE_WRITE_AT(FH,0_MPI_OFFSET_KIND,HEAD,1,RECORD,MPI_STATUS_IGNORE,ERROR)
   IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR
ENDIF

CALL MPI_FILE_WRITE_AT(FH,INT(MYID+1,MPI_OFFSET_KIND),LINE,1,RECORD,MPI_STATUS_IGNORE,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

CALL MPI_BARRIER(MPI_COMM_WORLD,ERROR)

CALL MPI_FILE_CLOSE(FH,ERROR)
IF (ERROR/=MPI_SUCCESS .AND. FIRST_ERROR==MPI_SUCCESS) FIRST_ERROR = ERROR

CALL MPI_TYPE_FREE(RECORD,ERROR)

! Report the first MPI-IO failure seen by this process

IF (FIRST_ERROR/=MPI_SUCCESS) THEN
   MESSAGE = ' '
   CALL MPI_ERROR_STRING(FIRST_ERROR,MESSAGE,MESSAGE_LENGTH,ERROR)
   WRITE(ERROR_UNIT,'(A,I0,A,A,A,A)') 'ERROR: MPI process ',MYID,' failed to write ',TRIM(FN_CPU), &
                                      ': ',TRIM(MESSAGE)
ENDIF

END SUBROUTINE DUMP_TIMERS