[dcmf] [PATCH 2/5] introducing a lockless "ad_ufs". ROMIO will use this method if it finds

Bob Cernohous bobc at us.ibm.com
Fri Feb 15 09:44:57 CST 2008


What's the functional difference between your new 
ADIOI_NOLOCK_WriteStrided and ADIOI_GEN_WriteStrided_naive?  In an earlier 
patch we enabled ADIOI_GEN_WriteStrided_naive when romio_ds_write was 
disabled.  Would using ADIOI_GEN_WriteStrided_naive in ad_ufs work well 
for you on PVFS?

diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c 
b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
index ce0f6a5..bc65198 100755
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
@@ -21,7 +21,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
     ADIOI_GEN_Fcntl, /* Fcntl */
     ADIOI_BGL_SetInfo, /* SetInfo */
     ADIOI_GEN_ReadStrided, /* ReadStrided */
-    ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
+    ADIOI_GEN_WriteStrided_naive, /*ADIOI_NOLOCK_WriteStrided, * 
WriteStrided */
     ADIOI_BGL_Close, /* Close */
 #ifdef ROMIO_HAVE_WORKING_AIO
     ADIOI_GEN_IreadContig, /* IreadContig */

The issue I'm having is that the ROMIO noncontig test cases fail on 
ufs:files on GPFS or NFS file systems when using ADIOI_NOLOCK_WriteStrided, 
but work when using ADIOI_GEN_WriteStrided_naive.  I'm wondering whether 
ADIOI_NOLOCK_WriteStrided works if PVFS is the underlying file system but 
not for more generic ad_ufs file systems.  I'm not comfortable enabling 
ad_ufs on bgl if it could be abused and cause failures on non-PVFS 
file systems.  I suppose we could explicitly put in some checks to disable 
it if the underlying file system isn't PVFS... Thoughts?

Bob Cernohous:  (T/L 553) 507-253-6093

BobC at us.ibm.com
IBM Rochester, Building 030-2(C335), Department 61L
3605 Hwy 52 North, Rochester,  MN 55901-7829

> Chaos reigns within.
> Reflect, repent, and reboot.
> Order shall return.


dcmf-bounces at lists.anl-external.org wrote on 02/12/2008 02:22:03 PM:

> 
> Signed-off-by: Rob Latham <robl at mcs.anl.gov>
> ---
>  lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c  |    2 +-
>  .../mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c |    3 +
>  .../mpich2/src/mpi/romio/adio/common/Makefile.in   |    2 +-
>  .../mpich2/src/mpi/romio/adio/common/ad_fstype.c   |   23 ++
>  .../src/mpi/romio/adio/common/ad_write_nolock.c    |  331 
> ++++++++++++++++++++
>  lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h  |    4 +
>  6 files changed, 363 insertions(+), 2 deletions(-)
>  create mode 100644 
lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> 
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c 
> b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> index 3e016ab..f311afc 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> @@ -20,7 +20,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
>      ADIOI_GEN_Fcntl, /* Fcntl */
>      ADIOI_GEN_SetInfo, /* SetInfo */
>      ADIOI_GEN_ReadStrided, /* ReadStrided */
> -    ADIOI_GEN_WriteStrided, /* WriteStrided */
> +    ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
>      ADIOI_GEN_Close, /* Close */
>      ADIOI_GEN_IreadContig, /* IreadContig */
>      ADIOI_GEN_IwriteContig, /* IwriteContig */
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c 
> b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> index 4e91594..1a0dfb9 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> @@ -12,6 +12,9 @@ void ADIOI_UFS_Open(ADIO_File fd, int *error_code)
>      int perm, old_mask, amode;
>      static char myname[] = "ADIOI_UFS_OPEN";
> 
> +    /* set internal variables for tuning environment variables */
> +    ad_bgl_get_env_vars(); 
> +
>      if (fd->perm == ADIO_PERM_NULL) {
>     old_mask = umask(022);
>     umask(old_mask);
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in 
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> index 627dc96..9e1d597 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> @@ -28,7 +28,7 @@ AD_OBJECTS = ad_close.o ad_init.o ad_end.o 
> ad_open.o flatten.o \
>        ad_write_str_naive.o ad_resize.o ad_read.o ad_write.o ad_iread.o 
\
>        ad_iwrite.o ad_done.o ad_wait.o adi_close.o ad_prealloc.o 
ad_fcntl.o \
>        ad_iread_fake.o ad_iwrite_fake.o ad_done_fake.o ad_wait_fake.o \
> -      ad_subarray.o ad_darray.o strfns.o
> +      ad_subarray.o ad_darray.o strfns.o ad_write_nolock.o
> 
>  all: $(LIBNAME)
>     @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c 
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> index 452596f..e90332d 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> @@ -187,6 +187,28 @@ static void ADIO_FileSysType_parentdir(char 
> *filename, char **dirnamep)
>  }
>  #endif /* ROMIO_NTFS */
> 
> +static void check_for_pvfs_exception(char *filename, int *fstype)
> +{
> +    /* exception for lockless PVFS file system */
> +    int err;
> +    struct statfs fsbuf;
> +    char *dir;
> +
> +    do {
> +   err = statfs(filename, &fsbuf);
> +    } while (err && (errno == ESTALE));
> +
> +    if (err && (errno == ENOENT)) {
> +   ADIO_FileSysType_parentdir(filename, &dir);
> +   err = statfs(dir, &fsbuf);
> +   ADIOI_Free(dir);
> +    }
> +    /* 2030528: magic number for pvfs, but we might not have pvfs 
> header files
> +     * in this environment */
> +    if (fsbuf.f_type == 20030528) 
> +       *fstype = ADIO_UFS;
> +}
> +
>  /*
>   ADIO_FileSysType_fncall - determines the file system type for a given 
file 
>   using a system-dependent function call
> @@ -212,6 +234,7 @@ static void ADIO_FileSysType_fncall(char 
> *filename, int *fstype, int *error_code
>    -------------------------------------------------------------*/
>  #ifdef ROMIO_BGL
>      *fstype = ADIO_BGL;
> +    check_for_pvfs_exception(filename, fstype);
>      *error_code = MPI_SUCCESS;
>      return;
>  #endif  /* ROMIO_BGL */
> diff --git 
> a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c 
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> new file mode 100644
> index 0000000..79b11a7
> --- /dev/null
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> @@ -0,0 +1,331 @@
> +/* -*- Mode: C; c-basic-offset:4 ; -*- */
> +/* 
> + *
> + *   Copyright (C) 1997 University of Chicago. 
> + *   See COPYRIGHT notice in top-level directory.
> + */
> +
> +#include "adio.h"
> +#include "adio_extern.h"
> +
> +#ifdef HAVE_UNISTD_H
> +#include <unistd.h>
> +#endif
> +
> +void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
> +              MPI_Datatype datatype, int file_ptr_type,
> +              ADIO_Offset offset, ADIO_Status *status, int
> +              *error_code)
> +{
> +/* borrowed from old-school PVFS (v1) code. A driver for file systems 
that
> + * cannot or do not support client-side buffering
> + * Does not do data sieving optimization
> + * Does contain write-combining optimization for noncontig in 
> memory, contig in
> + * file 
> + */
> +
> +/* offset is in units of etype relative to the filetype. */
> +
> +    ADIOI_Flatlist_node *flat_buf, *flat_file;
> +    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
> +    int bufsize, num, size, sum, n_etypes_in_filetype, 
size_in_filetype;
> +    int n_filetypes, etype_in_filetype;
> +    ADIO_Offset abs_off_in_filetype=0;
> +    int filetype_size, etype_size, buftype_size;
> +    MPI_Aint filetype_extent, buftype_extent, indx;
> +    int buf_count, buftype_is_contig, filetype_is_contig;
> +    ADIO_Offset off, disp;
> +    int flag, new_bwr_size, new_fwr_size, err_flag=0;
> +    static char myname[] = "ADIOI_PVFS_WRITESTRIDED";
> +
> +    /* --BEGIN ERROR HANDLING-- */
> +    if (fd->atomicity) {
> +   *error_code = MPIO_Err_create_code(MPI_SUCCESS, 
MPIR_ERR_RECOVERABLE,
> +                  myname, __LINE__,
> +                  MPI_ERR_INTERN,
> +                  "Atomic mode set in I/O function", 0);
> +   return;
> +    }
> +    /* --END ERROR HANDLING-- */
> +
> +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
> +    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
> +
> +    MPI_Type_size(fd->filetype, &filetype_size);
> +    if ( ! filetype_size ) {
> +   *error_code = MPI_SUCCESS; 
> +   return;
> +    }
> +
> +    MPI_Type_extent(fd->filetype, &filetype_extent);
> +    MPI_Type_size(datatype, &buftype_size);
> +    MPI_Type_extent(datatype, &buftype_extent);
> +    etype_size = fd->etype_size;
> + 
> +    bufsize = buftype_size * count;
> +
> +    if (!buftype_is_contig && filetype_is_contig) {
> +   char *combine_buf, *combine_buf_ptr;
> +   ADIO_Offset combine_buf_remain;
> +/* noncontiguous in memory, contiguous in file. use writev */
> +
> +   ADIOI_Flatten_datatype(datatype);
> +   flat_buf = ADIOI_Flatlist;
> +   while (flat_buf->type != datatype) flat_buf = flat_buf->next;
> +
> +   /* allocate our "combine buffer" to pack data into before writing */
> +   combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
> +   combine_buf_ptr = combine_buf;
> +   combine_buf_remain = fd->hints->ind_wr_buffer_size;
> +
> +   /* seek to the right spot in the file */
> +   if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
> +       off = fd->disp + etype_size * offset;
> +       lseek64(fd->fd_sys, off, SEEK_SET);
> +   }
> +   else off = lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
> +
> +   /* loop through all the flattened pieces.  combine into buffer until
> +    * no more will fit, then write.
> +    *
> +    * special case of a given piece being bigger than the combine 
buffer
> +    * is also handled.
> +    */
> +   for (j=0; j<count; j++) {
> +       for (i=0; i<flat_buf->count; i++) {
> +      if (flat_buf->blocklens[i] > combine_buf_remain && 
> combine_buf != combine_buf_ptr) {
> +          /* there is data in the buffer; write out the buffer so far 
*/
> +          err = write(fd->fd_sys,
> +                 combine_buf,
> +                 fd->hints->ind_wr_buffer_size - combine_buf_remain);
> +          if (err == -1) err_flag = 1;
> +
> +          /* reset our buffer info */
> +          combine_buf_ptr = combine_buf;
> +          combine_buf_remain = fd->hints->ind_wr_buffer_size;
> +      }
> +
> +      /* TODO: heuristic for when to not bother to use combine buffer? 
*/
> +      if (flat_buf->blocklens[i] >= combine_buf_remain) {
> +          /* special case: blocklen is as big as or bigger than the
> combine buf;
> +           * write directly
> +           */
> +          err = write(fd->fd_sys,
> +                 ((char *) buf) + j*buftype_extent + 
flat_buf->indices[i],
> +                 flat_buf->blocklens[i]);
> +          if (err == -1) err_flag = 1;
> +          off += flat_buf->blocklens[i]; /* keep up with the final 
> file offset too */
> +      }
> +      else {
> +          /* copy more data into combine buffer */
> +          memcpy(combine_buf_ptr,
> +            ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
> +            flat_buf->blocklens[i]);
> +          combine_buf_ptr += flat_buf->blocklens[i];
> +          combine_buf_remain -= flat_buf->blocklens[i];
> +          off += flat_buf->blocklens[i]; /* keep up with the final 
> file offset too */
> +      }
> +       }
> +   }
> +
> +   if (combine_buf_ptr != combine_buf) {
> +       /* data left in buffer to write */
> +       err = write(fd->fd_sys,
> +              combine_buf,
> +              fd->hints->ind_wr_buffer_size - combine_buf_remain);
> +       if (err == -1) err_flag = 1;
> +   }
> +
> +   if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
> +
> +   ADIOI_Free(combine_buf);
> +
> +   if (err_flag) {
> +       *error_code = MPIO_Err_create_code(MPI_SUCCESS,
> +                      MPIR_ERR_RECOVERABLE, myname,
> +                      __LINE__, MPI_ERR_IO, "**io",
> +                      "**io %s", strerror(errno));
> +   }
> +   else *error_code = MPI_SUCCESS;
> +    } /* if (!buftype_is_contig && filetype_is_contig)  ... */
> +
> +    else {  /* noncontiguous in file */
> +
> +/* split up into several contiguous writes */
> +
> +/* find starting location in the file */
> +
> +/* filetype already flattened in ADIO_Open */
> +   flat_file = ADIOI_Flatlist;
> +   while (flat_file->type != fd->filetype) flat_file = flat_file->next;
> +        disp = fd->disp;
> +
> +   if (file_ptr_type == ADIO_INDIVIDUAL) {
> +       offset = fd->fp_ind; /* in bytes */
> +            n_filetypes = -1;
> +            flag = 0;
> +            while (!flag) {
> +                n_filetypes++;
> +                for (i=0; i<flat_file->count; i++) {
> +                    if (disp + flat_file->indices[i] + 
> +                        (ADIO_Offset) n_filetypes*filetype_extent +
> flat_file->blocklens[i] 
> +                            >= offset) {
> +                        st_index = i;
> +                        fwr_size = disp + flat_file->indices[i] + 
> +                                (ADIO_Offset) 
n_filetypes*filetype_extent
> +                                 + flat_file->blocklens[i] - offset;
> +                        flag = 1;
> +                        break;
> +                    }
> +                }
> +            }
> +   }
> +   else {
> +       n_etypes_in_filetype = filetype_size/etype_size;
> +       n_filetypes = (int) (offset / n_etypes_in_filetype);
> +       etype_in_filetype = (int) (offset % n_etypes_in_filetype);
> +       size_in_filetype = etype_in_filetype * etype_size;
> + 
> +       sum = 0;
> +       for (i=0; i<flat_file->count; i++) {
> +      sum += flat_file->blocklens[i];
> +      if (sum > size_in_filetype) {
> +          st_index = i;
> +          fwr_size = sum - size_in_filetype;
> +          abs_off_in_filetype = flat_file->indices[i] +
> +         size_in_filetype - (sum - flat_file->blocklens[i]);
> +          break;
> +      }
> +       }
> +
> +       /* abs. offset in bytes in the file */
> +            offset = disp + (ADIO_Offset) 
> n_filetypes*filetype_extent + abs_off_in_filetype;
> +   }
> +
> +   if (buftype_is_contig && !filetype_is_contig) {
> +
> +/* contiguous in memory, noncontiguous in file. should be the most
> +   common case. */
> +
> +       i = 0;
> +       j = st_index;
> +       off = offset;
> +       fwr_size = ADIOI_MIN(fwr_size, bufsize);
> +       while (i < bufsize) {
> +                if (fwr_size) { 
> +                    /* TYPE_UB and TYPE_LB can result in 
> +                       fwr_size = 0. save system call in such cases */ 
> +#ifdef PROFILE
> +          MPE_Log_event(5, 0, "start write");
> +#endif
> +          err = pwrite(fd->fd_sys, ((char *) buf) + i, fwr_size, off);
> +#ifdef PROFILE
> +          MPE_Log_event(6, 0, "end write");
> +#endif
> +          if (err == -1) err_flag = 1;
> +      }
> +      i += fwr_size;
> +
> +                if (off + fwr_size < disp + flat_file->indices[j] +
> +                   flat_file->blocklens[j] + (ADIO_Offset) 
> n_filetypes*filetype_extent)
> +                       off += fwr_size;
> +                /* did not reach end of contiguous block in filetype.
> +                   no more I/O needed. off is incremented by fwr_size. 
*/
> +                else {
> +          if (j < (flat_file->count - 1)) j++;
> +          else {
> +         j = 0;
> +         n_filetypes++;
> +          }
> +          off = disp + flat_file->indices[j] + 
> +                                        (ADIO_Offset) 
> n_filetypes*filetype_extent;
> +          fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
> +      }
> +       }
> +   }
> +   else {
> +/* noncontiguous in memory as well as in file */
> +
> +       ADIOI_Flatten_datatype(datatype);
> +       flat_buf = ADIOI_Flatlist;
> +       while (flat_buf->type != datatype) flat_buf = flat_buf->next;
> +
> +       k = num = buf_count = 0;
> +       indx = flat_buf->indices[0];
> +       j = st_index;
> +       off = offset;
> +       bwr_size = flat_buf->blocklens[0];
> +
> +       while (num < bufsize) {
> +      size = ADIOI_MIN(fwr_size, bwr_size);
> +      if (size) {
> +#ifdef PROFILE
> +          MPE_Log_event(5, 0, "start write");
> +#endif
> +          err = pwrite(fd->fd_sys, ((char *) buf) + indx, size, off);
> +#ifdef PROFILE
> +          MPE_Log_event(6, 0, "end write");
> +#endif
> +          if (err == -1) err_flag = 1;
> +      }
> +
> +      new_fwr_size = fwr_size;
> +      new_bwr_size = bwr_size;
> +
> +      if (size == fwr_size) {
> +/* reached end of contiguous block in file */
> +                    if (j < (flat_file->count - 1)) j++;
> +                    else {
> +                        j = 0;
> +                        n_filetypes++;
> +                    }
> +
> +                    off = disp + flat_file->indices[j] + 
> +                                   (ADIO_Offset) 
n_filetypes*filetype_extent;
> +
> +          new_fwr_size = flat_file->blocklens[j];
> +          if (size != bwr_size) {
> +         indx += size;
> +         new_bwr_size -= size;
> +          }
> +      }
> +
> +      if (size == bwr_size) {
> +/* reached end of contiguous block in memory */
> +
> +          k = (k + 1)%flat_buf->count;
> +          buf_count++;
> +          indx = buftype_extent*(buf_count/flat_buf->count) +
> +         flat_buf->indices[k]; 
> +          new_bwr_size = flat_buf->blocklens[k];
> +          if (size != fwr_size) {
> +         off += size;
> +         new_fwr_size -= size;
> +          }
> +      }
> +      num += size;
> +      fwr_size = new_fwr_size;
> +                bwr_size = new_bwr_size;
> +       }
> +   }
> +
> +        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
> +   if (err_flag) {
> +       *error_code = MPIO_Err_create_code(MPI_SUCCESS,
> +                      MPIR_ERR_RECOVERABLE, myname,
> +                      __LINE__, MPI_ERR_IO, "**io",
> +                      "**io %s", strerror(errno));
> +   }
> +   else *error_code = MPI_SUCCESS;
> +    }
> +
> +    fd->fp_sys_posn = -1;   /* set it to null. */
> +
> +#ifdef HAVE_STATUS_SET_BYTES
> +    MPIR_Status_set_bytes(status, datatype, bufsize);
> +/* This is a temporary way of filling in status. The right way is to 
> +   keep track of how much data was actually written by 
> ADIOI_BUFFERED_WRITE. */
> +#endif
> +
> +    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
> +}
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h 
> b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> index c09a586..f6e58cf 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> @@ -359,6 +359,10 @@ void ADIOI_GEN_WriteStrided_naive(ADIO_File fd,
> void *buf, int count,
>                         MPI_Datatype datatype, int file_ptr_type,
>                         ADIO_Offset offset, ADIO_Status *status, int
>                         *error_code);
> +void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
> +                       MPI_Datatype datatype, int file_ptr_type,
> +                       ADIO_Offset offset, ADIO_Status *status, int
> +                       *error_code);
>  void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
>                         MPI_Datatype datatype, int file_ptr_type,
>                         ADIO_Offset offset, ADIO_Status *status, int
> -- 
> 1.5.3.8
> 
> _______________________________________________
> dcmf mailing list
> dcmf at lists.anl-external.org
> http://lists.anl-external.org/cgi-bin/mailman/listinfo/dcmf
> http://dcmf.anl-external.org/wiki
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alcf.anl.gov/pipermail/dcmf/attachments/20080215/1b03651d/attachment.htm>


More information about the dcmf mailing list