[dcmf] [PATCH 2/5] introducing a lockless "ad_ufs". ROMIO will use this method if it finds
Bob Cernohous
bobc at us.ibm.com
Fri Feb 15 09:44:57 CST 2008
What's the functional difference between your new
ADIOI_NOLOCK_WriteStrided and ADIOI_GEN_WriteStrided_naive? In an earlier
patch we enabled ADIOI_GEN_WriteStrided_naive when romio_ds_write was
disabled. Will using ADIOI_GEN_WriteStrided_naive in ad_ufs work well
for you on PVFS?
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
index ce0f6a5..bc65198 100755
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
@@ -21,7 +21,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_BGL_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
- ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
+ ADIOI_GEN_WriteStrided_naive, /*ADIOI_NOLOCK_WriteStrided, *
WriteStrided */
ADIOI_BGL_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
ADIOI_GEN_IreadContig, /* IreadContig */
The issue I'm having is that the romio noncontig test cases fail on ufs:files on
GPFS or NFS file systems when using ADIOI_NOLOCK_WriteStrided, but work when using
ADIOI_GEN_WriteStrided_naive. I'm wondering whether
ADIOI_NOLOCK_WriteStrided works when PVFS is the underlying file system but
not on the more generic file systems that ad_ufs serves. I'm not comfortable enabling
ad_ufs on bgl if it could be abused and cause failures on non-PVFS
file systems. I suppose we could explicitly put in some checks to disable
it if the underlying file system isn't PVFS... Thoughts?
Bob Cernohous: (T/L 553) 507-253-6093
BobC at us.ibm.com
IBM Rochester, Building 030-2(C335), Department 61L
3605 Hwy 52 North, Rochester, MN 55901-7829
> Chaos reigns within.
> Reflect, repent, and reboot.
> Order shall return.
dcmf-bounces at lists.anl-external.org wrote on 02/12/2008 02:22:03 PM:
>
> Signed-off-by: Rob Latham <robl at mcs.anl.gov>
> ---
> lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c | 2 +-
> .../mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c | 3 +
> .../mpich2/src/mpi/romio/adio/common/Makefile.in | 2 +-
> .../mpich2/src/mpi/romio/adio/common/ad_fstype.c | 23 ++
> .../src/mpi/romio/adio/common/ad_write_nolock.c | 331
> ++++++++++++++++++++
> lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h | 4 +
> 6 files changed, 363 insertions(+), 2 deletions(-)
> create mode 100644
lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
>
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> index 3e016ab..f311afc 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
> @@ -20,7 +20,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
> ADIOI_GEN_Fcntl, /* Fcntl */
> ADIOI_GEN_SetInfo, /* SetInfo */
> ADIOI_GEN_ReadStrided, /* ReadStrided */
> - ADIOI_GEN_WriteStrided, /* WriteStrided */
> + ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
> ADIOI_GEN_Close, /* Close */
> ADIOI_GEN_IreadContig, /* IreadContig */
> ADIOI_GEN_IwriteContig, /* IwriteContig */
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> index 4e91594..1a0dfb9 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
> @@ -12,6 +12,9 @@ void ADIOI_UFS_Open(ADIO_File fd, int *error_code)
> int perm, old_mask, amode;
> static char myname[] = "ADIOI_UFS_OPEN";
>
> + /* set internal variables for tuning environment variables */
> + ad_bgl_get_env_vars();
> +
> if (fd->perm == ADIO_PERM_NULL) {
> old_mask = umask(022);
> umask(old_mask);
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> index 627dc96..9e1d597 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
> @@ -28,7 +28,7 @@ AD_OBJECTS = ad_close.o ad_init.o ad_end.o
> ad_open.o flatten.o \
> ad_write_str_naive.o ad_resize.o ad_read.o ad_write.o ad_iread.o
\
> ad_iwrite.o ad_done.o ad_wait.o adi_close.o ad_prealloc.o
ad_fcntl.o \
> ad_iread_fake.o ad_iwrite_fake.o ad_done_fake.o ad_wait_fake.o \
> - ad_subarray.o ad_darray.o strfns.o
> + ad_subarray.o ad_darray.o strfns.o ad_write_nolock.o
>
> all: $(LIBNAME)
> @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> index 452596f..e90332d 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
> @@ -187,6 +187,28 @@ static void ADIO_FileSysType_parentdir(char
> *filename, char **dirnamep)
> }
> #endif /* ROMIO_NTFS */
>
> +static void check_for_pvfs_exception(char *filename, int *fstype)
> +{
> + /* exception for lockless PVFS file system */
> + int err;
> + struct statfs fsbuf;
> + char *dir;
> +
> + do {
> + err = statfs(filename, &fsbuf);
> + } while (err && (errno == ESTALE));
> +
> + if (err && (errno == ENOENT)) {
> + ADIO_FileSysType_parentdir(filename, &dir);
> + err = statfs(dir, &fsbuf);
> + ADIOI_Free(dir);
> + }
> + /* 2030528: magic number for pvfs, but we might not have pvfs
> header files
> + * in this environment */
> + if (fsbuf.f_type == 20030528)
> + *fstype = ADIO_UFS;
> +}
> +
> /*
> ADIO_FileSysType_fncall - determines the file system type for a given
file
> using a system-dependent function call
> @@ -212,6 +234,7 @@ static void ADIO_FileSysType_fncall(char
> *filename, int *fstype, int *error_code
> -------------------------------------------------------------*/
> #ifdef ROMIO_BGL
> *fstype = ADIO_BGL;
> + check_for_pvfs_exception(filename, fstype);
> *error_code = MPI_SUCCESS;
> return;
> #endif /* ROMIO_BGL */
> diff --git
> a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> new file mode 100644
> index 0000000..79b11a7
> --- /dev/null
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
> @@ -0,0 +1,331 @@
> +/* -*- Mode: C; c-basic-offset:4 ; -*- */
> +/*
> + *
> + * Copyright (C) 1997 University of Chicago.
> + * See COPYRIGHT notice in top-level directory.
> + */
> +
> +#include "adio.h"
> +#include "adio_extern.h"
> +
> +#ifdef HAVE_UNISTD_H
> +#include <unistd.h>
> +#endif
> +
> +void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
> + MPI_Datatype datatype, int file_ptr_type,
> + ADIO_Offset offset, ADIO_Status *status, int
> + *error_code)
> +{
> +/* borrowed from old-school PVFS (v1) code. A driver for file systems
that
> + * cannot or do not support client-side buffering
> + * Does not do data sieving optimization
> + * Does contain write-combining optimization for noncontig in
> memory, contig in
> + * file
> + */
> +
> +/* offset is in units of etype relative to the filetype. */
> +
> + ADIOI_Flatlist_node *flat_buf, *flat_file;
> + int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
> + int bufsize, num, size, sum, n_etypes_in_filetype,
size_in_filetype;
> + int n_filetypes, etype_in_filetype;
> + ADIO_Offset abs_off_in_filetype=0;
> + int filetype_size, etype_size, buftype_size;
> + MPI_Aint filetype_extent, buftype_extent, indx;
> + int buf_count, buftype_is_contig, filetype_is_contig;
> + ADIO_Offset off, disp;
> + int flag, new_bwr_size, new_fwr_size, err_flag=0;
> + static char myname[] = "ADIOI_PVFS_WRITESTRIDED";
> +
> + /* --BEGIN ERROR HANDLING-- */
> + if (fd->atomicity) {
> + *error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
> + myname, __LINE__,
> + MPI_ERR_INTERN,
> + "Atomic mode set in I/O function", 0);
> + return;
> + }
> + /* --END ERROR HANDLING-- */
> +
> + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
> + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
> +
> + MPI_Type_size(fd->filetype, &filetype_size);
> + if ( ! filetype_size ) {
> + *error_code = MPI_SUCCESS;
> + return;
> + }
> +
> + MPI_Type_extent(fd->filetype, &filetype_extent);
> + MPI_Type_size(datatype, &buftype_size);
> + MPI_Type_extent(datatype, &buftype_extent);
> + etype_size = fd->etype_size;
> +
> + bufsize = buftype_size * count;
> +
> + if (!buftype_is_contig && filetype_is_contig) {
> + char *combine_buf, *combine_buf_ptr;
> + ADIO_Offset combine_buf_remain;
> +/* noncontiguous in memory, contiguous in file. use writev */
> +
> + ADIOI_Flatten_datatype(datatype);
> + flat_buf = ADIOI_Flatlist;
> + while (flat_buf->type != datatype) flat_buf = flat_buf->next;
> +
> + /* allocate our "combine buffer" to pack data into before writing */
> + combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
> + combine_buf_ptr = combine_buf;
> + combine_buf_remain = fd->hints->ind_wr_buffer_size;
> +
> + /* seek to the right spot in the file */
> + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
> + off = fd->disp + etype_size * offset;
> + lseek64(fd->fd_sys, off, SEEK_SET);
> + }
> + else off = lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
> +
> + /* loop through all the flattened pieces. combine into buffer until
> + * no more will fit, then write.
> + *
> + * special case of a given piece being bigger than the combine
buffer
> + * is also handled.
> + */
> + for (j=0; j<count; j++) {
> + for (i=0; i<flat_buf->count; i++) {
> + if (flat_buf->blocklens[i] > combine_buf_remain &&
> combine_buf != combine_buf_ptr) {
> + /* there is data in the buffer; write out the buffer so far
*/
> + err = write(fd->fd_sys,
> + combine_buf,
> + fd->hints->ind_wr_buffer_size - combine_buf_remain);
> + if (err == -1) err_flag = 1;
> +
> + /* reset our buffer info */
> + combine_buf_ptr = combine_buf;
> + combine_buf_remain = fd->hints->ind_wr_buffer_size;
> + }
> +
> + /* TODO: heuristic for when to not bother to use combine buffer?
*/
> + if (flat_buf->blocklens[i] >= combine_buf_remain) {
> + /* special case: blocklen is as big as or bigger than the
> combine buf;
> + * write directly
> + */
> + err = write(fd->fd_sys,
> + ((char *) buf) + j*buftype_extent +
flat_buf->indices[i],
> + flat_buf->blocklens[i]);
> + if (err == -1) err_flag = 1;
> + off += flat_buf->blocklens[i]; /* keep up with the final
> file offset too */
> + }
> + else {
> + /* copy more data into combine buffer */
> + memcpy(combine_buf_ptr,
> + ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
> + flat_buf->blocklens[i]);
> + combine_buf_ptr += flat_buf->blocklens[i];
> + combine_buf_remain -= flat_buf->blocklens[i];
> + off += flat_buf->blocklens[i]; /* keep up with the final
> file offset too */
> + }
> + }
> + }
> +
> + if (combine_buf_ptr != combine_buf) {
> + /* data left in buffer to write */
> + err = write(fd->fd_sys,
> + combine_buf,
> + fd->hints->ind_wr_buffer_size - combine_buf_remain);
> + if (err == -1) err_flag = 1;
> + }
> +
> + if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
> +
> + ADIOI_Free(combine_buf);
> +
> + if (err_flag) {
> + *error_code = MPIO_Err_create_code(MPI_SUCCESS,
> + MPIR_ERR_RECOVERABLE, myname,
> + __LINE__, MPI_ERR_IO, "**io",
> + "**io %s", strerror(errno));
> + }
> + else *error_code = MPI_SUCCESS;
> + } /* if (!buftype_is_contig && filetype_is_contig) ... */
> +
> + else { /* noncontiguous in file */
> +
> +/* split up into several contiguous writes */
> +
> +/* find starting location in the file */
> +
> +/* filetype already flattened in ADIO_Open */
> + flat_file = ADIOI_Flatlist;
> + while (flat_file->type != fd->filetype) flat_file = flat_file->next;
> + disp = fd->disp;
> +
> + if (file_ptr_type == ADIO_INDIVIDUAL) {
> + offset = fd->fp_ind; /* in bytes */
> + n_filetypes = -1;
> + flag = 0;
> + while (!flag) {
> + n_filetypes++;
> + for (i=0; i<flat_file->count; i++) {
> + if (disp + flat_file->indices[i] +
> + (ADIO_Offset) n_filetypes*filetype_extent +
> flat_file->blocklens[i]
> + >= offset) {
> + st_index = i;
> + fwr_size = disp + flat_file->indices[i] +
> + (ADIO_Offset)
n_filetypes*filetype_extent
> + + flat_file->blocklens[i] - offset;
> + flag = 1;
> + break;
> + }
> + }
> + }
> + }
> + else {
> + n_etypes_in_filetype = filetype_size/etype_size;
> + n_filetypes = (int) (offset / n_etypes_in_filetype);
> + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
> + size_in_filetype = etype_in_filetype * etype_size;
> +
> + sum = 0;
> + for (i=0; i<flat_file->count; i++) {
> + sum += flat_file->blocklens[i];
> + if (sum > size_in_filetype) {
> + st_index = i;
> + fwr_size = sum - size_in_filetype;
> + abs_off_in_filetype = flat_file->indices[i] +
> + size_in_filetype - (sum - flat_file->blocklens[i]);
> + break;
> + }
> + }
> +
> + /* abs. offset in bytes in the file */
> + offset = disp + (ADIO_Offset)
> n_filetypes*filetype_extent + abs_off_in_filetype;
> + }
> +
> + if (buftype_is_contig && !filetype_is_contig) {
> +
> +/* contiguous in memory, noncontiguous in file. should be the most
> + common case. */
> +
> + i = 0;
> + j = st_index;
> + off = offset;
> + fwr_size = ADIOI_MIN(fwr_size, bufsize);
> + while (i < bufsize) {
> + if (fwr_size) {
> + /* TYPE_UB and TYPE_LB can result in
> + fwr_size = 0. save system call in such cases */
> +#ifdef PROFILE
> + MPE_Log_event(5, 0, "start write");
> +#endif
> + err = pwrite(fd->fd_sys, ((char *) buf) + i, fwr_size, off);
> +#ifdef PROFILE
> + MPE_Log_event(6, 0, "end write");
> +#endif
> + if (err == -1) err_flag = 1;
> + }
> + i += fwr_size;
> +
> + if (off + fwr_size < disp + flat_file->indices[j] +
> + flat_file->blocklens[j] + (ADIO_Offset)
> n_filetypes*filetype_extent)
> + off += fwr_size;
> + /* did not reach end of contiguous block in filetype.
> + no more I/O needed. off is incremented by fwr_size.
*/
> + else {
> + if (j < (flat_file->count - 1)) j++;
> + else {
> + j = 0;
> + n_filetypes++;
> + }
> + off = disp + flat_file->indices[j] +
> + (ADIO_Offset)
> n_filetypes*filetype_extent;
> + fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
> + }
> + }
> + }
> + else {
> +/* noncontiguous in memory as well as in file */
> +
> + ADIOI_Flatten_datatype(datatype);
> + flat_buf = ADIOI_Flatlist;
> + while (flat_buf->type != datatype) flat_buf = flat_buf->next;
> +
> + k = num = buf_count = 0;
> + indx = flat_buf->indices[0];
> + j = st_index;
> + off = offset;
> + bwr_size = flat_buf->blocklens[0];
> +
> + while (num < bufsize) {
> + size = ADIOI_MIN(fwr_size, bwr_size);
> + if (size) {
> +#ifdef PROFILE
> + MPE_Log_event(5, 0, "start write");
> +#endif
> + err = pwrite(fd->fd_sys, ((char *) buf) + indx, size, off);
> +#ifdef PROFILE
> + MPE_Log_event(6, 0, "end write");
> +#endif
> + if (err == -1) err_flag = 1;
> + }
> +
> + new_fwr_size = fwr_size;
> + new_bwr_size = bwr_size;
> +
> + if (size == fwr_size) {
> +/* reached end of contiguous block in file */
> + if (j < (flat_file->count - 1)) j++;
> + else {
> + j = 0;
> + n_filetypes++;
> + }
> +
> + off = disp + flat_file->indices[j] +
> + (ADIO_Offset)
n_filetypes*filetype_extent;
> +
> + new_fwr_size = flat_file->blocklens[j];
> + if (size != bwr_size) {
> + indx += size;
> + new_bwr_size -= size;
> + }
> + }
> +
> + if (size == bwr_size) {
> +/* reached end of contiguous block in memory */
> +
> + k = (k + 1)%flat_buf->count;
> + buf_count++;
> + indx = buftype_extent*(buf_count/flat_buf->count) +
> + flat_buf->indices[k];
> + new_bwr_size = flat_buf->blocklens[k];
> + if (size != fwr_size) {
> + off += size;
> + new_fwr_size -= size;
> + }
> + }
> + num += size;
> + fwr_size = new_fwr_size;
> + bwr_size = new_bwr_size;
> + }
> + }
> +
> + if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
> + if (err_flag) {
> + *error_code = MPIO_Err_create_code(MPI_SUCCESS,
> + MPIR_ERR_RECOVERABLE, myname,
> + __LINE__, MPI_ERR_IO, "**io",
> + "**io %s", strerror(errno));
> + }
> + else *error_code = MPI_SUCCESS;
> + }
> +
> + fd->fp_sys_posn = -1; /* set it to null. */
> +
> +#ifdef HAVE_STATUS_SET_BYTES
> + MPIR_Status_set_bytes(status, datatype, bufsize);
> +/* This is a temporary way of filling in status. The right way is to
> + keep track of how much data was actually written by
> ADIOI_BUFFERED_WRITE. */
> +#endif
> +
> + if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
> +}
> diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> index c09a586..f6e58cf 100644
> --- a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> +++ b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
> @@ -359,6 +359,10 @@ void ADIOI_GEN_WriteStrided_naive(ADIO_File fd,
> void *buf, int count,
> MPI_Datatype datatype, int file_ptr_type,
> ADIO_Offset offset, ADIO_Status *status, int
> *error_code);
> +void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
> + MPI_Datatype datatype, int file_ptr_type,
> + ADIO_Offset offset, ADIO_Status *status, int
> + *error_code);
> void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
> MPI_Datatype datatype, int file_ptr_type,
> ADIO_Offset offset, ADIO_Status *status, int
> --
> 1.5.3.8
>
> _______________________________________________
> dcmf mailing list
> dcmf at lists.anl-external.org
> http://lists.anl-external.org/cgi-bin/mailman/listinfo/dcmf
> http://dcmf.anl-external.org/wiki
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alcf.anl.gov/pipermail/dcmf/attachments/20080215/1b03651d/attachment.htm>
More information about the dcmf
mailing list