[dcmf] [PATCH 2/5] introducing a lockless "ad_ufs". ROMIO will use this method if it finds

Rob Latham robl at mcs.anl.gov
Tue Feb 12 14:22:03 CST 2008


Signed-off-by: Rob Latham <robl at mcs.anl.gov>
---
 lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c  |    2 +-
 .../mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c |    3 +
 .../mpich2/src/mpi/romio/adio/common/Makefile.in   |    2 +-
 .../mpich2/src/mpi/romio/adio/common/ad_fstype.c   |   23 ++
 .../src/mpi/romio/adio/common/ad_write_nolock.c    |  331 ++++++++++++++++++++
 lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h  |    4 +
 6 files changed, 363 insertions(+), 2 deletions(-)
 create mode 100644 lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c

diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
index 3e016ab..f311afc 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
@@ -20,7 +20,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
     ADIOI_GEN_Fcntl, /* Fcntl */
     ADIOI_GEN_SetInfo, /* SetInfo */
     ADIOI_GEN_ReadStrided, /* ReadStrided */
-    ADIOI_GEN_WriteStrided, /* WriteStrided */
+    ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
     ADIOI_GEN_Close, /* Close */
     ADIOI_GEN_IreadContig, /* IreadContig */
     ADIOI_GEN_IwriteContig, /* IwriteContig */
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
index 4e91594..1a0dfb9 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
@@ -12,6 +12,9 @@ void ADIOI_UFS_Open(ADIO_File fd, int *error_code)
     int perm, old_mask, amode;
     static char myname[] = "ADIOI_UFS_OPEN";
 
+    /* set internal variables for tuning environment variables */
+    ad_bgl_get_env_vars();		
+
     if (fd->perm == ADIO_PERM_NULL) {
 	old_mask = umask(022);
 	umask(old_mask);
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
index 627dc96..9e1d597 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
@@ -28,7 +28,7 @@ AD_OBJECTS = ad_close.o ad_init.o ad_end.o ad_open.o flatten.o \
       ad_write_str_naive.o ad_resize.o ad_read.o ad_write.o ad_iread.o \
       ad_iwrite.o ad_done.o ad_wait.o adi_close.o ad_prealloc.o ad_fcntl.o \
       ad_iread_fake.o ad_iwrite_fake.o ad_done_fake.o ad_wait_fake.o \
-      ad_subarray.o ad_darray.o strfns.o
+      ad_subarray.o ad_darray.o strfns.o ad_write_nolock.o
 
 all: $(LIBNAME)
 	@if [ "@ENABLE_SHLIB@" != "none" ] ; then \
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
index 452596f..e90332d 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
@@ -187,6 +187,28 @@ static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
 }
 #endif /* ROMIO_NTFS */
 
+/* Overrides *fstype with ADIO_UFS (the lockless driver) when filename, or its
+ * parent directory if the file does not exist yet, is on a PVFS2 volume. */
+static void check_for_pvfs_exception(char *filename, int *fstype)
+{
+    int err;
+    struct statfs fsbuf;
+    char *dir;
+
+    do {
+	err = statfs(filename, &fsbuf);
+    } while (err && (errno == ESTALE));
+    if (err && (errno == ENOENT)) {
+	ADIO_FileSysType_parentdir(filename, &dir);
+	err = statfs(dir, &fsbuf);
+	ADIOI_Free(dir);
+    }
+    /* 0x20030528 is PVFS2_SUPER_MAGIC (hex, not decimal); the pvfs headers
+     * may be unavailable here.  fsbuf is only valid if statfs succeeded. */
+    if (!err && fsbuf.f_type == 0x20030528)
+	*fstype = ADIO_UFS;
+}
+
 /*
  ADIO_FileSysType_fncall - determines the file system type for a given file 
  using a system-dependent function call
@@ -212,6 +234,7 @@ static void ADIO_FileSysType_fncall(char *filename, int *fstype, int *error_code
   -------------------------------------------------------------*/
 #ifdef ROMIO_BGL
     *fstype = ADIO_BGL;
+    check_for_pvfs_exception(filename, fstype);
     *error_code = MPI_SUCCESS;
     return;
 #endif  /* ROMIO_BGL */
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
new file mode 100644
index 0000000..79b11a7
--- /dev/null
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
@@ -0,0 +1,331 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* 
+ *
+ *   Copyright (C) 1997 University of Chicago. 
+ *   See COPYRIGHT notice in top-level directory.
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
+			     MPI_Datatype datatype, int file_ptr_type,
+			     ADIO_Offset offset, ADIO_Status *status, int
+			     *error_code)
+{
+/* Lockless strided write, borrowed from old-school PVFS (v1) code: a driver
+ * for file systems that cannot or do not support client-side buffering.
+ * No data sieving; noncontiguous-memory/contiguous-file requests are packed
+ * into a combine buffer, everything else is one write per contiguous piece.
+ * Fails with MPI_ERR_INTERN in atomic mode; otherwise *error_code becomes
+ * MPI_SUCCESS or MPI_ERR_IO.  A zero-size filetype is a successful no-op. */
+
+/* offset is in units of etype relative to the filetype. */
+
+    ADIOI_Flatlist_node *flat_buf, *flat_file;
+    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
+    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
+    int n_filetypes, etype_in_filetype;
+    ADIO_Offset abs_off_in_filetype=0;
+    int filetype_size, etype_size, buftype_size;
+    MPI_Aint filetype_extent, buftype_extent, indx;
+    int buf_count, buftype_is_contig, filetype_is_contig;
+    ADIO_Offset off, disp;
+    int flag, new_bwr_size, new_fwr_size, err_flag=0;
+    static char myname[] = "ADIOI_NOLOCK_WRITESTRIDED";
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (fd->atomicity) {
+	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+					   myname, __LINE__,
+					   MPI_ERR_INTERN,
+					   "Atomic mode set in I/O function", 0);
+	return;
+    }
+    /* --END ERROR HANDLING-- */
+
+    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+    MPI_Type_size(fd->filetype, &filetype_size);
+    if ( ! filetype_size ) {
+	*error_code = MPI_SUCCESS;
+	return;
+    }
+
+    MPI_Type_extent(fd->filetype, &filetype_extent);
+    MPI_Type_size(datatype, &buftype_size);
+    MPI_Type_extent(datatype, &buftype_extent);
+    etype_size = fd->etype_size;
+
+    bufsize = buftype_size * count;
+
+    if (!buftype_is_contig && filetype_is_contig) {
+	char *combine_buf, *combine_buf_ptr;
+	ADIO_Offset combine_buf_remain;
+/* noncontiguous in memory, contiguous in file: pack pieces, then write() */
+
+	ADIOI_Flatten_datatype(datatype);
+	flat_buf = ADIOI_Flatlist;
+	while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+
+	/* allocate our "combine buffer" to pack data into before writing */
+	combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
+	combine_buf_ptr = combine_buf;
+	combine_buf_remain = fd->hints->ind_wr_buffer_size;
+
+	/* seek to the right spot in the file */
+	if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+	    off = fd->disp + etype_size * offset;
+	    lseek64(fd->fd_sys, off, SEEK_SET);
+	}
+	else off = lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
+
+	/* loop through all the flattened pieces.  combine into buffer until
+	 * no more will fit, then write.  buffered data is always flushed
+	 * before a piece is written directly, so file order is preserved
+	 * (hence ">=" below, matching the direct-write test).  a piece at
+	 * least as big as the whole combine buffer bypasses the buffer.
+	 */
+	for (j=0; j<count; j++) {
+	    for (i=0; i<flat_buf->count; i++) {
+		if (flat_buf->blocklens[i] >= combine_buf_remain && combine_buf != combine_buf_ptr) {
+		    /* there is data in the buffer; write out the buffer so far */
+		    err = write(fd->fd_sys,
+				     combine_buf,
+				     fd->hints->ind_wr_buffer_size - combine_buf_remain);
+		    if (err == -1) err_flag = 1;
+
+		    /* reset our buffer info */
+		    combine_buf_ptr = combine_buf;
+		    combine_buf_remain = fd->hints->ind_wr_buffer_size;
+		}
+
+		/* TODO: heuristic for when to not bother to use combine buffer? */
+		if (flat_buf->blocklens[i] >= combine_buf_remain) {
+		    /* special case: blocklen is as big as or bigger than the combine buf;
+		     * write directly
+		     */
+		    err = write(fd->fd_sys,
+				     ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
+				     flat_buf->blocklens[i]);
+		    if (err == -1) err_flag = 1;
+		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
+		}
+		else {
+		    /* copy more data into combine buffer */
+		    memcpy(combine_buf_ptr,
+			   ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
+			   flat_buf->blocklens[i]);
+		    combine_buf_ptr += flat_buf->blocklens[i];
+		    combine_buf_remain -= flat_buf->blocklens[i];
+		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
+		}
+	    }
+	}
+
+	if (combine_buf_ptr != combine_buf) {
+	    /* data left in buffer to write */
+	    err = write(fd->fd_sys,
+			     combine_buf,
+			     fd->hints->ind_wr_buffer_size - combine_buf_remain);
+	    if (err == -1) err_flag = 1;
+	}
+
+	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
+
+	ADIOI_Free(combine_buf);
+
+	if (err_flag) {
+	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+					       MPIR_ERR_RECOVERABLE, myname,
+					       __LINE__, MPI_ERR_IO, "**io",
+					       "**io %s", strerror(errno));
+	}
+	else *error_code = MPI_SUCCESS;
+    } /* if (!buftype_is_contig && filetype_is_contig)  ... */
+
+    else {  /* noncontiguous in file */
+
+/* split up into several contiguous writes */
+
+/* find starting location in the file */
+
+/* filetype already flattened in ADIO_Open */
+	flat_file = ADIOI_Flatlist;
+	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
+        disp = fd->disp;
+
+	if (file_ptr_type == ADIO_INDIVIDUAL) {
+	    offset = fd->fp_ind; /* in bytes */
+            n_filetypes = -1;
+            flag = 0;
+            while (!flag) {
+                n_filetypes++;
+                for (i=0; i<flat_file->count; i++) {
+                    if (disp + flat_file->indices[i] +
+                        (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
+                            >= offset) {
+                        st_index = i;
+                        fwr_size = disp + flat_file->indices[i] +
+                                (ADIO_Offset) n_filetypes*filetype_extent
+                                 + flat_file->blocklens[i] - offset;
+                        flag = 1;
+                        break;
+                    }
+                }
+            }
+	}
+	else {
+	    n_etypes_in_filetype = filetype_size/etype_size;
+	    n_filetypes = (int) (offset / n_etypes_in_filetype);
+	    etype_in_filetype = (int) (offset % n_etypes_in_filetype);
+	    size_in_filetype = etype_in_filetype * etype_size;
+
+	    sum = 0;
+	    for (i=0; i<flat_file->count; i++) {
+		sum += flat_file->blocklens[i];
+		if (sum > size_in_filetype) {
+		    st_index = i;
+		    fwr_size = sum - size_in_filetype;
+		    abs_off_in_filetype = flat_file->indices[i] +
+			size_in_filetype - (sum - flat_file->blocklens[i]);
+		    break;
+		}
+	    }
+
+	    /* abs. offset in bytes in the file */
+            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
+	}
+
+	if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+   common case. */
+
+	    i = 0;
+	    j = st_index;
+	    off = offset;
+	    fwr_size = ADIOI_MIN(fwr_size, bufsize);
+	    while (i < bufsize) {
+                if (fwr_size) {
+                    /* TYPE_UB and TYPE_LB can result in
+                       fwr_size = 0. save system call in such cases */
+#ifdef PROFILE
+		    MPE_Log_event(5, 0, "start write");
+#endif
+		    err = pwrite(fd->fd_sys, ((char *) buf) + i, fwr_size, off);
+#ifdef PROFILE
+		    MPE_Log_event(6, 0, "end write");
+#endif
+		    if (err == -1) err_flag = 1;
+		}
+		i += fwr_size;
+
+                if (off + fwr_size < disp + flat_file->indices[j] +
+                   flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
+                       off += fwr_size;
+                /* did not reach end of contiguous block in filetype.
+                   no more I/O needed. off is incremented by fwr_size. */
+                else {
+		    if (j < (flat_file->count - 1)) j++;
+		    else {
+			j = 0;
+			n_filetypes++;
+		    }
+		    off = disp + flat_file->indices[j] +
+                                        (ADIO_Offset) n_filetypes*filetype_extent;
+		    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
+		}
+	    }
+	}
+	else {
+/* noncontiguous in memory as well as in file */
+
+	    ADIOI_Flatten_datatype(datatype);
+	    flat_buf = ADIOI_Flatlist;
+	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+
+	    k = num = buf_count = 0;
+	    indx = flat_buf->indices[0];
+	    j = st_index;
+	    off = offset;
+	    bwr_size = flat_buf->blocklens[0];
+
+	    while (num < bufsize) {
+		size = ADIOI_MIN(fwr_size, bwr_size);
+		if (size) {
+#ifdef PROFILE
+		    MPE_Log_event(5, 0, "start write");
+#endif
+		    err = pwrite(fd->fd_sys, ((char *) buf) + indx, size, off);
+#ifdef PROFILE
+		    MPE_Log_event(6, 0, "end write");
+#endif
+		    if (err == -1) err_flag = 1;
+		}
+
+		new_fwr_size = fwr_size;
+		new_bwr_size = bwr_size;
+
+		if (size == fwr_size) {
+/* reached end of contiguous block in file */
+                    if (j < (flat_file->count - 1)) j++;
+                    else {
+                        j = 0;
+                        n_filetypes++;
+                    }
+
+                    off = disp + flat_file->indices[j] +
+                                   (ADIO_Offset) n_filetypes*filetype_extent;
+
+		    new_fwr_size = flat_file->blocklens[j];
+		    if (size != bwr_size) {
+			indx += size;
+			new_bwr_size -= size;
+		    }
+		}
+
+		if (size == bwr_size) {
+/* reached end of contiguous block in memory */
+
+		    k = (k + 1)%flat_buf->count;
+		    buf_count++;
+		    indx = buftype_extent*(buf_count/flat_buf->count) +
+			flat_buf->indices[k];
+		    new_bwr_size = flat_buf->blocklens[k];
+		    if (size != fwr_size) {
+			off += size;
+			new_fwr_size -= size;
+		    }
+		}
+		num += size;
+		fwr_size = new_fwr_size;
+                bwr_size = new_bwr_size;
+	    }
+	}
+
+        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
+	if (err_flag) {
+	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+					       MPIR_ERR_RECOVERABLE, myname,
+					       __LINE__, MPI_ERR_IO, "**io",
+					       "**io %s", strerror(errno));
+	}
+	else *error_code = MPI_SUCCESS;
+    }
+
+    fd->fp_sys_posn = -1;   /* set it to null. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+    MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
+}
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
index c09a586..f6e58cf 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
@@ -359,6 +359,10 @@ void ADIOI_GEN_WriteStrided_naive(ADIO_File fd, void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
                        *error_code);
+void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
+                       MPI_Datatype datatype, int file_ptr_type,
+                       ADIO_Offset offset, ADIO_Status *status, int
+                       *error_code);
 void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                        MPI_Datatype datatype, int file_ptr_type,
                        ADIO_Offset offset, ADIO_Status *status, int
-- 
1.5.3.8




More information about the dcmf mailing list