[dcmf] [PATCH 2/5] introducing a lockless "ad_ufs". ROMIO will use this method if it finds
Rob Latham
robl at mcs.anl.gov
Tue Feb 12 14:22:03 CST 2008
Signed-off-by: Rob Latham <robl at mcs.anl.gov>
---
lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c | 2 +-
.../mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c | 3 +
.../mpich2/src/mpi/romio/adio/common/Makefile.in | 2 +-
.../mpich2/src/mpi/romio/adio/common/ad_fstype.c | 23 ++
.../src/mpi/romio/adio/common/ad_write_nolock.c | 331 ++++++++++++++++++++
lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h | 4 +
6 files changed, 363 insertions(+), 2 deletions(-)
create mode 100644 lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
index 3e016ab..f311afc 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs.c
@@ -20,7 +20,7 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_GEN_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
- ADIOI_GEN_WriteStrided, /* WriteStrided */
+ ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
index 4e91594..1a0dfb9 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/ad_ufs/ad_ufs_open.c
@@ -12,6 +12,9 @@ void ADIOI_UFS_Open(ADIO_File fd, int *error_code)
int perm, old_mask, amode;
static char myname[] = "ADIOI_UFS_OPEN";
+ /* set internal variables for tuning environment variables */
+ ad_bgl_get_env_vars();
+
if (fd->perm == ADIO_PERM_NULL) {
old_mask = umask(022);
umask(old_mask);
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
index 627dc96..9e1d597 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/Makefile.in
@@ -28,7 +28,7 @@ AD_OBJECTS = ad_close.o ad_init.o ad_end.o ad_open.o flatten.o \
ad_write_str_naive.o ad_resize.o ad_read.o ad_write.o ad_iread.o \
ad_iwrite.o ad_done.o ad_wait.o adi_close.o ad_prealloc.o ad_fcntl.o \
ad_iread_fake.o ad_iwrite_fake.o ad_done_fake.o ad_wait_fake.o \
- ad_subarray.o ad_darray.o strfns.o
+ ad_subarray.o ad_darray.o strfns.o ad_write_nolock.o
all: $(LIBNAME)
@if [ "@ENABLE_SHLIB@" != "none" ] ; then \
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
index 452596f..e90332d 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_fstype.c
@@ -187,6 +187,28 @@ static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
}
#endif /* ROMIO_NTFS */
+static void check_for_pvfs_exception(char *filename, int *fstype)
+{
+    /* lockless PVFS exception: file (or parent dir) on PVFS => ADIO_UFS */
+    int err;
+    struct statfs fsbuf;
+    char *dir;
+
+    do {
+        err = statfs(filename, &fsbuf);
+    } while (err && (errno == ESTALE));
+
+    if (err && (errno == ENOENT)) {
+        ADIO_FileSysType_parentdir(filename, &dir);
+        err = statfs(dir, &fsbuf);
+        ADIOI_Free(dir);
+    }
+    /* 0x20030528 (hex): superblock magic for pvfs, hard-coded since the pvfs
+     * headers may be missing here; fsbuf is only valid if statfs succeeded */
+    if (!err && fsbuf.f_type == 0x20030528)
+        *fstype = ADIO_UFS;
+}
+
/*
ADIO_FileSysType_fncall - determines the file system type for a given file
using a system-dependent function call
@@ -212,6 +234,7 @@ static void ADIO_FileSysType_fncall(char *filename, int *fstype, int *error_code
-------------------------------------------------------------*/
#ifdef ROMIO_BGL
*fstype = ADIO_BGL;
+ check_for_pvfs_exception(filename, fstype);
*error_code = MPI_SUCCESS;
return;
#endif /* ROMIO_BGL */
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
new file mode 100644
index 0000000..79b11a7
--- /dev/null
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/common/ad_write_nolock.c
@@ -0,0 +1,331 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ * Copyright (C) 1997 University of Chicago.
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#include "adio.h"
+#include "adio_extern.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
+                               MPI_Datatype datatype, int file_ptr_type,
+                               ADIO_Offset offset, ADIO_Status *status, int
+                               *error_code)
+{
+/* Strided write that takes no file locks; borrowed from old-school PVFS (v1)
+ * code.  A driver for file systems that cannot or do not support client-side
+ * buffering.  Does not do data sieving.  Noncontig-in-memory, contig-in-file
+ * data is packed into a combine buffer (ind_wr_buffer_size bytes) first.
+ * A write()/pwrite() returning -1 makes *error_code an MPI_ERR_IO code; short
+ * writes are not retried.  For ADIO_INDIVIDUAL, fd->fp_ind is updated.
+ *
+ * offset is in units of etype relative to the filetype. */
+
+    ADIOI_Flatlist_node *flat_buf, *flat_file;
+    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
+    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
+    int n_filetypes, etype_in_filetype;
+    ADIO_Offset abs_off_in_filetype=0;
+    int filetype_size, etype_size, buftype_size;
+    MPI_Aint filetype_extent, buftype_extent, indx;
+    int buf_count, buftype_is_contig, filetype_is_contig;
+    ADIO_Offset off, disp;
+    int flag, new_bwr_size, new_fwr_size, err_flag=0;
+    static char myname[] = "ADIOI_NOLOCK_WRITESTRIDED";
+
+    /* --BEGIN ERROR HANDLING-- */
+    if (fd->atomicity) {
+        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                           myname, __LINE__,
+                                           MPI_ERR_INTERN,
+                                           "Atomic mode set in I/O function", 0);
+        return;
+    }
+    /* --END ERROR HANDLING-- */
+
+    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
+
+    MPI_Type_size(fd->filetype, &filetype_size);
+    if ( ! filetype_size ) {
+        *error_code = MPI_SUCCESS;
+        return;
+    }
+
+    MPI_Type_extent(fd->filetype, &filetype_extent);
+    MPI_Type_size(datatype, &buftype_size);
+    MPI_Type_extent(datatype, &buftype_extent);
+    etype_size = fd->etype_size;
+
+    bufsize = buftype_size * count;
+
+    if (!buftype_is_contig && filetype_is_contig) {
+        char *combine_buf, *combine_buf_ptr;
+        ADIO_Offset combine_buf_remain;
+/* noncontiguous in memory, contiguous in file. pack pieces and write() */
+
+        ADIOI_Flatten_datatype(datatype);
+        flat_buf = ADIOI_Flatlist;
+        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+
+        /* allocate our "combine buffer" to pack data into before writing */
+        combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
+        combine_buf_ptr = combine_buf;
+        combine_buf_remain = fd->hints->ind_wr_buffer_size;
+
+        /* seek to the right spot in the file */
+        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
+            off = fd->disp + etype_size * offset;
+            lseek64(fd->fd_sys, off, SEEK_SET);
+        }
+        else off = lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
+
+        /* loop through all the flattened pieces. combine into buffer until
+         * no more will fit, then write.  the flush test uses >= to match the
+         * direct-write test below: a piece that bypasses the buffer must
+         * never overtake data that is still sitting in it.
+         * special case of a given piece being bigger than the combine buffer
+         * is also handled.  */
+        for (j=0; j<count; j++) {
+            for (i=0; i<flat_buf->count; i++) {
+                if (flat_buf->blocklens[i] >= combine_buf_remain && combine_buf != combine_buf_ptr) {
+                    /* there is data in the buffer; write out the buffer so far */
+                    err = write(fd->fd_sys,
+                                combine_buf,
+                                fd->hints->ind_wr_buffer_size - combine_buf_remain);
+                    if (err == -1) err_flag = 1;
+
+                    /* reset our buffer info */
+                    combine_buf_ptr = combine_buf;
+                    combine_buf_remain = fd->hints->ind_wr_buffer_size;
+                }
+
+                /* TODO: heuristic for when to not bother to use combine buffer? */
+                if (flat_buf->blocklens[i] >= combine_buf_remain) {
+                    /* special case: blocklen is as big as or bigger than the combine buf;
+                     * write directly
+                     */
+                    err = write(fd->fd_sys,
+                                ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
+                                flat_buf->blocklens[i]);
+                    if (err == -1) err_flag = 1;
+                    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
+                }
+                else {
+                    /* copy more data into combine buffer */
+                    memcpy(combine_buf_ptr,
+                           ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
+                           flat_buf->blocklens[i]);
+                    combine_buf_ptr += flat_buf->blocklens[i];
+                    combine_buf_remain -= flat_buf->blocklens[i];
+                    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
+                }
+            }
+        }
+
+        if (combine_buf_ptr != combine_buf) {
+            /* data left in buffer to write */
+            err = write(fd->fd_sys,
+                        combine_buf,
+                        fd->hints->ind_wr_buffer_size - combine_buf_remain);
+            if (err == -1) err_flag = 1;
+        }
+
+        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
+
+        ADIOI_Free(combine_buf);
+
+        if (err_flag) {
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE, myname,
+                                               __LINE__, MPI_ERR_IO, "**io",
+                                               "**io %s", strerror(errno));
+        }
+        else *error_code = MPI_SUCCESS;
+    } /* if (!buftype_is_contig && filetype_is_contig) ... */
+
+    else { /* noncontiguous in file */
+
+/* split up into several contiguous writes */
+
+/* find starting location in the file */
+
+/* filetype already flattened in ADIO_Open */
+        flat_file = ADIOI_Flatlist;
+        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
+        disp = fd->disp;
+
+        if (file_ptr_type == ADIO_INDIVIDUAL) {
+            offset = fd->fp_ind; /* in bytes */
+            n_filetypes = -1;
+            flag = 0;
+            while (!flag) {
+                n_filetypes++;
+                for (i=0; i<flat_file->count; i++) {
+                    if (disp + flat_file->indices[i] +
+                        (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
+                        >= offset) {
+                        st_index = i;
+                        fwr_size = disp + flat_file->indices[i] +
+                            (ADIO_Offset) n_filetypes*filetype_extent
+                            + flat_file->blocklens[i] - offset;
+                        flag = 1;
+                        break;
+                    }
+                }
+            }
+        }
+        else {
+            n_etypes_in_filetype = filetype_size/etype_size;
+            n_filetypes = (int) (offset / n_etypes_in_filetype);
+            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
+            size_in_filetype = etype_in_filetype * etype_size;
+
+            sum = 0;
+            for (i=0; i<flat_file->count; i++) {
+                sum += flat_file->blocklens[i];
+                if (sum > size_in_filetype) {
+                    st_index = i;
+                    fwr_size = sum - size_in_filetype;
+                    abs_off_in_filetype = flat_file->indices[i] +
+                        size_in_filetype - (sum - flat_file->blocklens[i]);
+                    break;
+                }
+            }
+
+            /* abs. offset in bytes in the file */
+            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
+        }
+
+        if (buftype_is_contig && !filetype_is_contig) {
+
+/* contiguous in memory, noncontiguous in file. should be the most
+   common case. */
+
+            i = 0;
+            j = st_index;
+            off = offset;
+            fwr_size = ADIOI_MIN(fwr_size, bufsize);
+            while (i < bufsize) {
+                if (fwr_size) {
+                    /* TYPE_UB and TYPE_LB can result in
+                       fwr_size = 0. save system call in such cases */
+#ifdef PROFILE
+                    MPE_Log_event(5, 0, "start write");
+#endif
+                    err = pwrite(fd->fd_sys, ((char *) buf) + i, fwr_size, off);
+#ifdef PROFILE
+                    MPE_Log_event(6, 0, "end write");
+#endif
+                    if (err == -1) err_flag = 1;
+                }
+                i += fwr_size;
+
+                if (off + fwr_size < disp + flat_file->indices[j] +
+                    flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
+                    off += fwr_size;
+                /* did not reach end of contiguous block in filetype.
+                   no more I/O needed. off is incremented by fwr_size. */
+                else {
+                    if (j < (flat_file->count - 1)) j++;
+                    else {
+                        j = 0;
+                        n_filetypes++;
+                    }
+                    off = disp + flat_file->indices[j] +
+                        (ADIO_Offset) n_filetypes*filetype_extent;
+                    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
+                }
+            }
+        }
+        else {
+/* noncontiguous in memory as well as in file */
+
+            ADIOI_Flatten_datatype(datatype);
+            flat_buf = ADIOI_Flatlist;
+            while (flat_buf->type != datatype) flat_buf = flat_buf->next;
+
+            k = num = buf_count = 0;
+            indx = flat_buf->indices[0];
+            j = st_index;
+            off = offset;
+            bwr_size = flat_buf->blocklens[0];
+
+            while (num < bufsize) {
+                size = ADIOI_MIN(fwr_size, bwr_size);
+                if (size) {
+#ifdef PROFILE
+                    MPE_Log_event(5, 0, "start write");
+#endif
+                    err = pwrite(fd->fd_sys, ((char *) buf) + indx, size, off);
+#ifdef PROFILE
+                    MPE_Log_event(6, 0, "end write");
+#endif
+                    if (err == -1) err_flag = 1;
+                }
+
+                new_fwr_size = fwr_size;
+                new_bwr_size = bwr_size;
+
+                if (size == fwr_size) {
+/* reached end of contiguous block in file */
+                    if (j < (flat_file->count - 1)) j++;
+                    else {
+                        j = 0;
+                        n_filetypes++;
+                    }
+
+                    off = disp + flat_file->indices[j] +
+                        (ADIO_Offset) n_filetypes*filetype_extent;
+
+                    new_fwr_size = flat_file->blocklens[j];
+                    if (size != bwr_size) {
+                        indx += size;
+                        new_bwr_size -= size;
+                    }
+                }
+
+                if (size == bwr_size) {
+/* reached end of contiguous block in memory */
+
+                    k = (k + 1)%flat_buf->count;
+                    buf_count++;
+                    indx = buftype_extent*(buf_count/flat_buf->count) +
+                        flat_buf->indices[k];
+                    new_bwr_size = flat_buf->blocklens[k];
+                    if (size != fwr_size) {
+                        off += size;
+                        new_fwr_size -= size;
+                    }
+                }
+                num += size;
+                fwr_size = new_fwr_size;
+                bwr_size = new_bwr_size;
+            }
+        }
+
+        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
+        if (err_flag) {
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE, myname,
+                                               __LINE__, MPI_ERR_IO, "**io",
+                                               "**io %s", strerror(errno));
+        }
+        else *error_code = MPI_SUCCESS;
+    }
+
+    fd->fp_sys_posn = -1; /* set it to null. */
+
+#ifdef HAVE_STATUS_SET_BYTES
+    MPIR_Status_set_bytes(status, datatype, bufsize);
+/* This is a temporary way of filling in status. The right way is to
+   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
+#endif
+
+    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
+}
diff --git a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
index c09a586..f6e58cf 100644
--- a/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
+++ b/lib/mpi/mpich2/src/mpi/romio/adio/include/adioi.h
@@ -359,6 +359,10 @@ void ADIOI_GEN_WriteStrided_naive(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
+void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype, int file_ptr_type,
+ ADIO_Offset offset, ADIO_Status *status, int
+ *error_code);
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
--
1.5.3.8
More information about the dcmf
mailing list