Introduce IOCB_CMD_P{READ,WRITE}V and fops->aio_{read,write}v. This adds IOCB_CMD_P{READ,WRITE}V to let userspace specify buffers with iovecs. The aio_{read,write}v file operations are then used by the AIO core to hand the iovecs to filesystems, a significant number of which already implement their IO methods in terms of iovecs. It lets applications work with vectored file IO in single AIO operations instead of having to issue multiple AIO ops. This is of particular use with O_DIRECT when the iovecs are pushed all the way down to devices which are capable of scatter-gather DMA. Signed-off-by: Zach Brown diff -purN --exclude=description 83_sigevent/fs/aio.c 84_aio_vectored/fs/aio.c --- 83_sigevent/fs/aio.c 2005-08-09 16:34:02.000000000 -0400 +++ 84_aio_vectored/fs/aio.c 2005-08-09 19:39:49.000000000 -0400 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -406,6 +407,7 @@ static struct kiocb fastcall *__aio_get_ req->ki_dtor = NULL; req->ki_signo = 0; req->private = NULL; + req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); /* Check if the completion queue has enough free space to @@ -449,6 +451,7 @@ static inline void really_put_req(struct { if (req->ki_dtor) req->ki_dtor(req); + kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); ctx->reqs_active--; @@ -1379,6 +1382,65 @@ asmlinkage long sys_io_destroy(aio_conte return -EINVAL; } +static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) +{ + struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; + + BUG_ON(ret <= 0); + + while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { + ssize_t this = min(iov->iov_len, (size_t)ret); + iov->iov_base += this; + iov->iov_len -= this; + iocb->ki_left -= this; + ret -= this; + if (iov->iov_len == 0) { + iocb->ki_cur_seg++; + iov++; + } + } + + /* the caller should not have done more io than what fit in + * the remaining iovecs */ + BUG_ON(ret > 0 && iocb->ki_left == 0); +} + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +{ + 
struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret = 0; + + if (iocb->ki_opcode == IOCB_CMD_PREADV) + ret = file->f_op->aio_readv(iocb, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + else + ret = file->f_op->aio_writev(iocb, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + + if (ret > 0) { + aio_advance_iovec(iocb, ret); + /* turn partial completion into retries. full completion + * gets ret = 0 below. only retry partial reads if they + * were to a regular file. */ + if (iocb->ki_opcode == IOCB_CMD_PWRITEV || + (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))) + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + /* * Default retry method for aio_read (also used for first time submit) * Responsible for updating iocb state as retries progress @@ -1551,6 +1613,27 @@ static ssize_t aio_fsync(struct kiocb *i return ret; } +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int type) +{ + ssize_t ret; + + ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 0, NULL, + &kiocb->ki_iovec); + if (ret < 0) + goto out; + + kiocb->ki_nr_segs = kiocb->ki_nbytes; + kiocb->ki_cur_seg = 0; + /* ki_nbytes/left now reflect bytes instead of segs */ + kiocb->ki_nbytes = ret; + kiocb->ki_left = ret; + + ret = 0; +out: + return ret; +} + /* * aio_setup_iocb: * Performs the initial checks and aio retry method @@ -1590,6 +1673,28 @@ static ssize_t aio_setup_iocb(struct kio else if (file->f_op->write) kiocb->ki_retry = aio_thread_pwrite; break; + case IOCB_CMD_PREADV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = aio_setup_vectored_rw(kiocb, READ); + if (ret) 
+ break; + ret = EINVAL; + if (file->f_op->aio_readv) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PWRITEV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = aio_setup_vectored_rw(kiocb, WRITE); + if (ret) + break; + ret = EINVAL; + if (file->f_op->aio_writev) + kiocb->ki_retry = aio_rw_vect_retry; + break; case IOCB_CMD_FDSYNC: ret = -EINVAL; if (file->f_op->aio_fsync) diff -purN --exclude=description 83_sigevent/fs/bad_inode.c 84_aio_vectored/fs/bad_inode.c --- 83_sigevent/fs/bad_inode.c 2005-06-20 13:33:31.000000000 -0400 +++ 84_aio_vectored/fs/bad_inode.c 2005-08-09 19:13:07.000000000 -0400 @@ -26,9 +26,11 @@ static struct file_operations bad_file_o { .llseek = EIO_ERROR, .aio_read = EIO_ERROR, + .aio_readv = EIO_ERROR, .read = EIO_ERROR, .write = EIO_ERROR, .aio_write = EIO_ERROR, + .aio_writev = EIO_ERROR, .readdir = EIO_ERROR, .poll = EIO_ERROR, .ioctl = EIO_ERROR, diff -purN --exclude=description 83_sigevent/fs/block_dev.c 84_aio_vectored/fs/block_dev.c --- 83_sigevent/fs/block_dev.c 2005-08-04 15:55:50.000000000 -0400 +++ 84_aio_vectored/fs/block_dev.c 2005-08-09 19:15:45.000000000 -0400 @@ -777,6 +777,13 @@ static ssize_t blkdev_file_aio_write(str return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); } +static ssize_t blkdev_file_aio_writev(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return generic_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); +} + static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); @@ -799,7 +806,9 @@ struct file_operations def_blk_fops = { .read = generic_file_read, .write = blkdev_file_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = blkdev_file_aio_write, + .aio_writev = blkdev_file_aio_writev, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff 
-purN --exclude=description 83_sigevent/fs/ext2/file.c 84_aio_vectored/fs/ext2/file.c --- 83_sigevent/fs/ext2/file.c 2005-08-04 15:55:50.000000000 -0400 +++ 84_aio_vectored/fs/ext2/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -44,7 +44,9 @@ struct file_operations ext2_file_operati .read = generic_file_read, .write = generic_file_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, .ioctl = ext2_ioctl, .mmap = generic_file_mmap, .open = generic_file_open, diff -purN --exclude=description 83_sigevent/fs/ext3/file.c 84_aio_vectored/fs/ext3/file.c --- 83_sigevent/fs/ext3/file.c 2005-08-04 15:55:51.000000000 -0400 +++ 84_aio_vectored/fs/ext3/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -23,6 +23,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -48,14 +49,15 @@ static int ext3_release_file (struct ino } static ssize_t -ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +ext3_file_aio_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; ssize_t ret; int err; - ret = generic_file_aio_write(iocb, buf, count, pos); + ret = generic_file_aio_writev(iocb, iov, nr_segs, pos); /* * Skip flushing if there was an error, or if nothing was written. 
@@ -105,12 +107,22 @@ force_commit: return ret; } +static ssize_t +ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + return ext3_file_aio_writev(iocb, &local_iov, 1, pos); +} + struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = ext3_file_write, + .aio_writev = ext3_file_aio_writev, .readv = generic_file_readv, .writev = generic_file_writev, .ioctl = ext3_ioctl, diff -purN --exclude=description 83_sigevent/fs/jfs/file.c 84_aio_vectored/fs/jfs/file.c --- 83_sigevent/fs/jfs/file.c 2005-08-04 15:55:52.000000000 -0400 +++ 84_aio_vectored/fs/jfs/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -106,7 +106,9 @@ struct file_operations jfs_file_operatio .write = generic_file_write, .read = generic_file_read, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, .mmap = generic_file_mmap, .readv = generic_file_readv, .writev = generic_file_writev, diff -purN --exclude=description 83_sigevent/fs/ntfs/file.c 84_aio_vectored/fs/ntfs/file.c --- 83_sigevent/fs/ntfs/file.c 2005-08-04 15:55:56.000000000 -0400 +++ 84_aio_vectored/fs/ntfs/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -111,10 +111,12 @@ struct file_operations ntfs_file_ops = { .llseek = generic_file_llseek, /* Seek inside file. */ .read = generic_file_read, /* Read from file. */ .aio_read = generic_file_aio_read, /* Async read from file. */ + .aio_readv = generic_file_aio_readv, /* Async readv from file. */ .readv = generic_file_readv, /* Read from file. */ #ifdef NTFS_RW .write = generic_file_write, /* Write to file. */ .aio_write = generic_file_aio_write, /* Async write to file. 
*/ + .aio_writev = generic_file_aio_writev,/* Async writev to file. */ .writev = generic_file_writev, /* Write to file. */ /*.release = ,*/ /* Last file is closed. See fs/ext2/file.c:: diff -purN --exclude=description 83_sigevent/fs/read_write.c 84_aio_vectored/fs/read_write.c --- 83_sigevent/fs/read_write.c 2005-08-08 17:15:58.000000000 -0400 +++ 84_aio_vectored/fs/read_write.c 2005-08-09 19:41:35.000000000 -0400 @@ -456,6 +456,77 @@ EXPORT_SYMBOL(iov_shorten); /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer) + { + unsigned long seg; + ssize_t ret; + struct iovec *iov = fast_pointer; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) { + ret = 0; + goto out; + } + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) { + ret = -EINVAL; + goto out; + } + if (nr_segs > fast_segs) { + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + ret = -ENOMEM; + goto out; + } + } + if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { + ret = -EFAULT; + goto out; + } + + /* + * According to the Single Unix Specification we should return EINVAL + * if an element length is < 0 when cast to ssize_t or if the + * total length would overflow the ssize_t return value of the + * system call. 
+ */ + ret = 0; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; + + /* see if we're about to use an invalid len or if + * it's about to overflow ssize_t */ + if (len < 0 || (ret + len < ret)) { + ret = -EINVAL; + goto out; + } + if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + ret = -EFAULT; + goto out; + } + + ret += len; + } +out: + *ret_pointer = iov; + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + static ssize_t do_readv_writev(int type, struct file *file, const struct iovec __user * uvector, unsigned long nr_segs, loff_t *pos) @@ -463,62 +534,23 @@ static ssize_t do_readv_writev(int type, typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); - size_t tot_len; + ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov=iovstack, *vector; ssize_t ret; - int seg; io_fn_t fn; iov_fn_t fnv; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - ret = 0; - if (nr_segs == 0) + if (!file->f_op) { + ret = -EINVAL; goto out; - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ - ret = -EINVAL; - if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) - goto out; - if (!file->f_op) - goto out; - if (nr_segs > UIO_FASTIOV) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; } - ret = -EFAULT; - if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) - goto out; - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. 
The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; - - if (len < 0) /* size_t not fitting an ssize_t .. */ - goto out; - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) - goto Efault; - tot_len += len; - if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */ - goto out; + tot_len = rw_copy_check_uvector(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (tot_len < 0) { + ret = tot_len; + goto out; } if (tot_len == 0) { ret = 0; @@ -575,9 +607,6 @@ out: fsnotify_modify(file->f_dentry); } return ret; -Efault: - ret = -EFAULT; - goto out; } ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, diff -purN --exclude=description 83_sigevent/fs/reiserfs/file.c 84_aio_vectored/fs/reiserfs/file.c --- 83_sigevent/fs/reiserfs/file.c 2005-08-04 15:55:57.000000000 -0400 +++ 84_aio_vectored/fs/reiserfs/file.c 2005-08-09 19:29:57.000000000 -0400 @@ -1540,12 +1540,6 @@ static ssize_t reiserfs_file_write(struc return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1555,7 +1549,9 @@ struct file_operations reiserfs_file_ope .fsync = reiserfs_sync_file, .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_readv = generic_file_aio_readv, + .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, }; struct inode_operations reiserfs_file_inode_operations = { diff -purN --exclude=description 83_sigevent/include/linux/aio.h 84_aio_vectored/include/linux/aio.h --- 83_sigevent/include/linux/aio.h 2005-08-09 
16:02:13.000000000 -0400 +++ 84_aio_vectored/include/linux/aio.h 2005-08-09 19:13:07.000000000 -0400 @@ -88,6 +88,9 @@ struct kiocb { long ki_kicked; /* just for testing */ long ki_queued; /* just for testing */ + struct iovec *ki_iovec; + unsigned long ki_nr_segs; + unsigned long ki_cur_seg; void *private; /* to notify a process on I/O event only valid if ki_signo != 0 */ diff -purN --exclude=description 83_sigevent/include/linux/aio_abi.h 84_aio_vectored/include/linux/aio_abi.h --- 83_sigevent/include/linux/aio_abi.h 2005-08-09 12:37:25.000000000 -0400 +++ 84_aio_vectored/include/linux/aio_abi.h 2005-08-09 19:13:07.000000000 -0400 @@ -41,6 +41,8 @@ enum { * IOCB_CMD_POLL = 5, */ IOCB_CMD_NOOP = 6, + IOCB_CMD_PREADV = 7, + IOCB_CMD_PWRITEV = 8, }; /* read() from /dev/aio returns these structures. */ diff -purN --exclude=description 83_sigevent/include/linux/fs.h 84_aio_vectored/include/linux/fs.h --- 83_sigevent/include/linux/fs.h 2005-08-04 15:56:06.000000000 -0400 +++ 84_aio_vectored/include/linux/fs.h 2005-08-09 19:38:19.000000000 -0400 @@ -956,8 +956,10 @@ struct file_operations { loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); + ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); + ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); @@ -1007,6 +1009,11 @@ struct inode_operations { struct seq_file; +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + 
struct iovec *fast_pointer, + struct iovec **ret_pointer); + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, @@ -1498,8 +1505,10 @@ extern ssize_t generic_file_read(struct int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t generic_file_aio_read(struct kiocb *, char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_readv(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_writev(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, diff -purN --exclude=description 83_sigevent/mm/filemap.c 84_aio_vectored/mm/filemap.c --- 83_sigevent/mm/filemap.c 2005-08-08 17:16:06.000000000 -0400 +++ 84_aio_vectored/mm/filemap.c 2005-08-09 19:34:31.000000000 -0400 @@ -1096,6 +1096,15 @@ generic_file_aio_read(struct kiocb *iocb EXPORT_SYMBOL(generic_file_aio_read); ssize_t +generic_file_aio_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + BUG_ON(iocb->ki_pos != pos); + return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos); +} +EXPORT_SYMBOL(generic_file_aio_readv); + +ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec local_iov = { .iov_base = buf, .iov_len = count }; @@ -2185,27 +2194,24 @@ generic_file_write_nolock(struct file *f } 
EXPORT_SYMBOL(generic_file_write_nolock); -ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +ssize_t generic_file_aio_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; if (!is_sync_kiocb(iocb) && kiocbIsSynced(iocb)) { /* nothing to transfer, may just need to sync data */ - ret = count; + ret = iocb->ki_left; goto osync; } ret = aio_down(iocb, &inode->i_sem); if (ret) return ret; - ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); + ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); aio_up(iocb, &inode->i_sem); osync: @@ -2220,6 +2226,15 @@ osync: } return ret; } +EXPORT_SYMBOL(generic_file_aio_writev); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + return generic_file_aio_writev(iocb, &local_iov, 1, pos); +} EXPORT_SYMBOL(generic_file_aio_write); ssize_t generic_file_write(struct file *file, const char __user *buf,