-rw-r--r--   arch/ia64/kernel/entry.S                     |   1
-rw-r--r--   arch/powerpc/kernel/systbl.S                 |   1
-rw-r--r--   arch/powerpc/platforms/cell/spu_callbacks.c  |   1
-rw-r--r--   fs/splice.c                                  | 292
-rw-r--r--   include/asm-i386/unistd.h                    |   3
-rw-r--r--   include/asm-ia64/unistd.h                    |   3
-rw-r--r--   include/asm-powerpc/unistd.h                 |   3
-rw-r--r--   include/asm-x86_64/unistd.h                  |   4
-rw-r--r--   include/linux/syscalls.h                     |   3
9 files changed, 268 insertions, 43 deletions
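
Before the patch itself, a brief illustration of what the new syscall does from userspace may be useful; this sketch is not part of the patch. It assumes an x86-64 build and invokes the syscall directly via syscall(2) with the number this patch assigns (__NR_vmsplice = 278), since no glibc wrapper exists at this point. It splices two user buffers into a pipe in one call, then drains the pipe with an ordinary read() to show the data arrived.

	/* Illustration only -- not part of this patch. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/uio.h>
	#include <sys/syscall.h>

	#ifndef __NR_vmsplice
	#define __NR_vmsplice 278	/* x86-64 number added by this patch */
	#endif

	int main(void)
	{
		int pfd[2];
		char hdr[] = "header: ", body[] = "payload\n", out[64];
		struct iovec iov[2] = {
			{ .iov_base = hdr,  .iov_len = strlen(hdr)  },
			{ .iov_base = body, .iov_len = strlen(body) },
		};
		long n;

		if (pipe(pfd))
			return 1;

		/* splice both user buffers into the pipe in one call */
		n = syscall(__NR_vmsplice, pfd[1], iov, 2, 0);
		if (n < 0)
			return 1;

		/* drain the pipe with a plain read() to show the data arrived */
		n = read(pfd[0], out, sizeof(out));
		if (n > 0)
			fwrite(out, 1, n, stdout);
		return 0;
	}

Because the user pages are mapped into the pipe rather than copied, the buffers should not be reused until the reader has consumed the data.
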
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e3079881121..bcb80ca5cf4 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1610,5 +1610,6 @@ sys_call_table:
 	data8 sys_get_robust_list
 	data8 sys_sync_file_range		// 1300
 	data8 sys_tee
+	data8 sys_vmsplice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 8d152269050..0b98eea73c5 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -324,6 +324,7 @@ COMPAT_SYS(ppoll)
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+SYSCALL(vmsplice)
 
 /*
  * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index deb3afb9448..b283380a2a1 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -318,6 +318,7 @@ void *spu_syscall_table[] = {
 	[__NR_unshare]			sys_unshare,
 	[__NR_splice]			sys_splice,
 	[__NR_tee]			sys_tee,
+	[__NR_vmsplice]			sys_vmsplice,
 };
 
 long spu_sys_callback(struct spu_syscall_block *s)
diff --git a/fs/splice.c b/fs/splice.c
index 8c6030c762e..0b2c1f060ca 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
 
 /*
  * Passed to the actors
@@ -38,6 +39,22 @@ struct splice_desc {
 	loff_t pos;			/* file position */
 };
 
+struct partial_page {
+	unsigned int offset;
+	unsigned int len;
+};
+
+/*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	struct partial_page *partial;	/* pages[] may not be contig */
+	int nr_pages;			/* number of pages in map */
+	unsigned int flags;		/* splice flags */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
+};
+
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
  * a vm helper function, it's already simplified quite a bit by the
@@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+			    struct splice_pipe_desc *spd)
 {
-	int ret, do_wakeup, i;
+	int ret, do_wakeup, page_nr;
 
 	ret = 0;
 	do_wakeup = 0;
-	i = 0;
+	page_nr = 0;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
-			unsigned long this_len;
-
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
 
-			buf->page = page;
-			buf->offset = offset;
-			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
 			if (pipe->inode)
 				do_wakeup = 1;
 
-			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
-				break;
-			if (!len)
+			if (!--spd->nr_pages)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (page_nr < spd->nr_pages)
+		page_cache_release(spd->pages[page_nr++]);
 
 	return ret;
 }
@@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	size_t total_len;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!loff || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	total_len = 0;
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -367,26 +410,29 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (total_len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
+				loff = 0;
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		partial[spd.nr_pages].offset = loff;
+		partial[spd.nr_pages].len = this_len;
 		len -= this_len;
+		total_len += this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return move_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial)
+{
+	int buffers = 0, error = 0;
+
+	/*
+	 * It's ok to take the mmap_sem for reading, even
+	 * across a "get_user()".
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		void __user *base;
+		size_t len;
+		int i;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > PIPE_BUFFERS - buffers)
+			npages = PIPE_BUFFERS - buffers;
+
+		error = get_user_pages(current, current->mm,
+				       (unsigned long) base, npages, 0, 0,
+				       &pages[buffers], NULL);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == PIPE_BUFFERS)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+			unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	if (unlikely(!pipe))
+		return -EBADF;
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	if (spd.nr_pages <= 0)
+		return spd.nr_pages;
+
+	return move_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct file *file;
+	long error;
+	int fput;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, iov, nr_segs, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index d81d6cfc1bb..eb4b152c82f 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -321,8 +321,9 @@
 #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
+#define __NR_vmsplice		316
 
-#define NR_syscalls 316
+#define NR_syscalls 317
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec6aee..7107763168b 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -290,12 +290,13 @@
 #define __NR_get_robust_list		1299
 #define __NR_sync_file_range		1300
 #define __NR_tee			1301
+#define __NR_vmsplice			1302
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			278 /* length of syscall table */
+#define NR_syscalls			279 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a6277..34325e29259 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -303,8 +303,9 @@
 #define __NR_unshare		282
 #define __NR_splice		283
 #define __NR_tee		284
+#define __NR_vmsplice		285
 
-#define __NR_syscalls		285
+#define __NR_syscalls		286
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 98c36eae567..feb77cb8c04 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -615,8 +615,10 @@ __SYSCALL(__NR_splice, sys_splice)
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice		278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 
-#define __NR_syscall_max __NR_sync_file_range
+#define __NR_syscall_max __NR_vmsplice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e68b2..3996960fc56 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags);
 
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags);
+
 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
 
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
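
As an aside, a standalone sketch (not kernel code) of the page-count and partial-length arithmetic that get_iovec_page_array() performs before calling get_user_pages(); it assumes the common 4 KiB page size.

	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assume 4 KiB pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	int main(void)
	{
		/* an iovec starting 100 bytes into a page, 8000 bytes long */
		unsigned long base = 0x1000064;		/* page-aligned address + 100 */
		unsigned long len = 8000;
		unsigned long off, npages, i;

		off = base & ~PAGE_MASK;			/* 100 */
		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* 2 */
		printf("off=%lu npages=%lu\n", off, npages);

		/* mirror the fill loop: per-page offset/length pairs */
		for (i = 0; i < npages; i++) {
			unsigned long plen = (len < PAGE_SIZE ? len : PAGE_SIZE) - off;

			printf("page %lu: offset=%lu len=%lu\n", i, off, plen);
			off = 0;
			len -= plen;
		}
		return 0;
	}

A buffer that starts 100 bytes into a page and is 8000 bytes long therefore maps two pages, with partial lengths 3996 and 4004.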