diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 20:25:04 -0700 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 20:25:04 -0700 | 
| commit | aab174f0df5d72d31caccf281af5f614fa254578 (patch) | |
| tree | 2a172c5009c4ac8755e858593154c258ce7709a0 | |
| parent | ca41cc96b2813221b05af57d0355157924de5a07 (diff) | |
| parent | 2bd2c1941f141ad780135ccc1cd08ca71a24f10a (diff) | |
| download | olio-linux-3.10-aab174f0df5d72d31caccf281af5f614fa254578.tar.xz olio-linux-3.10-aab174f0df5d72d31caccf281af5f614fa254578.zip | |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs update from Al Viro:
 - big one - consolidation of descriptor-related logics; almost all of
   that is moved to fs/file.c
   (BTW, I'm seriously tempted to rename the result to fd.c.  As it is,
   we have a situation when file_table.c is about handling of struct
   file and file.c is about handling of descriptor tables; the reasons
   are historical - file_table.c used to be about a static array of
   struct file we used to have way back).
   A lot of stray ends got cleaned up and converted to saner primitives,
   disgusting mess in android/binder.c is still disgusting, but at least
   doesn't poke so much in descriptor table guts anymore.  A bunch of
   relatively minor races got fixed in process, plus an ext4 struct file
   leak.
 - related thing - fget_light() partially unuglified; see fdget() in
   there (and yes, it generates the code as good as we used to have).
 - also related - bits of Cyrill's procfs stuff that got entangled into
   that work; _not_ all of it, just the initial move to fs/proc/fd.c and
   switch of fdinfo to seq_file.
 - Alex's fs/coredump.c splitoff - the same story, had been easier to
   take that commit than mess with conflicts.  The rest is a separate
   pile, this was just a mechanical code movement.
 - a few misc patches all over the place.  Not all for this cycle,
   there'll be more (and quite a few currently sit in akpm's tree).
Fix up trivial conflicts in the android binder driver, and some fairly
simple conflicts due to two different changes to the sock_alloc_file()
interface ("take descriptor handling from sock_alloc_file() to callers"
vs "net: Providing protocol type via system.sockprotoname xattr of
/proc/PID/fd entries" adding a dentry name to the socket)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (72 commits)
  MAX_LFS_FILESIZE should be a loff_t
  compat: fs: Generic compat_sys_sendfile implementation
  fs: push rcu_barrier() from deactivate_locked_super() to filesystems
  btrfs: reada_extent doesn't need kref for refcount
  coredump: move core dump functionality into its own file
  coredump: prevent double-free on an error path in core dumper
  usb/gadget: fix misannotations
  fcntl: fix misannotations
  ceph: don't abuse d_delete() on failure exits
  hypfs: ->d_parent is never NULL or negative
  vfs: delete surplus inode NULL check
  switch simple cases of fget_light to fdget
  new helpers: fdget()/fdput()
  switch o2hb_region_dev_write() to fget_light()
  proc_map_files_readdir(): don't bother with grabbing files
  make get_file() return its argument
  vhost_set_vring(): turn pollstart/pollstop into bool
  switch prctl_set_mm_exe_file() to fget_light()
  switch xfs_find_handle() to fget_light()
  switch xfs_swapext() to fget_light()
  ...
142 files changed, 2913 insertions, 2875 deletions
| diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 9503a4be40f..63e77e3944c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -145,27 +145,24 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,  		long __user *, basep)  {  	int error; -	struct file *file; +	struct fd arg = fdget(fd);  	struct osf_dirent_callback buf; -	error = -EBADF; -	file = fget(fd); -	if (!file) -		goto out; +	if (!arg.file) +		return -EBADF;  	buf.dirent = dirent;  	buf.basep = basep;  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, osf_filldir, &buf); +	error = vfs_readdir(arg.file, osf_filldir, &buf);  	if (error >= 0)  		error = buf.error;  	if (count != buf.count)  		error = count - buf.count; -	fput(file); - out: +	fdput(arg);  	return error;  } diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 5a5c22245de..f388b4e18a3 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2306,7 +2306,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t  	 * partially initialize the vma for the sampling buffer  	 */  	vma->vm_mm	     = mm; -	vma->vm_file	     = filp; +	vma->vm_file	     = get_file(filp);  	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;  	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */ @@ -2345,8 +2345,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t  		goto error;  	} -	get_file(filp); -  	/*  	 * now insert the vma in the vm list for the process, must be  	 * done with mmap lock held @@ -4782,7 +4780,7 @@ recheck:  asmlinkage long  sys_perfmonctl (int fd, int cmd, void __user *arg, int count)  { -	struct file *file = NULL; +	struct fd f = {NULL, 0};  	pfm_context_t *ctx = NULL;  	unsigned long flags = 0UL;  	void *args_k = NULL; @@ -4879,17 +4877,17 @@ restart_args:  	ret = -EBADF; -	file = fget(fd); -	if (unlikely(file == NULL)) { +	f = fdget(fd); +	if (unlikely(f.file 
== NULL)) {  		DPRINT(("invalid fd %d\n", fd));  		goto error_args;  	} -	if (unlikely(PFM_IS_FILE(file) == 0)) { +	if (unlikely(PFM_IS_FILE(f.file) == 0)) {  		DPRINT(("fd %d not related to perfmon\n", fd));  		goto error_args;  	} -	ctx = file->private_data; +	ctx = f.file->private_data;  	if (unlikely(ctx == NULL)) {  		DPRINT(("no context for fd %d\n", fd));  		goto error_args; @@ -4919,8 +4917,8 @@ abort_locked:  	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;  error_args: -	if (file) -		fput(file); +	if (f.file) +		fdput(f);  	kfree(args_k); diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index c71eb6c7989..6785de7bd2a 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -109,33 +109,32 @@ Efault:  int hpux_getdents(unsigned int fd, struct hpux_dirent __user *dirent, unsigned int count)  { -	struct file * file; +	struct fd arg;  	struct hpux_dirent __user * lastdirent;  	struct getdents_callback buf; -	int error = -EBADF; +	int error; -	file = fget(fd); -	if (!file) -		goto out; +	arg = fdget(fd); +	if (!arg.file) +		return -EBADF;  	buf.current_dir = dirent;  	buf.previous = NULL;  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, filldir, &buf); +	error = vfs_readdir(arg.file, filldir, &buf);  	if (error >= 0)  		error = buf.error;  	lastdirent = buf.previous;  	if (lastdirent) { -		if (put_user(file->f_pos, &lastdirent->d_off)) +		if (put_user(arg.file->f_pos, &lastdirent->d_off))  			error = -EFAULT;  		else  			error = count - buf.count;  	} -	fput(file); -out: +	fdput(arg);  	return error;  } diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 559ae1ee670..84083876985 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -189,7 +189,7 @@ SYSCALL_SPU(getcwd)  SYSCALL_SPU(capget)  SYSCALL_SPU(capset)  COMPAT_SYS(sigaltstack) -SYSX_SPU(sys_sendfile64,compat_sys_sendfile,sys_sendfile) 
+SYSX_SPU(sys_sendfile,compat_sys_sendfile_wrapper,sys_sendfile)  SYSCALL(ni_syscall)  SYSCALL(ni_syscall)  PPC_SYS(vfork) @@ -229,7 +229,7 @@ COMPAT_SYS_SPU(sched_setaffinity)  COMPAT_SYS_SPU(sched_getaffinity)  SYSCALL(ni_syscall)  SYSCALL(ni_syscall) -SYS32ONLY(sendfile64) +SYSX(sys_ni_syscall,compat_sys_sendfile64_wrapper,sys_sendfile64)  COMPAT_SYS_SPU(io_setup)  SYSCALL_SPU(io_destroy)  COMPAT_SYS_SPU(io_getevents) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index bd377a36861..c683fa350ad 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -419,6 +419,7 @@  #define __ARCH_WANT_COMPAT_SYS_TIME  #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND  #define __ARCH_WANT_SYS_NEWFSTATAT +#define __ARCH_WANT_COMPAT_SYS_SENDFILE  #endif  /* diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 81c570633ea..abd1112da54 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -143,48 +143,17 @@ long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t pt   * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)   * and the register representation of a signed int (msr in 64-bit mode) is performed.   */ -asmlinkage long compat_sys_sendfile(u32 out_fd, u32 in_fd, compat_off_t __user * offset, u32 count) +asmlinkage long compat_sys_sendfile_wrapper(u32 out_fd, u32 in_fd, +					    compat_off_t __user *offset, u32 count)  { -	mm_segment_t old_fs = get_fs(); -	int ret; -	off_t of; -	off_t __user *up; - -	if (offset && get_user(of, offset)) -		return -EFAULT; - -	/* The __user pointer cast is valid because of the set_fs() */		 -	set_fs(KERNEL_DS); -	up = offset ? 
(off_t __user *) &of : NULL; -	ret = sys_sendfile((int)out_fd, (int)in_fd, up, count); -	set_fs(old_fs); -	 -	if (offset && put_user(of, offset)) -		return -EFAULT; -		 -	return ret; +	return compat_sys_sendfile((int)out_fd, (int)in_fd, offset, count);  } -asmlinkage int compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, s32 count) +asmlinkage long compat_sys_sendfile64_wrapper(u32 out_fd, u32 in_fd, +					      compat_loff_t __user *offset, u32 count)  { -	mm_segment_t old_fs = get_fs(); -	int ret; -	loff_t lof; -	loff_t __user *up; -	 -	if (offset && get_user(lof, offset)) -		return -EFAULT; -		 -	/* The __user pointer cast is valid because of the set_fs() */		 -	set_fs(KERNEL_DS); -	up = offset ? (loff_t __user *) &lof : NULL; -	ret = sys_sendfile64(out_fd, in_fd, up, count); -	set_fs(old_fs); -	 -	if (offset && put_user(lof, offset)) -		return -EFAULT; -		 -	return ret; +	return sys_sendfile((int)out_fd, (int)in_fd, +			    (off_t __user *)offset, count);  }  long compat_sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c index 714bbfc3162..db4e638cf40 100644 --- a/arch/powerpc/platforms/cell/spu_syscalls.c +++ b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -69,8 +69,6 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,  	umode_t, mode, int, neighbor_fd)  {  	long ret; -	struct file *neighbor; -	int fput_needed;  	struct spufs_calls *calls;  	calls = spufs_calls_get(); @@ -78,11 +76,11 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,  		return -ENOSYS;  	if (flags & SPU_CREATE_AFFINITY_SPU) { +		struct fd neighbor = fdget(neighbor_fd);  		ret = -EBADF; -		neighbor = fget_light(neighbor_fd, &fput_needed); -		if (neighbor) { -			ret = calls->create_thread(name, flags, mode, neighbor); -			fput_light(neighbor, fput_needed); +		if (neighbor.file) { +			ret = 
calls->create_thread(name, flags, mode, neighbor.file); +			fdput(neighbor);  		}  	} else  		ret = calls->create_thread(name, flags, mode, NULL); @@ -94,8 +92,7 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,  asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)  {  	long ret; -	struct file *filp; -	int fput_needed; +	struct fd arg;  	struct spufs_calls *calls;  	calls = spufs_calls_get(); @@ -103,10 +100,10 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)  		return -ENOSYS;  	ret = -EBADF; -	filp = fget_light(fd, &fput_needed); -	if (filp) { -		ret = calls->spu_run(filp, unpc, ustatus); -		fput_light(filp, fput_needed); +	arg = fdget(fd); +	if (arg.file) { +		ret = calls->spu_run(arg.file, unpc, ustatus); +		fdput(arg);  	}  	spufs_calls_put(calls); diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index c2c5b078ba8..657e3f233a6 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -106,6 +106,17 @@ static int spufs_ctx_note_size(struct spu_context *ctx, int dfd)  	return total;  } +static int match_context(const void *v, struct file *file, unsigned fd) +{ +	struct spu_context *ctx; +	if (file->f_op != &spufs_context_fops) +		return 0; +	ctx = SPUFS_I(file->f_dentry->d_inode)->i_ctx; +	if (ctx->flags & SPU_CREATE_NOSCHED) +		return 0; +	return fd + 1; +} +  /*   * The additional architecture-specific notes for Cell are various   * context files in the spu context. @@ -115,29 +126,18 @@ static int spufs_ctx_note_size(struct spu_context *ctx, int dfd)   * internal functionality to dump them without needing to actually   * open the files.   */ +/* + * descriptor table is not shared, so files can't change or go away. 
+ */  static struct spu_context *coredump_next_context(int *fd)  { -	struct fdtable *fdt = files_fdtable(current->files);  	struct file *file; -	struct spu_context *ctx = NULL; - -	for (; *fd < fdt->max_fds; (*fd)++) { -		if (!fd_is_open(*fd, fdt)) -			continue; - -		file = fcheck(*fd); - -		if (!file || file->f_op != &spufs_context_fops) -			continue; - -		ctx = SPUFS_I(file->f_dentry->d_inode)->i_ctx; -		if (ctx->flags & SPU_CREATE_NOSCHED) -			continue; - -		break; -	} - -	return ctx; +	int n = iterate_fd(current->files, *fd, match_context, NULL); +	if (!n) +		return NULL; +	*fd = n - 1; +	file = fcheck(*fd); +	return SPUFS_I(file->f_dentry->d_inode)->i_ctx;  }  int spufs_coredump_extra_notes_size(void) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 124ec1a55cc..06ea69bd387 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -72,8 +72,6 @@ static void hypfs_remove(struct dentry *dentry)  	struct dentry *parent;  	parent = dentry->d_parent; -	if (!parent || !parent->d_inode) -		return;  	mutex_lock(&parent->d_inode->i_mutex);  	if (hypfs_positive(dentry)) {  		if (S_ISDIR(dentry->d_inode->i_mode)) diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index fb269346480..d9a677c5192 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -447,6 +447,7 @@  #else  #define __ARCH_WANT_COMPAT_SYS_TIME  #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_COMPAT_SYS_SENDFILE  #endif  /* diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index d97f3eb72e0..44025f4ba41 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -90,7 +90,7 @@ SIGN1(sys32_mkdir, sys_mkdir, %o1)  SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5)  SIGN1(sys32_sysfs, compat_sys_sysfs, %o0)  SIGN2(sys32_sendfile, compat_sys_sendfile, %o0, %o1) -SIGN2(sys32_sendfile64, compat_sys_sendfile64, %o0, %o1) +SIGN2(sys32_sendfile64, sys_sendfile, %o0, %o1)  
SIGN1(sys32_prctl, sys_prctl, %o0)  SIGN1(sys32_sched_rr_get_interval, compat_sys_sched_rr_get_interval, %o0)  SIGN2(sys32_waitpid, sys_waitpid, %o0, %o2) diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index f7392336961..d862499eb01 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -506,52 +506,6 @@ long compat_sys_fadvise64_64(int fd,  				advice);  } -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, -				    compat_off_t __user *offset, -				    compat_size_t count) -{ -	mm_segment_t old_fs = get_fs(); -	int ret; -	off_t of; -	 -	if (offset && get_user(of, offset)) -		return -EFAULT; -		 -	set_fs(KERNEL_DS); -	ret = sys_sendfile(out_fd, in_fd, -			   offset ? (off_t __user *) &of : NULL, -			   count); -	set_fs(old_fs); -	 -	if (offset && put_user(of, offset)) -		return -EFAULT; -		 -	return ret; -} - -asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, -				      compat_loff_t __user *offset, -				      compat_size_t count) -{ -	mm_segment_t old_fs = get_fs(); -	int ret; -	loff_t lof; -	 -	if (offset && get_user(lof, offset)) -		return -EFAULT; -		 -	set_fs(KERNEL_DS); -	ret = sys_sendfile64(out_fd, in_fd, -			     offset ? (loff_t __user *) &lof : NULL, -			     count); -	set_fs(old_fs); -	 -	if (offset && put_user(lof, offset)) -		return -EFAULT; -		 -	return ret; -} -  /* This is just a version for 32-bit applications which does   * not force O_LARGEFILE on.   
*/ diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index c17de0db673..9efeb6da48b 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -21,6 +21,9 @@  #include <linux/un.h>  #include <linux/workqueue.h>  #include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/file.h>  #include <asm/uaccess.h>  #include <asm/switch_to.h> @@ -118,90 +121,38 @@ void mconsole_log(struct mc_request *req)  	mconsole_reply(req, "", 0, 0);  } -/* This is a more convoluted version of mconsole_proc, which has some stability - * problems; however, we need it fixed, because it is expected that UML users - * mount HPPFS instead of procfs on /proc. And we want mconsole_proc to still - * show the real procfs content, not the ones from hppfs.*/ -#if 0  void mconsole_proc(struct mc_request *req)  {  	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; -	struct file *file; -	int n; -	char *ptr = req->request.data, *buf; -	mm_segment_t old_fs = get_fs(); - -	ptr += strlen("proc"); -	ptr = skip_spaces(ptr); - -	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); -	if (IS_ERR(file)) { -		mconsole_reply(req, "Failed to open file", 1, 0); -		goto out; -	} - -	buf = kmalloc(PAGE_SIZE, GFP_KERNEL); -	if (buf == NULL) { -		mconsole_reply(req, "Failed to allocate buffer", 1, 0); -		goto out_fput; -	} - -	if (file->f_op->read) { -		do { -			loff_t pos; -			set_fs(KERNEL_DS); -			n = vfs_read(file, buf, PAGE_SIZE - 1, &pos); -			file_pos_write(file, pos); -			set_fs(old_fs); -			if (n >= 0) { -				buf[n] = '\0'; -				mconsole_reply(req, buf, 0, (n > 0)); -			} -			else { -				mconsole_reply(req, "Read of file failed", -					       1, 0); -				goto out_free; -			} -		} while (n > 0); -	} -	else mconsole_reply(req, "", 0, 0); - - out_free: -	kfree(buf); - out_fput: -	fput(file); - out: ; -} -#endif - -void mconsole_proc(struct mc_request *req) -{ -	char path[64];  	char *buf;  	int len; -	int fd; +	
struct file *file;  	int first_chunk = 1;  	char *ptr = req->request.data;  	ptr += strlen("proc");  	ptr = skip_spaces(ptr); -	snprintf(path, sizeof(path), "/proc/%s", ptr); -	fd = sys_open(path, 0, 0); -	if (fd < 0) { +	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); +	if (IS_ERR(file)) {  		mconsole_reply(req, "Failed to open file", 1, 0); -		printk(KERN_ERR "open %s: %d\n",path,fd); +		printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file));  		goto out;  	}  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);  	if (buf == NULL) {  		mconsole_reply(req, "Failed to allocate buffer", 1, 0); -		goto out_close; +		goto out_fput;  	} -	for (;;) { -		len = sys_read(fd, buf, PAGE_SIZE-1); +	do { +		loff_t pos; +		mm_segment_t old_fs = get_fs(); +		set_fs(KERNEL_DS); +		len = vfs_read(file, buf, PAGE_SIZE - 1, &pos); +		set_fs(old_fs); +		file->f_pos = pos;  		if (len < 0) {  			mconsole_reply(req, "Read of file failed", 1, 0);  			goto out_free; @@ -211,22 +162,14 @@ void mconsole_proc(struct mc_request *req)  			mconsole_reply(req, "\n", 0, 1);  			first_chunk = 0;  		} -		if (len == PAGE_SIZE-1) { -			buf[len] = '\0'; -			mconsole_reply(req, buf, 0, 1); -		} else { -			buf[len] = '\0'; -			mconsole_reply(req, buf, 0, 0); -			break; -		} -	} - +		buf[len] = '\0'; +		mconsole_reply(req, buf, 0, (len > 0)); +	} while (len > 0);   out_free:  	kfree(buf); - out_close: -	sys_close(fd); - out: -	/* nothing */; + out_fput: +	fput(file); + out: ;  }  #define UML_MCONSOLE_HELPTEXT \ diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c index c30f3e1d0ef..460e22dee36 100644 --- a/drivers/base/dma-buf.c +++ b/drivers/base/dma-buf.c @@ -460,8 +460,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,  	if (vma->vm_file)  		fput(vma->vm_file); -	vma->vm_file = dmabuf->file; -	get_file(vma->vm_file); +	vma->vm_file = get_file(dmabuf->file);  	vma->vm_pgoff = pgoff; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 
7972bae2e9b..2709ff58139 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1183,7 +1183,7 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,  	struct rdma_ucm_migrate_id cmd;  	struct rdma_ucm_migrate_resp resp;  	struct ucma_context *ctx; -	struct file *filp; +	struct fd f;  	struct ucma_file *cur_file;  	int ret = 0; @@ -1191,12 +1191,12 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,  		return -EFAULT;  	/* Get current fd to protect against it being closed */ -	filp = fget(cmd.fd); -	if (!filp) +	f = fdget(cmd.fd); +	if (!f.file)  		return -ENOENT;  	/* Validate current fd and prevent destruction of id. */ -	ctx = ucma_get_ctx(filp->private_data, cmd.id); +	ctx = ucma_get_ctx(f.file->private_data, cmd.id);  	if (IS_ERR(ctx)) {  		ret = PTR_ERR(ctx);  		goto file_put; @@ -1230,7 +1230,7 @@ response:  	ucma_put_ctx(ctx);  file_put: -	fput(filp); +	fdput(f);  	return ret;  } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f9d0d7c413a..0cb0007724a 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -705,7 +705,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,  	struct ib_udata			udata;  	struct ib_uxrcd_object         *obj;  	struct ib_xrcd                 *xrcd = NULL; -	struct file                    *f = NULL; +	struct fd			f = {NULL, 0};  	struct inode                   *inode = NULL;  	int				ret = 0;  	int				new_xrcd = 0; @@ -724,18 +724,13 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,  	if (cmd.fd != -1) {  		/* search for file descriptor */ -		f = fget(cmd.fd); -		if (!f) { -			ret = -EBADF; -			goto err_tree_mutex_unlock; -		} - -		inode = f->f_dentry->d_inode; -		if (!inode) { +		f = fdget(cmd.fd); +		if (!f.file) {  			ret = -EBADF;  			goto err_tree_mutex_unlock;  		} +		inode = f.file->f_path.dentry->d_inode;  		xrcd = find_xrcd(file->device, inode);  		if (!xrcd && !(cmd.oflags 
& O_CREAT)) {  			/* no file descriptor. Need CREATE flag */ @@ -800,8 +795,8 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,  		goto err_copy;  	} -	if (f) -		fput(f); +	if (f.file) +		fdput(f);  	mutex_lock(&file->mutex);  	list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); @@ -830,8 +825,8 @@ err:  	put_uobj_write(&obj->uobject);  err_tree_mutex_unlock: -	if (f) -		fput(f); +	if (f.file) +		fdput(f);  	mutex_unlock(&file->device->xrcd_tree_mutex); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 604556d73d2..6f2ce6fa98f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -541,16 +541,15 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,  struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)  {  	struct ib_uverbs_event_file *ev_file = NULL; -	struct file *filp; +	struct fd f = fdget(fd); -	filp = fget(fd); -	if (!filp) +	if (!f.file)  		return NULL; -	if (filp->f_op != &uverbs_event_fops) +	if (f.file->f_op != &uverbs_event_fops)  		goto out; -	ev_file = filp->private_data; +	ev_file = f.file->private_data;  	if (ev_file->is_async) {  		ev_file = NULL;  		goto out; @@ -559,7 +558,7 @@ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)  	kref_get(&ev_file->ref);  out: -	fput(filp); +	fdput(f);  	return ev_file;  } diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index b1937ca1357..7b0ba92e7e4 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -362,71 +362,22 @@ struct binder_transaction {  static void  binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); -/* - * copied from get_unused_fd_flags - */  static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)  {  	struct files_struct *files = proc->files; -	int fd, error; -	struct fdtable *fdt;  	unsigned long rlim_cur;  	unsigned long irqs;  	
if (files == NULL)  		return -ESRCH; -	error = -EMFILE; -	spin_lock(&files->file_lock); - -repeat: -	fdt = files_fdtable(files); -	fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, files->next_fd); - -	/* -	 * N.B. For clone tasks sharing a files structure, this test -	 * will limit the total number of files that can be opened. -	 */ -	rlim_cur = 0; -	if (lock_task_sighand(proc->tsk, &irqs)) { -		rlim_cur = proc->tsk->signal->rlim[RLIMIT_NOFILE].rlim_cur; -		unlock_task_sighand(proc->tsk, &irqs); -	} -	if (fd >= rlim_cur) -		goto out; - -	/* Do we need to expand the fd array or fd set?  */ -	error = expand_files(files, fd); -	if (error < 0) -		goto out; - -	if (error) { -		/* -		 * If we needed to expand the fs array we -		 * might have blocked - try again. -		 */ -		error = -EMFILE; -		goto repeat; -	} - -	__set_open_fd(fd, fdt); -	if (flags & O_CLOEXEC) -		__set_close_on_exec(fd, fdt); -	else -		__clear_close_on_exec(fd, fdt); -	files->next_fd = fd + 1; +	if (!lock_task_sighand(proc->tsk, &irqs)) +		return -EMFILE; -	/* Sanity check */ -	if (fdt->fd[fd] != NULL) { -		pr_warn("get_unused_fd: slot %d not NULL!\n", fd); -		fdt->fd[fd] = NULL; -	} +	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); +	unlock_task_sighand(proc->tsk, &irqs); -	error = fd; - -out: -	spin_unlock(&files->file_lock); -	return error; +	return __alloc_fd(files, 0, rlim_cur, flags);  }  /* @@ -435,28 +386,8 @@ out:  static void task_fd_install(  	struct binder_proc *proc, unsigned int fd, struct file *file)  { -	struct files_struct *files = proc->files; -	struct fdtable *fdt; - -	if (files == NULL) -		return; - -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	BUG_ON(fdt->fd[fd] != NULL); -	rcu_assign_pointer(fdt->fd[fd], file); -	spin_unlock(&files->file_lock); -} - -/* - * copied from __put_unused_fd in open.c - */ -static void __put_unused_fd(struct files_struct *files, unsigned int fd) -{ -	struct fdtable *fdt = files_fdtable(files); -	__clear_open_fd(fd, fdt); -	if (fd < 
files->next_fd) -		files->next_fd = fd; +	if (proc->files) +		__fd_install(proc->files, fd, file);  }  /* @@ -464,27 +395,12 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd)   */  static long task_close_fd(struct binder_proc *proc, unsigned int fd)  { -	struct file *filp; -	struct files_struct *files = proc->files; -	struct fdtable *fdt;  	int retval; -	if (files == NULL) +	if (proc->files == NULL)  		return -ESRCH; -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	if (fd >= fdt->max_fds) -		goto out_unlock; -	filp = fdt->fd[fd]; -	if (!filp) -		goto out_unlock; -	rcu_assign_pointer(fdt->fd[fd], NULL); -	__clear_close_on_exec(fd, fdt); -	__put_unused_fd(files, fd); -	spin_unlock(&files->file_lock); -	retval = filp_close(filp, files); - +	retval = __close_fd(proc->files, fd);  	/* can't restart close syscall because file table entry was cleared */  	if (unlikely(retval == -ERESTARTSYS ||  		     retval == -ERESTARTNOINTR || @@ -493,10 +409,6 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)  		retval = -EINTR;  	return retval; - -out_unlock: -	spin_unlock(&files->file_lock); -	return -EBADF;  }  static void binder_set_nice(long nice) @@ -2793,6 +2705,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)  	const char *failure_string;  	struct binder_buffer *buffer; +	if (proc->tsk != current) +		return -EINVAL; +  	if ((vma->vm_end - vma->vm_start) > SZ_4M)  		vma->vm_end = vma->vm_start + SZ_4M; @@ -2857,7 +2772,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)  	binder_insert_free_buffer(proc, buffer);  	proc->free_async_space = proc->buffer_size / 2;  	barrier(); -	proc->files = get_files_struct(proc->tsk); +	proc->files = get_files_struct(current);  	proc->vma = vma;  	proc->vma_vm_mm = vma->vm_mm; diff --git a/drivers/staging/omapdrm/omap_gem.c b/drivers/staging/omapdrm/omap_gem.c index c8287438e0d..3434e6ec014 100644 --- 
a/drivers/staging/omapdrm/omap_gem.c +++ b/drivers/staging/omapdrm/omap_gem.c @@ -592,9 +592,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj,  		 * in particular in the case of mmap'd dmabufs)  		 */  		fput(vma->vm_file); -		get_file(obj->filp);  		vma->vm_pgoff = 0; -		vma->vm_file  = obj->filp; +		vma->vm_file  = get_file(obj->filp);  		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);  	} diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 8a5a8b06461..2ea176b2280 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1166,10 +1166,8 @@ ssize_t redirected_tty_write(struct file *file, const char __user *buf,  	struct file *p = NULL;  	spin_lock(&redirect_lock); -	if (redirect) { -		get_file(redirect); -		p = redirect; -	} +	if (redirect) +		p = get_file(redirect);  	spin_unlock(&redirect_lock);  	if (p) { @@ -2264,8 +2262,7 @@ static int tioccons(struct file *file)  		spin_unlock(&redirect_lock);  		return -EBUSY;  	} -	get_file(file); -	redirect = file; +	redirect = get_file(file);  	spin_unlock(&redirect_lock);  	return 0;  } @@ -2809,6 +2806,13 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,  }  #endif +static int this_tty(const void *t, struct file *file, unsigned fd) +{ +	if (likely(file->f_op->read != tty_read)) +		return 0; +	return file_tty(file) != t ? 0 : fd + 1; +} +	  /*   * This implements the "Secure Attention Key" ---  the idea is to   * prevent trojan horses by killing all processes associated with this @@ -2836,8 +2840,6 @@ void __do_SAK(struct tty_struct *tty)  	struct task_struct *g, *p;  	struct pid *session;  	int		i; -	struct file	*filp; -	struct fdtable *fdt;  	if (!tty)  		return; @@ -2867,27 +2869,12 @@ void __do_SAK(struct tty_struct *tty)  			continue;  		}  		task_lock(p); -		if (p->files) { -			/* -			 * We don't take a ref to the file, so we must -			 * hold ->file_lock instead. 
-			 */ -			spin_lock(&p->files->file_lock); -			fdt = files_fdtable(p->files); -			for (i = 0; i < fdt->max_fds; i++) { -				filp = fcheck_files(p->files, i); -				if (!filp) -					continue; -				if (filp->f_op->read == tty_read && -				    file_tty(filp) == tty) { -					printk(KERN_NOTICE "SAK: killed process %d" -					    " (%s): fd#%d opened to the tty\n", -					    task_pid_nr(p), p->comm, i); -					force_sig(SIGKILL, p); -					break; -				} -			} -			spin_unlock(&p->files->file_lock); +		i = iterate_fd(p->files, 0, this_tty, tty); +		if (i != 0) { +			printk(KERN_NOTICE "SAK: killed process %d" +			    " (%s): fd#%d opened to the tty\n", +				    task_pid_nr(p), p->comm, i - 1); +			force_sig(SIGKILL, p);  		}  		task_unlock(p);  	} while_each_thread(g, p); diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index a26c43a151f..64c4ec10d1f 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -340,7 +340,7 @@ ffs_sb_create_file(struct super_block *sb, const char *name, void *data,  static int ffs_mutex_lock(struct mutex *mutex, unsigned nonblock)  	__attribute__((warn_unused_result, nonnull)); -static char *ffs_prepare_buffer(const char * __user buf, size_t len) +static char *ffs_prepare_buffer(const char __user *buf, size_t len)  	__attribute__((warn_unused_result, nonnull)); @@ -2445,7 +2445,7 @@ static int ffs_mutex_lock(struct mutex *mutex, unsigned nonblock)  		: mutex_lock_interruptible(mutex);  } -static char *ffs_prepare_buffer(const char * __user buf, size_t len) +static char *ffs_prepare_buffer(const char __user *buf, size_t len)  {  	char *data; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 17830c9c7cc..56097c6d072 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1014,7 +1014,7 @@ static void vfio_group_try_dissolve_container(struct vfio_group *group)  static int vfio_group_set_container(struct vfio_group *group, int container_fd)  { -	struct file *filep; +	struct fd f;  	struct 
vfio_container *container;  	struct vfio_iommu_driver *driver;  	int ret = 0; @@ -1022,17 +1022,17 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)  	if (atomic_read(&group->container_users))  		return -EINVAL; -	filep = fget(container_fd); -	if (!filep) +	f = fdget(container_fd); +	if (!f.file)  		return -EBADF;  	/* Sanity check, is this really our fd? */ -	if (filep->f_op != &vfio_fops) { -		fput(filep); +	if (f.file->f_op != &vfio_fops) { +		fdput(f);  		return -EINVAL;  	} -	container = filep->private_data; +	container = f.file->private_data;  	WARN_ON(!container); /* fget ensures we don't race vfio_release */  	mutex_lock(&container->group_lock); @@ -1054,8 +1054,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)  unlock_out:  	mutex_unlock(&container->group_lock); -	fput(filep); - +	fdput(f);  	return ret;  } diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ef82a0d1848..99ac2cb08b4 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -636,8 +636,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)  static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)  { -	struct file *eventfp, *filep = NULL, -		    *pollstart = NULL, *pollstop = NULL; +	struct file *eventfp, *filep = NULL; +	bool pollstart = false, pollstop = false;  	struct eventfd_ctx *ctx = NULL;  	u32 __user *idxp = argp;  	struct vhost_virtqueue *vq; @@ -763,8 +763,8 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)  			break;  		}  		if (eventfp != vq->kick) { -			pollstop = filep = vq->kick; -			pollstart = vq->kick = eventfp; +			pollstop = (filep = vq->kick) != NULL; +			pollstart = (vq->kick = eventfp) != NULL;  		} else  			filep = eventfp;  		break; diff --git a/drivers/video/msm/mdp.c b/drivers/video/msm/mdp.c index d1f881e8030..2e0f3bab611 100644 --- a/drivers/video/msm/mdp.c +++ b/drivers/video/msm/mdp.c @@ 
-257,19 +257,17 @@ int get_img(struct mdp_img *img, struct fb_info *info,  	    unsigned long *start, unsigned long *len,  	    struct file **filep)  { -	int put_needed, ret = 0; -	struct file *file; - -	file = fget_light(img->memory_id, &put_needed); -	if (file == NULL) +	int ret = 0; +	struct fd f = fdget(img->memory_id); +	if (f.file == NULL)  		return -1; -	if (MAJOR(file->f_dentry->d_inode->i_rdev) == FB_MAJOR) { +	if (MAJOR(f.file->f_dentry->d_inode->i_rdev) == FB_MAJOR) {  		*start = info->fix.smem_start;  		*len = info->fix.smem_len;  	} else  		ret = -1; -	fput_light(file, put_needed); +	fdput(f);  	return ret;  } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index b85efa77394..392c5dac198 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -560,6 +560,11 @@ static int v9fs_init_inode_cache(void)   */  static void v9fs_destroy_inode_cache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(v9fs_inode_cache);  } diff --git a/fs/Makefile b/fs/Makefile index 2fb97793467..8938f825032 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \  		attr.o bad_inode.o file.o filesystems.o namespace.o \  		seq_file.o xattr.o libfs.o fs-writeback.o \  		pnode.o drop_caches.o splice.o sync.o utimes.o \ -		stack.o fs_struct.o statfs.o +		stack.o fs_struct.o statfs.o coredump.o  ifeq ($(CONFIG_BLOCK),y)  obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 22a0d7ed5fa..d5712293579 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -280,6 +280,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(adfs_inode_cachep);  } diff --git a/fs/affs/super.c b/fs/affs/super.c index 1f030825cd3..b84dc735250 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -147,6 +147,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(affs_inode_cachep);  } diff --git a/fs/afs/super.c b/fs/afs/super.c index df8c6047c2a..43165009428 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -123,6 +123,11 @@ void __exit afs_fs_exit(void)  		BUG();  	} +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(afs_inode_cachep);  	_leave("");  } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index abf645c1703..a16214109d3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p)  	return ino && ino->sbi->type & *(unsigned *)p;  } -static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) -{ -	struct files_struct *files = current->files; -	struct fdtable *fdt; - -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	BUG_ON(fdt->fd[fd] != NULL); -	rcu_assign_pointer(fdt->fd[fd], file); -	__set_close_on_exec(fd, fdt); -	spin_unlock(&files->file_lock); -} - -  /*   * Open a file descriptor on the autofs mount point corresponding   * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 
@@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)  {  	int err, fd; -	fd = get_unused_fd(); +	fd = get_unused_fd_flags(O_CLOEXEC);  	if (likely(fd >= 0)) {  		struct file *filp;  		struct path path; @@ -264,7 +250,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)  			goto out;  		} -		autofs_dev_ioctl_fd_install(fd, filp); +		fd_install(fd, filp);  	}  	return fd; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d38a7..dce436e595c 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -175,8 +175,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,  		return;  	} -	pipe = sbi->pipe; -	get_file(pipe); +	pipe = get_file(sbi->pipe);  	mutex_unlock(&sbi->wq_mutex); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7f73a692bfd..2b3bda8d5e6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -457,6 +457,11 @@ befs_init_inodecache(void)  static void  befs_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(befs_inode_cachep);  } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index b242beba58e..737aaa3f709 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -280,6 +280,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(bfs_inode_cachep);  } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1b52956afe3..0225fddf49b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1696,30 +1696,19 @@ static int elf_note_info_init(struct elf_note_info *info)  		return 0;  	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);  	if (!info->psinfo) -		goto notes_free; +		return 0;  	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);  	if (!info->prstatus) -		goto psinfo_free; +		return 0;  	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);  	if (!info->fpu) -		goto prstatus_free; +		return 0;  #ifdef ELF_CORE_COPY_XFPREGS  	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);  	if (!info->xfpu) -		goto fpu_free; +		return 0;  #endif  	return 1; -#ifdef ELF_CORE_COPY_XFPREGS - fpu_free: -	kfree(info->fpu); -#endif - prstatus_free: -	kfree(info->prstatus); - psinfo_free: -	kfree(info->psinfo); - notes_free: -	kfree(info->notes); -	return 0;  }  static int fill_note_info(struct elfhdr *elf, int phdrs, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c878476bb9..b08ea4717e9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -107,6 +107,12 @@ void extent_io_exit(void)  		list_del(&eb->leak_list);  		kmem_cache_free(extent_buffer_cache, eb);  	} + +	/* +	 * Make sure all delayed rcu free are flushed before we +	 * destroy caches. +	 */ +	rcu_barrier();  	if (extent_state_cache)  		kmem_cache_destroy(extent_state_cache);  	if (extent_buffer_cache) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2a028a58619..a6ed6944e50 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7076,6 +7076,11 @@ static void init_once(void *foo)  void btrfs_destroy_cachep(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	if (btrfs_inode_cachep)  		kmem_cache_destroy(btrfs_inode_cachep);  	if (btrfs_trans_handle_cachep) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 27bfce58da3..47127c1bd29 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1397,7 +1397,6 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  				u64 *transid, bool readonly,  				struct btrfs_qgroup_inherit **inherit)  { -	struct file *src_file;  	int namelen;  	int ret = 0; @@ -1421,25 +1420,24 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  		ret = btrfs_mksubvol(&file->f_path, name, namelen,  				     NULL, transid, readonly, inherit);  	} else { +		struct fd src = fdget(fd);  		struct inode *src_inode; -		src_file = fget(fd); -		if (!src_file) { +		if (!src.file) {  			ret = -EINVAL;  			goto out_drop_write;  		} -		src_inode = src_file->f_path.dentry->d_inode; +		src_inode = src.file->f_path.dentry->d_inode;  		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {  			printk(KERN_INFO "btrfs: Snapshot src from "  			       "another FS\n");  			ret = -EINVAL; -			fput(src_file); -			goto out_drop_write; +		} else { +			ret = btrfs_mksubvol(&file->f_path, name, namelen, +					     BTRFS_I(src_inode)->root, +					     transid, readonly, inherit);  		} -		ret = btrfs_mksubvol(&file->f_path, name, namelen, -				     BTRFS_I(src_inode)->root, -				     transid, readonly, inherit); -		fput(src_file); +		fdput(src);  	}  out_drop_write:  	mnt_drop_write_file(file); @@ -2341,7 +2339,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  {  	struct inode *inode = fdentry(file)->d_inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct file *src_file; +	struct fd src_file;  	struct inode *src;  	struct btrfs_trans_handle *trans;  	struct btrfs_path *path; @@ -2376,24 +2374,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  	if (ret)  		return ret; -	
src_file = fget(srcfd); -	if (!src_file) { +	src_file = fdget(srcfd); +	if (!src_file.file) {  		ret = -EBADF;  		goto out_drop_write;  	}  	ret = -EXDEV; -	if (src_file->f_path.mnt != file->f_path.mnt) +	if (src_file.file->f_path.mnt != file->f_path.mnt)  		goto out_fput; -	src = src_file->f_dentry->d_inode; +	src = src_file.file->f_dentry->d_inode;  	ret = -EINVAL;  	if (src == inode)  		goto out_fput;  	/* the src must be open for reading */ -	if (!(src_file->f_mode & FMODE_READ)) +	if (!(src_file.file->f_mode & FMODE_READ))  		goto out_fput;  	/* don't make the dst file partly checksummed */ @@ -2724,7 +2722,7 @@ out_unlock:  	vfree(buf);  	btrfs_free_path(path);  out_fput: -	fput(src_file); +	fdput(src_file);  out_drop_write:  	mnt_drop_write_file(file);  	return ret; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 48a4882d8ad..a955669519a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -68,7 +68,7 @@ struct reada_extent {  	u32			blocksize;  	int			err;  	struct list_head	extctl; -	struct kref		refcnt; +	int 			refcnt;  	spinlock_t		lock;  	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];  	int			nzones; @@ -126,7 +126,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,  	spin_lock(&fs_info->reada_lock);  	re = radix_tree_lookup(&fs_info->reada_tree, index);  	if (re) -		kref_get(&re->refcnt); +		re->refcnt++;  	spin_unlock(&fs_info->reada_lock);  	if (!re) @@ -336,7 +336,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	spin_lock(&fs_info->reada_lock);  	re = radix_tree_lookup(&fs_info->reada_tree, index);  	if (re) -		kref_get(&re->refcnt); +		re->refcnt++;  	spin_unlock(&fs_info->reada_lock);  	if (re) @@ -352,7 +352,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	re->top = *top;  	INIT_LIST_HEAD(&re->extctl);  	spin_lock_init(&re->lock); -	kref_init(&re->refcnt); +	re->refcnt = 1;  	/*  	 * map block @@ -398,7 +398,7 @@ static struct reada_extent 
*reada_find_extent(struct btrfs_root *root,  	if (ret == -EEXIST) {  		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);  		BUG_ON(!re_exist); -		kref_get(&re_exist->refcnt); +		re_exist->refcnt++;  		spin_unlock(&fs_info->reada_lock);  		goto error;  	} @@ -465,10 +465,6 @@ error:  	return re_exist;  } -static void reada_kref_dummy(struct kref *kr) -{ -} -  static void reada_extent_put(struct btrfs_fs_info *fs_info,  			     struct reada_extent *re)  { @@ -476,7 +472,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,  	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;  	spin_lock(&fs_info->reada_lock); -	if (!kref_put(&re->refcnt, reada_kref_dummy)) { +	if (--re->refcnt) {  		spin_unlock(&fs_info->reada_lock);  		return;  	} @@ -671,7 +667,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,  		return 0;  	}  	dev->reada_next = re->logical + re->blocksize; -	kref_get(&re->refcnt); +	re->refcnt++;  	spin_unlock(&fs_info->reada_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4b5762ef7c2..ba95eea201b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1104,7 +1104,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  				pr_err("fill_trace bad get_inode "  				       "%llx.%llx\n", vino.ino, vino.snap);  				err = PTR_ERR(in); -				d_delete(dn); +				d_drop(dn);  				goto done;  			}  			dn = splice_dentry(dn, in, &have_lease, true); @@ -1277,7 +1277,7 @@ retry_lookup:  			in = ceph_get_inode(parent->d_sb, vino);  			if (IS_ERR(in)) {  				dout("new_inode badness\n"); -				d_delete(dn); +				d_drop(dn);  				dput(dn);  				err = PTR_ERR(in);  				goto out; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b982239f38f..3a42d932637 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -603,6 +603,11 @@ bad_cap:  static void destroy_caches(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(ceph_inode_cachep);  	kmem_cache_destroy(ceph_cap_cachep);  	kmem_cache_destroy(ceph_dentry_cachep); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a41044a3108..e7931cc55d0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -968,6 +968,11 @@ cifs_init_inodecache(void)  static void  cifs_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(cifs_inode_cachep);  } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index f1813120d75..be2aa490948 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -85,6 +85,11 @@ int coda_init_inodecache(void)  void coda_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(coda_inode_cachep);  } @@ -107,43 +112,41 @@ static const struct super_operations coda_super_operations =  static int get_device_index(struct coda_mount_data *data)  { -	struct file *file; +	struct fd f;  	struct inode *inode;  	int idx; -	if(data == NULL) { +	if (data == NULL) {  		printk("coda_read_super: Bad mount data\n");  		return -1;  	} -	if(data->version != CODA_MOUNT_VERSION) { +	if (data->version != CODA_MOUNT_VERSION) {  		printk("coda_read_super: Bad mount version\n");  		return -1;  	} -	file = fget(data->fd); -	inode = NULL; -	if(file) -		inode = file->f_path.dentry->d_inode; -	 -	if(!inode || !S_ISCHR(inode->i_mode) || -	   imajor(inode) != CODA_PSDEV_MAJOR) { -		if(file) -			fput(file); - -		printk("coda_read_super: Bad file\n"); -		return -1; +	f = fdget(data->fd); +	if (!f.file) +		goto Ebadf; +	inode = f.file->f_path.dentry->d_inode; +	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { +		fdput(f); +		goto Ebadf;  	}  	idx = iminor(inode); -	fput(file); +	fdput(f); -	if(idx < 0 || idx >= MAX_CODADEVS) { +	if (idx < 0 || idx >= MAX_CODADEVS) {  		
printk("coda_read_super: Bad minor number\n");  		return -1;  	}  	return idx; +Ebadf: +	printk("coda_read_super: Bad file\n"); +	return -1;  }  static int coda_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/compat.c b/fs/compat.c index 1bdb350ea5d..b7a24d0ca30 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -870,22 +870,20 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,  	struct compat_old_linux_dirent __user *dirent, unsigned int count)  {  	int error; -	struct file *file; -	int fput_needed; +	struct fd f = fdget(fd);  	struct compat_readdir_callback buf; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF;  	buf.result = 0;  	buf.dirent = dirent; -	error = vfs_readdir(file, compat_fillonedir, &buf); +	error = vfs_readdir(f.file, compat_fillonedir, &buf);  	if (buf.result)  		error = buf.result; -	fput_light(file, fput_needed); +	fdput(f);  	return error;  } @@ -949,17 +947,16 @@ efault:  asmlinkage long compat_sys_getdents(unsigned int fd,  		struct compat_linux_dirent __user *dirent, unsigned int count)  { -	struct file * file; +	struct fd f;  	struct compat_linux_dirent __user * lastdirent;  	struct compat_getdents_callback buf; -	int fput_needed;  	int error;  	if (!access_ok(VERIFY_WRITE, dirent, count))  		return -EFAULT; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	buf.current_dir = dirent; @@ -967,17 +964,17 @@ asmlinkage long compat_sys_getdents(unsigned int fd,  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, compat_filldir, &buf); +	error = vfs_readdir(f.file, compat_filldir, &buf);  	if (error >= 0)  		error = buf.error;  	lastdirent = buf.previous;  	if (lastdirent) { -		if (put_user(file->f_pos, &lastdirent->d_off)) +		if (put_user(f.file->f_pos, &lastdirent->d_off))  			error = -EFAULT;  		else  			error = count - buf.count;  	} -	fput_light(file, fput_needed); +	fdput(f);  	return error;  } @@ -1035,17 
+1032,16 @@ efault:  asmlinkage long compat_sys_getdents64(unsigned int fd,  		struct linux_dirent64 __user * dirent, unsigned int count)  { -	struct file * file; +	struct fd f;  	struct linux_dirent64 __user * lastdirent;  	struct compat_getdents_callback64 buf; -	int fput_needed;  	int error;  	if (!access_ok(VERIFY_WRITE, dirent, count))  		return -EFAULT; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	buf.current_dir = dirent; @@ -1053,18 +1049,18 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, compat_filldir64, &buf); +	error = vfs_readdir(f.file, compat_filldir64, &buf);  	if (error >= 0)  		error = buf.error;  	lastdirent = buf.previous;  	if (lastdirent) { -		typeof(lastdirent->d_off) d_off = file->f_pos; +		typeof(lastdirent->d_off) d_off = f.file->f_pos;  		if (__put_user_unaligned(d_off, &lastdirent->d_off))  			error = -EFAULT;  		else  			error = count - buf.count;  	} -	fput_light(file, fput_needed); +	fdput(f);  	return error;  }  #endif /* ! 
__ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ @@ -1152,18 +1148,16 @@ asmlinkage ssize_t  compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,  		 unsigned long vlen)  { -	struct file *file; -	int fput_needed; +	struct fd f = fdget(fd);  	ssize_t ret;  	loff_t pos; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF; -	pos = file->f_pos; -	ret = compat_readv(file, vec, vlen, &pos); -	file->f_pos = pos; -	fput_light(file, fput_needed); +	pos = f.file->f_pos; +	ret = compat_readv(f.file, vec, vlen, &pos); +	f.file->f_pos = pos; +	fdput(f);  	return ret;  } @@ -1171,19 +1165,18 @@ asmlinkage ssize_t  compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,  		    unsigned long vlen, loff_t pos)  { -	struct file *file; -	int fput_needed; +	struct fd f;  	ssize_t ret;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	ret = -ESPIPE; -	if (file->f_mode & FMODE_PREAD) -		ret = compat_readv(file, vec, vlen, &pos); -	fput_light(file, fput_needed); +	if (f.file->f_mode & FMODE_PREAD) +		ret = compat_readv(f.file, vec, vlen, &pos); +	fdput(f);  	return ret;  } @@ -1221,18 +1214,16 @@ asmlinkage ssize_t  compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,  		  unsigned long vlen)  { -	struct file *file; -	int fput_needed; +	struct fd f = fdget(fd);  	ssize_t ret;  	loff_t pos; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF; -	pos = file->f_pos; -	ret = compat_writev(file, vec, vlen, &pos); -	file->f_pos = pos; -	fput_light(file, fput_needed); +	pos = f.file->f_pos; +	ret = compat_writev(f.file, vec, vlen, &pos); +	f.file->f_pos = pos; +	fdput(f);  	return ret;  } @@ -1240,19 +1231,18 @@ asmlinkage ssize_t  compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,  		     unsigned long vlen, loff_t pos)  { -	struct file *file; -	int fput_needed; +	
struct fd f;  	ssize_t ret;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	ret = -ESPIPE; -	if (file->f_mode & FMODE_PWRITE) -		ret = compat_writev(file, vec, vlen, &pos); -	fput_light(file, fput_needed); +	if (f.file->f_mode & FMODE_PWRITE) +		ret = compat_writev(f.file, vec, vlen, &pos); +	fdput(f);  	return ret;  } @@ -1802,3 +1792,25 @@ compat_sys_open_by_handle_at(int mountdirfd,  	return do_handle_open(mountdirfd, handle, flags);  }  #endif + +#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE +asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, +				    compat_off_t __user *offset, compat_size_t count) +{ +	loff_t pos; +	off_t off; +	ssize_t ret; + +	if (offset) { +		if (unlikely(get_user(off, offset))) +			return -EFAULT; +		pos = off; +		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); +		if (unlikely(put_user(pos, offset))) +			return -EFAULT; +		return ret; +	} + +	return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9c03a3ae898..f5054025f9d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1539,16 +1539,13 @@ static int compat_ioctl_check_table(unsigned int xcmd)  asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,  				unsigned long arg)  { -	struct file *filp; +	struct fd f = fdget(fd);  	int error = -EBADF; -	int fput_needed; - -	filp = fget_light(fd, &fput_needed); -	if (!filp) +	if (!f.file)  		goto out;  	/* RED-PEN how should LSM module know it's handling 32bit? 
*/ -	error = security_file_ioctl(filp, cmd, arg); +	error = security_file_ioctl(f.file, cmd, arg);  	if (error)  		goto out_fput; @@ -1568,30 +1565,30 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,  #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)  	case FS_IOC_RESVSP_32:  	case FS_IOC_RESVSP64_32: -		error = compat_ioctl_preallocate(filp, compat_ptr(arg)); +		error = compat_ioctl_preallocate(f.file, compat_ptr(arg));  		goto out_fput;  #else  	case FS_IOC_RESVSP:  	case FS_IOC_RESVSP64: -		error = ioctl_preallocate(filp, compat_ptr(arg)); +		error = ioctl_preallocate(f.file, compat_ptr(arg));  		goto out_fput;  #endif  	case FIBMAP:  	case FIGETBSZ:  	case FIONREAD: -		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) +		if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))  			break;  		/*FALL THROUGH*/  	default: -		if (filp->f_op && filp->f_op->compat_ioctl) { -			error = filp->f_op->compat_ioctl(filp, cmd, arg); +		if (f.file->f_op && f.file->f_op->compat_ioctl) { +			error = f.file->f_op->compat_ioctl(f.file, cmd, arg);  			if (error != -ENOIOCTLCMD)  				goto out_fput;  		} -		if (!filp->f_op || !filp->f_op->unlocked_ioctl) +		if (!f.file->f_op || !f.file->f_op->unlocked_ioctl)  			goto do_ioctl;  		break;  	} @@ -1599,7 +1596,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,  	if (compat_ioctl_check_table(XFORM(cmd)))  		goto found_handler; -	error = do_ioctl_trans(fd, cmd, arg, filp); +	error = do_ioctl_trans(fd, cmd, arg, f.file);  	if (error == -ENOIOCTLCMD)  		error = -ENOTTY; @@ -1608,9 +1605,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,   found_handler:  	arg = (unsigned long)compat_ptr(arg);   do_ioctl: -	error = do_vfs_ioctl(filp, fd, cmd, arg); +	error = do_vfs_ioctl(f.file, fd, cmd, arg);   out_fput: -	fput_light(filp, fput_needed); +	fdput(f);   out:  	return error;  } diff --git a/fs/coredump.c b/fs/coredump.c new file mode 100644 index 00000000000..f045bbad682 --- 
/dev/null +++ b/fs/coredump.c @@ -0,0 +1,686 @@ +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#include <linux/swap.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/perf_event.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> +#include <linux/key.h> +#include <linux/personality.h> +#include <linux/binfmts.h> +#include <linux/utsname.h> +#include <linux/pid_namespace.h> +#include <linux/module.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/tsacct_kern.h> +#include <linux/cn_proc.h> +#include <linux/audit.h> +#include <linux/tracehook.h> +#include <linux/kmod.h> +#include <linux/fsnotify.h> +#include <linux/fs_struct.h> +#include <linux/pipe_fs_i.h> +#include <linux/oom.h> +#include <linux/compat.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/tlb.h> +#include <asm/exec.h> + +#include <trace/events/task.h> +#include "internal.h" + +#include <trace/events/sched.h> + +int core_uses_pid; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; + +struct core_name { +	char *corename; +	int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + +/* The maximal length of core_pattern is also specified in sysctl.c */ + +static int expand_corename(struct core_name *cn) +{ +	char *old_corename = cn->corename; + +	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); +	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + +	if (!cn->corename) { +		kfree(old_corename); +		return -ENOMEM; +	} + +	return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) 
+{ +	char *cur; +	int need; +	int ret; +	va_list arg; + +	va_start(arg, fmt); +	need = vsnprintf(NULL, 0, fmt, arg); +	va_end(arg); + +	if (likely(need < cn->size - cn->used - 1)) +		goto out_printf; + +	ret = expand_corename(cn); +	if (ret) +		goto expand_fail; + +out_printf: +	cur = cn->corename + cn->used; +	va_start(arg, fmt); +	vsnprintf(cur, need + 1, fmt, arg); +	va_end(arg); +	cn->used += need; +	return 0; + +expand_fail: +	return ret; +} + +static void cn_escape(char *str) +{ +	for (; *str; str++) +		if (*str == '/') +			*str = '!'; +} + +static int cn_print_exe_file(struct core_name *cn) +{ +	struct file *exe_file; +	char *pathbuf, *path; +	int ret; + +	exe_file = get_mm_exe_file(current->mm); +	if (!exe_file) { +		char *commstart = cn->corename + cn->used; +		ret = cn_printf(cn, "%s (path unknown)", current->comm); +		cn_escape(commstart); +		return ret; +	} + +	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); +	if (!pathbuf) { +		ret = -ENOMEM; +		goto put_exe_file; +	} + +	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); +	if (IS_ERR(path)) { +		ret = PTR_ERR(path); +		goto free_buf; +	} + +	cn_escape(path); + +	ret = cn_printf(cn, "%s", path); + +free_buf: +	kfree(pathbuf); +put_exe_file: +	fput(exe_file); +	return ret; +} + +/* format_corename will inspect the pattern parameter, and output a + * name into corename, which must have space for at least + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 
+ */ +static int format_corename(struct core_name *cn, long signr) +{ +	const struct cred *cred = current_cred(); +	const char *pat_ptr = core_pattern; +	int ispipe = (*pat_ptr == '|'); +	int pid_in_pattern = 0; +	int err = 0; + +	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); +	cn->corename = kmalloc(cn->size, GFP_KERNEL); +	cn->used = 0; + +	if (!cn->corename) +		return -ENOMEM; + +	/* Repeat as long as we have more pattern to process and more output +	   space */ +	while (*pat_ptr) { +		if (*pat_ptr != '%') { +			if (*pat_ptr == 0) +				goto out; +			err = cn_printf(cn, "%c", *pat_ptr++); +		} else { +			switch (*++pat_ptr) { +			/* single % at the end, drop that */ +			case 0: +				goto out; +			/* Double percent, output one percent */ +			case '%': +				err = cn_printf(cn, "%c", '%'); +				break; +			/* pid */ +			case 'p': +				pid_in_pattern = 1; +				err = cn_printf(cn, "%d", +					      task_tgid_vnr(current)); +				break; +			/* uid */ +			case 'u': +				err = cn_printf(cn, "%d", cred->uid); +				break; +			/* gid */ +			case 'g': +				err = cn_printf(cn, "%d", cred->gid); +				break; +			/* signal that caused the coredump */ +			case 's': +				err = cn_printf(cn, "%ld", signr); +				break; +			/* UNIX time of coredump */ +			case 't': { +				struct timeval tv; +				do_gettimeofday(&tv); +				err = cn_printf(cn, "%lu", tv.tv_sec); +				break; +			} +			/* hostname */ +			case 'h': { +				char *namestart = cn->corename + cn->used; +				down_read(&uts_sem); +				err = cn_printf(cn, "%s", +					      utsname()->nodename); +				up_read(&uts_sem); +				cn_escape(namestart); +				break; +			} +			/* executable */ +			case 'e': { +				char *commstart = cn->corename + cn->used; +				err = cn_printf(cn, "%s", current->comm); +				cn_escape(commstart); +				break; +			} +			case 'E': +				err = cn_print_exe_file(cn); +				break; +			/* core limit size */ +			case 'c': +				err = cn_printf(cn, "%lu", +					      rlimit(RLIMIT_CORE)); +				break; +			
default: +				break; +			} +			++pat_ptr; +		} + +		if (err) +			return err; +	} + +	/* Backward compatibility with core_uses_pid: +	 * +	 * If core_pattern does not include a %p (as is the default) +	 * and core_uses_pid is set, then .%pid will be appended to +	 * the filename. Do not do this for piped commands. */ +	if (!ispipe && !pid_in_pattern && core_uses_pid) { +		err = cn_printf(cn, ".%d", task_tgid_vnr(current)); +		if (err) +			return err; +	} +out: +	return ispipe; +} + +static int zap_process(struct task_struct *start, int exit_code) +{ +	struct task_struct *t; +	int nr = 0; + +	start->signal->flags = SIGNAL_GROUP_EXIT; +	start->signal->group_exit_code = exit_code; +	start->signal->group_stop_count = 0; + +	t = start; +	do { +		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); +		if (t != current && t->mm) { +			sigaddset(&t->pending.signal, SIGKILL); +			signal_wake_up(t, 1); +			nr++; +		} +	} while_each_thread(start, t); + +	return nr; +} + +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, +				struct core_state *core_state, int exit_code) +{ +	struct task_struct *g, *p; +	unsigned long flags; +	int nr = -EAGAIN; + +	spin_lock_irq(&tsk->sighand->siglock); +	if (!signal_group_exit(tsk->signal)) { +		mm->core_state = core_state; +		nr = zap_process(tsk, exit_code); +	} +	spin_unlock_irq(&tsk->sighand->siglock); +	if (unlikely(nr < 0)) +		return nr; + +	if (atomic_read(&mm->mm_users) == nr + 1) +		goto done; +	/* +	 * We should find and kill all tasks which use this mm, and we should +	 * count them correctly into ->nr_threads. We don't take tasklist +	 * lock, but this is safe wrt: +	 * +	 * fork: +	 *	None of sub-threads can fork after zap_process(leader). All +	 *	processes which were created before this point should be +	 *	visible to zap_threads() because copy_process() adds the new +	 *	process to the tail of init_task.tasks list, and lock/unlock +	 *	of ->siglock provides a memory barrier. 
+	 * +	 * do_exit: +	 *	The caller holds mm->mmap_sem. This means that the task which +	 *	uses this mm can't pass exit_mm(), so it can't exit or clear +	 *	its ->mm. +	 * +	 * de_thread: +	 *	It does list_replace_rcu(&leader->tasks, &current->tasks), +	 *	we must see either old or new leader, this does not matter. +	 *	However, it can change p->sighand, so lock_task_sighand(p) +	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem +	 *	it can't fail. +	 * +	 *	Note also that "g" can be the old leader with ->mm == NULL +	 *	and already unhashed and thus removed from ->thread_group. +	 *	This is OK, __unhash_process()->list_del_rcu() does not +	 *	clear the ->next pointer, we will find the new leader via +	 *	next_thread(). +	 */ +	rcu_read_lock(); +	for_each_process(g) { +		if (g == tsk->group_leader) +			continue; +		if (g->flags & PF_KTHREAD) +			continue; +		p = g; +		do { +			if (p->mm) { +				if (unlikely(p->mm == mm)) { +					lock_task_sighand(p, &flags); +					nr += zap_process(p, exit_code); +					unlock_task_sighand(p, &flags); +				} +				break; +			} +		} while_each_thread(g, p); +	} +	rcu_read_unlock(); +done: +	atomic_set(&core_state->nr_threads, nr); +	return nr; +} + +static int coredump_wait(int exit_code, struct core_state *core_state) +{ +	struct task_struct *tsk = current; +	struct mm_struct *mm = tsk->mm; +	int core_waiters = -EBUSY; + +	init_completion(&core_state->startup); +	core_state->dumper.task = tsk; +	core_state->dumper.next = NULL; + +	down_write(&mm->mmap_sem); +	if (!mm->core_state) +		core_waiters = zap_threads(tsk, mm, core_state, exit_code); +	up_write(&mm->mmap_sem); + +	if (core_waiters > 0) { +		struct core_thread *ptr; + +		wait_for_completion(&core_state->startup); +		/* +		 * Wait for all the threads to become inactive, so that +		 * all the thread context (extended register state, like +		 * fpu etc) gets copied to the memory. 
+		 */ +		ptr = core_state->dumper.next; +		while (ptr != NULL) { +			wait_task_inactive(ptr->task, 0); +			ptr = ptr->next; +		} +	} + +	return core_waiters; +} + +static void coredump_finish(struct mm_struct *mm) +{ +	struct core_thread *curr, *next; +	struct task_struct *task; + +	next = mm->core_state->dumper.next; +	while ((curr = next) != NULL) { +		next = curr->next; +		task = curr->task; +		/* +		 * see exit_mm(), curr->task must not see +		 * ->task == NULL before we read ->next. +		 */ +		smp_mb(); +		curr->task = NULL; +		wake_up_process(task); +	} + +	mm->core_state = NULL; +} + +static void wait_for_dump_helpers(struct file *file) +{ +	struct pipe_inode_info *pipe; + +	pipe = file->f_path.dentry->d_inode->i_pipe; + +	pipe_lock(pipe); +	pipe->readers++; +	pipe->writers--; + +	while ((pipe->readers > 1) && (!signal_pending(current))) { +		wake_up_interruptible_sync(&pipe->wait); +		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +		pipe_wait(pipe); +	} + +	pipe->readers--; +	pipe->writers++; +	pipe_unlock(pipe); + +} + +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace.  Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process.  Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1.  
This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ +	struct file *files[2]; +	struct coredump_params *cp = (struct coredump_params *)info->data; +	int err = create_pipe_files(files, 0); +	if (err) +		return err; + +	cp->file = files[1]; + +	replace_fd(0, files[0], 0); +	/* and disallow core files too */ +	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + +	return 0; +} + +void do_coredump(long signr, int exit_code, struct pt_regs *regs) +{ +	struct core_state core_state; +	struct core_name cn; +	struct mm_struct *mm = current->mm; +	struct linux_binfmt * binfmt; +	const struct cred *old_cred; +	struct cred *cred; +	int retval = 0; +	int flag = 0; +	int ispipe; +	struct files_struct *displaced; +	bool need_nonrelative = false; +	static atomic_t core_dump_count = ATOMIC_INIT(0); +	struct coredump_params cprm = { +		.signr = signr, +		.regs = regs, +		.limit = rlimit(RLIMIT_CORE), +		/* +		 * We must use the same mm->flags while dumping core to avoid +		 * inconsistency of bit flags, since this flag is not protected +		 * by any locks. +		 */ +		.mm_flags = mm->flags, +	}; + +	audit_core_dumps(signr); + +	binfmt = mm->binfmt; +	if (!binfmt || !binfmt->core_dump) +		goto fail; +	if (!__get_dumpable(cprm.mm_flags)) +		goto fail; + +	cred = prepare_creds(); +	if (!cred) +		goto fail; +	/* +	 * We cannot trust fsuid as being the "true" uid of the process +	 * nor do we know its entire history. We only know it was tainted +	 * so we dump it as root in mode 2, and only into a controlled +	 * environment (pipe handler or fully qualified path). 
+	 */ +	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { +		/* Setuid core dump mode */ +		flag = O_EXCL;		/* Stop rewrite attacks */ +		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */ +		need_nonrelative = true; +	} + +	retval = coredump_wait(exit_code, &core_state); +	if (retval < 0) +		goto fail_creds; + +	old_cred = override_creds(cred); + +	/* +	 * Clear any false indication of pending signals that might +	 * be seen by the filesystem code called to write the core file. +	 */ +	clear_thread_flag(TIF_SIGPENDING); + +	ispipe = format_corename(&cn, signr); + + 	if (ispipe) { +		int dump_count; +		char **helper_argv; + +		if (ispipe < 0) { +			printk(KERN_WARNING "format_corename failed\n"); +			printk(KERN_WARNING "Aborting core\n"); +			goto fail_corename; +		} + +		if (cprm.limit == 1) { +			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1. +			 * +			 * Normally core limits are irrelevant to pipes, since +			 * we're not writing to the file system, but we use +			 * cprm.limit of 1 here as a speacial value, this is a +			 * consistent way to catch recursive crashes. +			 * We can still crash if the core_pattern binary sets +			 * RLIM_CORE = !1, but it runs as root, and can do +			 * lots of stupid things. +			 * +			 * Note that we use task_tgid_vnr here to grab the pid +			 * of the process group leader.  That way we get the +			 * right pid if a thread in a multi-threaded +			 * core_pattern process dies. 
+			 */ +			printk(KERN_WARNING +				"Process %d(%s) has RLIMIT_CORE set to 1\n", +				task_tgid_vnr(current), current->comm); +			printk(KERN_WARNING "Aborting core\n"); +			goto fail_unlock; +		} +		cprm.limit = RLIM_INFINITY; + +		dump_count = atomic_inc_return(&core_dump_count); +		if (core_pipe_limit && (core_pipe_limit < dump_count)) { +			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", +			       task_tgid_vnr(current), current->comm); +			printk(KERN_WARNING "Skipping core dump\n"); +			goto fail_dropcount; +		} + +		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); +		if (!helper_argv) { +			printk(KERN_WARNING "%s failed to allocate memory\n", +			       __func__); +			goto fail_dropcount; +		} + +		retval = call_usermodehelper_fns(helper_argv[0], helper_argv, +					NULL, UMH_WAIT_EXEC, umh_pipe_setup, +					NULL, &cprm); +		argv_free(helper_argv); +		if (retval) { + 			printk(KERN_INFO "Core dump to %s pipe failed\n", +			       cn.corename); +			goto close_fail; + 		} +	} else { +		struct inode *inode; + +		if (cprm.limit < binfmt->min_coredump) +			goto fail_unlock; + +		if (need_nonrelative && cn.corename[0] != '/') { +			printk(KERN_WARNING "Pid %d(%s) can only dump core "\ +				"to fully qualified path!\n", +				task_tgid_vnr(current), current->comm); +			printk(KERN_WARNING "Skipping core dump\n"); +			goto fail_unlock; +		} + +		cprm.file = filp_open(cn.corename, +				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, +				 0600); +		if (IS_ERR(cprm.file)) +			goto fail_unlock; + +		inode = cprm.file->f_path.dentry->d_inode; +		if (inode->i_nlink > 1) +			goto close_fail; +		if (d_unhashed(cprm.file->f_path.dentry)) +			goto close_fail; +		/* +		 * AK: actually i see no reason to not allow this for named +		 * pipes etc, but keep the previous behaviour for now. +		 */ +		if (!S_ISREG(inode->i_mode)) +			goto close_fail; +		/* +		 * Dont allow local users get cute and trick others to coredump +		 * into their pre-created files. 
+		 */ +		if (!uid_eq(inode->i_uid, current_fsuid())) +			goto close_fail; +		if (!cprm.file->f_op || !cprm.file->f_op->write) +			goto close_fail; +		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) +			goto close_fail; +	} + +	/* get us an unshared descriptor table; almost always a no-op */ +	retval = unshare_files(&displaced); +	if (retval) +		goto close_fail; +	if (displaced) +		put_files_struct(displaced); +	retval = binfmt->core_dump(&cprm); +	if (retval) +		current->signal->group_exit_code |= 0x80; + +	if (ispipe && core_pipe_limit) +		wait_for_dump_helpers(cprm.file); +close_fail: +	if (cprm.file) +		filp_close(cprm.file, NULL); +fail_dropcount: +	if (ispipe) +		atomic_dec(&core_dump_count); +fail_unlock: +	kfree(cn.corename); +fail_corename: +	coredump_finish(mm); +	revert_creds(old_cred); +fail_creds: +	put_cred(cred); +fail: +	return; +} + +/* + * Core dumping helper functions.  These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. 
+ */ +int dump_write(struct file *file, const void *addr, int nr) +{ +	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ +	int ret = 1; + +	if (file->f_op->llseek && file->f_op->llseek != no_llseek) { +		if (file->f_op->llseek(file, off, SEEK_CUR) < 0) +			return 0; +	} else { +		char *buf = (char *)get_zeroed_page(GFP_KERNEL); + +		if (!buf) +			return 0; +		while (off > 0) { +			unsigned long n = off; + +			if (n > PAGE_SIZE) +				n = PAGE_SIZE; +			if (!dump_write(file, buf, n)) { +				ret = 0; +				break; +			} +			off -= n; +		} +		free_page((unsigned long)buf); +	} +	return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/dcache.c b/fs/dcache.c index 693f95bf1ca..3a463d0c4fe 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2113,7 +2113,7 @@ again:  	inode = dentry->d_inode;  	isdir = S_ISDIR(inode->i_mode);  	if (dentry->d_count == 1) { -		if (inode && !spin_trylock(&inode->i_lock)) { +		if (!spin_trylock(&inode->i_lock)) {  			spin_unlock(&dentry->d_lock);  			cpu_relax();  			goto again; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 24bb043e50d..4e0886c9e5c 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -711,6 +711,12 @@ static void ecryptfs_free_kmem_caches(void)  {  	int i; +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier(); +  	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {  		struct ecryptfs_cache_info *info; diff --git a/fs/efs/super.c b/fs/efs/super.c index e755ec746c6..2002431ef9a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -96,6 +96,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(efs_inode_cachep);  } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eedec84c180..cd96649bfe6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1810,7 +1810,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,  		int, maxevents, int, timeout)  {  	int error; -	struct file *file; +	struct fd f;  	struct eventpoll *ep;  	/* The maximum number of event must be greater than zero */ @@ -1818,38 +1818,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,  		return -EINVAL;  	/* Verify that the area passed by the user is writeable */ -	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { -		error = -EFAULT; -		goto error_return; -	} +	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) +		return -EFAULT;  	/* Get the "struct file *" for the eventpoll file */ -	error = -EBADF; -	file = fget(epfd); -	if (!file) -		goto error_return; +	f = fdget(epfd); +	if (!f.file) +		return -EBADF;  	/*  	 * We have to check that the file structure underneath the fd  	 * the user passed to us _is_ an eventpoll file.  	 */  	error = -EINVAL; -	if (!is_file_epoll(file)) +	if (!is_file_epoll(f.file))  		goto error_fput;  	/*  	 * At this point it is safe to assume that the "private_data" contains  	 * our own data structure.  	 */ -	ep = file->private_data; +	ep = f.file->private_data;  	/* Time to fish for events ... 
*/  	error = ep_poll(ep, events, maxevents, timeout);  error_fput: -	fput(file); -error_return: - +	fdput(f);  	return error;  } diff --git a/fs/exec.c b/fs/exec.c index 574cf4de4ec..48fb26ef8a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,19 +66,8 @@  #include <trace/events/sched.h> -int core_uses_pid; -char core_pattern[CORENAME_MAX_SIZE] = "core"; -unsigned int core_pipe_limit;  int suid_dumpable = 0; -struct core_name { -	char *corename; -	int used, size; -}; -static atomic_t call_count = ATOMIC_INIT(1); - -/* The maximal length of core_pattern is also specified in sysctl.c */ -  static LIST_HEAD(formats);  static DEFINE_RWLOCK(binfmt_lock); @@ -1006,40 +995,6 @@ no_thread_group:  	return 0;  } -/* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started - */ -static void flush_old_files(struct files_struct * files) -{ -	long j = -1; -	struct fdtable *fdt; - -	spin_lock(&files->file_lock); -	for (;;) { -		unsigned long set, i; - -		j++; -		i = j * BITS_PER_LONG; -		fdt = files_fdtable(files); -		if (i >= fdt->max_fds) -			break; -		set = fdt->close_on_exec[j]; -		if (!set) -			continue; -		fdt->close_on_exec[j] = 0; -		spin_unlock(&files->file_lock); -		for ( ; set ; i++,set >>= 1) { -			if (set & 1) { -				sys_close(i); -			} -		} -		spin_lock(&files->file_lock); - -	} -	spin_unlock(&files->file_lock); -} -  char *get_task_comm(char *buf, struct task_struct *tsk)  {  	/* buf must be at least sizeof(tsk->comm) in size */ @@ -1050,6 +1005,11 @@ char *get_task_comm(char *buf, struct task_struct *tsk)  }  EXPORT_SYMBOL_GPL(get_task_comm); +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ +  void set_task_comm(struct task_struct *tsk, char *buf)  {  	task_lock(tsk); @@ -1171,7 +1131,7 @@ void setup_new_exec(struct linux_binprm * bprm)  	current->self_exec_id++;  	flush_signal_handlers(current, 0); -	flush_old_files(current->files); 
+	do_close_on_exec(current->files);  }  EXPORT_SYMBOL(setup_new_exec); @@ -1632,353 +1592,6 @@ void set_binfmt(struct linux_binfmt *new)  EXPORT_SYMBOL(set_binfmt); -static int expand_corename(struct core_name *cn) -{ -	char *old_corename = cn->corename; - -	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); -	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); - -	if (!cn->corename) { -		kfree(old_corename); -		return -ENOMEM; -	} - -	return 0; -} - -static int cn_printf(struct core_name *cn, const char *fmt, ...) -{ -	char *cur; -	int need; -	int ret; -	va_list arg; - -	va_start(arg, fmt); -	need = vsnprintf(NULL, 0, fmt, arg); -	va_end(arg); - -	if (likely(need < cn->size - cn->used - 1)) -		goto out_printf; - -	ret = expand_corename(cn); -	if (ret) -		goto expand_fail; - -out_printf: -	cur = cn->corename + cn->used; -	va_start(arg, fmt); -	vsnprintf(cur, need + 1, fmt, arg); -	va_end(arg); -	cn->used += need; -	return 0; - -expand_fail: -	return ret; -} - -static void cn_escape(char *str) -{ -	for (; *str; str++) -		if (*str == '/') -			*str = '!'; -} - -static int cn_print_exe_file(struct core_name *cn) -{ -	struct file *exe_file; -	char *pathbuf, *path; -	int ret; - -	exe_file = get_mm_exe_file(current->mm); -	if (!exe_file) { -		char *commstart = cn->corename + cn->used; -		ret = cn_printf(cn, "%s (path unknown)", current->comm); -		cn_escape(commstart); -		return ret; -	} - -	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); -	if (!pathbuf) { -		ret = -ENOMEM; -		goto put_exe_file; -	} - -	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); -	if (IS_ERR(path)) { -		ret = PTR_ERR(path); -		goto free_buf; -	} - -	cn_escape(path); - -	ret = cn_printf(cn, "%s", path); - -free_buf: -	kfree(pathbuf); -put_exe_file: -	fput(exe_file); -	return ret; -} - -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 
- */ -static int format_corename(struct core_name *cn, long signr) -{ -	const struct cred *cred = current_cred(); -	const char *pat_ptr = core_pattern; -	int ispipe = (*pat_ptr == '|'); -	int pid_in_pattern = 0; -	int err = 0; - -	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); -	cn->corename = kmalloc(cn->size, GFP_KERNEL); -	cn->used = 0; - -	if (!cn->corename) -		return -ENOMEM; - -	/* Repeat as long as we have more pattern to process and more output -	   space */ -	while (*pat_ptr) { -		if (*pat_ptr != '%') { -			if (*pat_ptr == 0) -				goto out; -			err = cn_printf(cn, "%c", *pat_ptr++); -		} else { -			switch (*++pat_ptr) { -			/* single % at the end, drop that */ -			case 0: -				goto out; -			/* Double percent, output one percent */ -			case '%': -				err = cn_printf(cn, "%c", '%'); -				break; -			/* pid */ -			case 'p': -				pid_in_pattern = 1; -				err = cn_printf(cn, "%d", -					      task_tgid_vnr(current)); -				break; -			/* uid */ -			case 'u': -				err = cn_printf(cn, "%d", cred->uid); -				break; -			/* gid */ -			case 'g': -				err = cn_printf(cn, "%d", cred->gid); -				break; -			/* signal that caused the coredump */ -			case 's': -				err = cn_printf(cn, "%ld", signr); -				break; -			/* UNIX time of coredump */ -			case 't': { -				struct timeval tv; -				do_gettimeofday(&tv); -				err = cn_printf(cn, "%lu", tv.tv_sec); -				break; -			} -			/* hostname */ -			case 'h': { -				char *namestart = cn->corename + cn->used; -				down_read(&uts_sem); -				err = cn_printf(cn, "%s", -					      utsname()->nodename); -				up_read(&uts_sem); -				cn_escape(namestart); -				break; -			} -			/* executable */ -			case 'e': { -				char *commstart = cn->corename + cn->used; -				err = cn_printf(cn, "%s", current->comm); -				cn_escape(commstart); -				break; -			} -			case 'E': -				err = cn_print_exe_file(cn); -				break; -			/* core limit size */ -			case 'c': -				err = cn_printf(cn, "%lu", -					      rlimit(RLIMIT_CORE)); -				break; -			
default: -				break; -			} -			++pat_ptr; -		} - -		if (err) -			return err; -	} - -	/* Backward compatibility with core_uses_pid: -	 * -	 * If core_pattern does not include a %p (as is the default) -	 * and core_uses_pid is set, then .%pid will be appended to -	 * the filename. Do not do this for piped commands. */ -	if (!ispipe && !pid_in_pattern && core_uses_pid) { -		err = cn_printf(cn, ".%d", task_tgid_vnr(current)); -		if (err) -			return err; -	} -out: -	return ispipe; -} - -static int zap_process(struct task_struct *start, int exit_code) -{ -	struct task_struct *t; -	int nr = 0; - -	start->signal->flags = SIGNAL_GROUP_EXIT; -	start->signal->group_exit_code = exit_code; -	start->signal->group_stop_count = 0; - -	t = start; -	do { -		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); -		if (t != current && t->mm) { -			sigaddset(&t->pending.signal, SIGKILL); -			signal_wake_up(t, 1); -			nr++; -		} -	} while_each_thread(start, t); - -	return nr; -} - -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, -				struct core_state *core_state, int exit_code) -{ -	struct task_struct *g, *p; -	unsigned long flags; -	int nr = -EAGAIN; - -	spin_lock_irq(&tsk->sighand->siglock); -	if (!signal_group_exit(tsk->signal)) { -		mm->core_state = core_state; -		nr = zap_process(tsk, exit_code); -	} -	spin_unlock_irq(&tsk->sighand->siglock); -	if (unlikely(nr < 0)) -		return nr; - -	if (atomic_read(&mm->mm_users) == nr + 1) -		goto done; -	/* -	 * We should find and kill all tasks which use this mm, and we should -	 * count them correctly into ->nr_threads. We don't take tasklist -	 * lock, but this is safe wrt: -	 * -	 * fork: -	 *	None of sub-threads can fork after zap_process(leader). All -	 *	processes which were created before this point should be -	 *	visible to zap_threads() because copy_process() adds the new -	 *	process to the tail of init_task.tasks list, and lock/unlock -	 *	of ->siglock provides a memory barrier. 
-	 * -	 * do_exit: -	 *	The caller holds mm->mmap_sem. This means that the task which -	 *	uses this mm can't pass exit_mm(), so it can't exit or clear -	 *	its ->mm. -	 * -	 * de_thread: -	 *	It does list_replace_rcu(&leader->tasks, &current->tasks), -	 *	we must see either old or new leader, this does not matter. -	 *	However, it can change p->sighand, so lock_task_sighand(p) -	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem -	 *	it can't fail. -	 * -	 *	Note also that "g" can be the old leader with ->mm == NULL -	 *	and already unhashed and thus removed from ->thread_group. -	 *	This is OK, __unhash_process()->list_del_rcu() does not -	 *	clear the ->next pointer, we will find the new leader via -	 *	next_thread(). -	 */ -	rcu_read_lock(); -	for_each_process(g) { -		if (g == tsk->group_leader) -			continue; -		if (g->flags & PF_KTHREAD) -			continue; -		p = g; -		do { -			if (p->mm) { -				if (unlikely(p->mm == mm)) { -					lock_task_sighand(p, &flags); -					nr += zap_process(p, exit_code); -					unlock_task_sighand(p, &flags); -				} -				break; -			} -		} while_each_thread(g, p); -	} -	rcu_read_unlock(); -done: -	atomic_set(&core_state->nr_threads, nr); -	return nr; -} - -static int coredump_wait(int exit_code, struct core_state *core_state) -{ -	struct task_struct *tsk = current; -	struct mm_struct *mm = tsk->mm; -	int core_waiters = -EBUSY; - -	init_completion(&core_state->startup); -	core_state->dumper.task = tsk; -	core_state->dumper.next = NULL; - -	down_write(&mm->mmap_sem); -	if (!mm->core_state) -		core_waiters = zap_threads(tsk, mm, core_state, exit_code); -	up_write(&mm->mmap_sem); - -	if (core_waiters > 0) { -		struct core_thread *ptr; - -		wait_for_completion(&core_state->startup); -		/* -		 * Wait for all the threads to become inactive, so that -		 * all the thread context (extended register state, like -		 * fpu etc) gets copied to the memory. 
-		 */ -		ptr = core_state->dumper.next; -		while (ptr != NULL) { -			wait_task_inactive(ptr->task, 0); -			ptr = ptr->next; -		} -	} - -	return core_waiters; -} - -static void coredump_finish(struct mm_struct *mm) -{ -	struct core_thread *curr, *next; -	struct task_struct *task; - -	next = mm->core_state->dumper.next; -	while ((curr = next) != NULL) { -		next = curr->next; -		task = curr->task; -		/* -		 * see exit_mm(), curr->task must not see -		 * ->task == NULL before we read ->next. -		 */ -		smp_mb(); -		curr->task = NULL; -		wake_up_process(task); -	} - -	mm->core_state = NULL; -} -  /*   * set_dumpable converts traditional three-value dumpable to two flags and   * stores them into mm->flags.  It modifies lower two bits of mm->flags, but @@ -2020,7 +1633,7 @@ void set_dumpable(struct mm_struct *mm, int value)  	}  } -static int __get_dumpable(unsigned long mm_flags) +int __get_dumpable(unsigned long mm_flags)  {  	int ret; @@ -2032,290 +1645,3 @@ int get_dumpable(struct mm_struct *mm)  {  	return __get_dumpable(mm->flags);  } - -static void wait_for_dump_helpers(struct file *file) -{ -	struct pipe_inode_info *pipe; - -	pipe = file->f_path.dentry->d_inode->i_pipe; - -	pipe_lock(pipe); -	pipe->readers++; -	pipe->writers--; - -	while ((pipe->readers > 1) && (!signal_pending(current))) { -		wake_up_interruptible_sync(&pipe->wait); -		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); -		pipe_wait(pipe); -	} - -	pipe->readers--; -	pipe->writers++; -	pipe_unlock(pipe); - -} - - -/* - * umh_pipe_setup - * helper function to customize the process used - * to collect the core in userspace.  Specifically - * it sets up a pipe and installs it as fd 0 (stdin) - * for the process.  Returns 0 on success, or - * PTR_ERR on failure. - * Note that it also sets the core limit to 1.  
This - * is a special value that we use to trap recursive - * core dumps - */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) -{ -	struct file *files[2]; -	struct fdtable *fdt; -	struct coredump_params *cp = (struct coredump_params *)info->data; -	struct files_struct *cf = current->files; -	int err = create_pipe_files(files, 0); -	if (err) -		return err; - -	cp->file = files[1]; - -	sys_close(0); -	fd_install(0, files[0]); -	spin_lock(&cf->file_lock); -	fdt = files_fdtable(cf); -	__set_open_fd(0, fdt); -	__clear_close_on_exec(0, fdt); -	spin_unlock(&cf->file_lock); - -	/* and disallow core files too */ -	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - -	return 0; -} - -void do_coredump(long signr, int exit_code, struct pt_regs *regs) -{ -	struct core_state core_state; -	struct core_name cn; -	struct mm_struct *mm = current->mm; -	struct linux_binfmt * binfmt; -	const struct cred *old_cred; -	struct cred *cred; -	int retval = 0; -	int flag = 0; -	int ispipe; -	bool need_nonrelative = false; -	static atomic_t core_dump_count = ATOMIC_INIT(0); -	struct coredump_params cprm = { -		.signr = signr, -		.regs = regs, -		.limit = rlimit(RLIMIT_CORE), -		/* -		 * We must use the same mm->flags while dumping core to avoid -		 * inconsistency of bit flags, since this flag is not protected -		 * by any locks. -		 */ -		.mm_flags = mm->flags, -	}; - -	audit_core_dumps(signr); - -	binfmt = mm->binfmt; -	if (!binfmt || !binfmt->core_dump) -		goto fail; -	if (!__get_dumpable(cprm.mm_flags)) -		goto fail; - -	cred = prepare_creds(); -	if (!cred) -		goto fail; -	/* -	 * We cannot trust fsuid as being the "true" uid of the process -	 * nor do we know its entire history. We only know it was tainted -	 * so we dump it as root in mode 2, and only into a controlled -	 * environment (pipe handler or fully qualified path). 
-	 */ -	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { -		/* Setuid core dump mode */ -		flag = O_EXCL;		/* Stop rewrite attacks */ -		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */ -		need_nonrelative = true; -	} - -	retval = coredump_wait(exit_code, &core_state); -	if (retval < 0) -		goto fail_creds; - -	old_cred = override_creds(cred); - -	/* -	 * Clear any false indication of pending signals that might -	 * be seen by the filesystem code called to write the core file. -	 */ -	clear_thread_flag(TIF_SIGPENDING); - -	ispipe = format_corename(&cn, signr); - - 	if (ispipe) { -		int dump_count; -		char **helper_argv; - -		if (ispipe < 0) { -			printk(KERN_WARNING "format_corename failed\n"); -			printk(KERN_WARNING "Aborting core\n"); -			goto fail_corename; -		} - -		if (cprm.limit == 1) { -			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1. -			 * -			 * Normally core limits are irrelevant to pipes, since -			 * we're not writing to the file system, but we use -			 * cprm.limit of 1 here as a speacial value, this is a -			 * consistent way to catch recursive crashes. -			 * We can still crash if the core_pattern binary sets -			 * RLIM_CORE = !1, but it runs as root, and can do -			 * lots of stupid things. -			 * -			 * Note that we use task_tgid_vnr here to grab the pid -			 * of the process group leader.  That way we get the -			 * right pid if a thread in a multi-threaded -			 * core_pattern process dies. 
-			 */ -			printk(KERN_WARNING -				"Process %d(%s) has RLIMIT_CORE set to 1\n", -				task_tgid_vnr(current), current->comm); -			printk(KERN_WARNING "Aborting core\n"); -			goto fail_unlock; -		} -		cprm.limit = RLIM_INFINITY; - -		dump_count = atomic_inc_return(&core_dump_count); -		if (core_pipe_limit && (core_pipe_limit < dump_count)) { -			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", -			       task_tgid_vnr(current), current->comm); -			printk(KERN_WARNING "Skipping core dump\n"); -			goto fail_dropcount; -		} - -		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); -		if (!helper_argv) { -			printk(KERN_WARNING "%s failed to allocate memory\n", -			       __func__); -			goto fail_dropcount; -		} - -		retval = call_usermodehelper_fns(helper_argv[0], helper_argv, -					NULL, UMH_WAIT_EXEC, umh_pipe_setup, -					NULL, &cprm); -		argv_free(helper_argv); -		if (retval) { - 			printk(KERN_INFO "Core dump to %s pipe failed\n", -			       cn.corename); -			goto close_fail; - 		} -	} else { -		struct inode *inode; - -		if (cprm.limit < binfmt->min_coredump) -			goto fail_unlock; - -		if (need_nonrelative && cn.corename[0] != '/') { -			printk(KERN_WARNING "Pid %d(%s) can only dump core "\ -				"to fully qualified path!\n", -				task_tgid_vnr(current), current->comm); -			printk(KERN_WARNING "Skipping core dump\n"); -			goto fail_unlock; -		} - -		cprm.file = filp_open(cn.corename, -				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, -				 0600); -		if (IS_ERR(cprm.file)) -			goto fail_unlock; - -		inode = cprm.file->f_path.dentry->d_inode; -		if (inode->i_nlink > 1) -			goto close_fail; -		if (d_unhashed(cprm.file->f_path.dentry)) -			goto close_fail; -		/* -		 * AK: actually i see no reason to not allow this for named -		 * pipes etc, but keep the previous behaviour for now. -		 */ -		if (!S_ISREG(inode->i_mode)) -			goto close_fail; -		/* -		 * Dont allow local users get cute and trick others to coredump -		 * into their pre-created files. 
-		 */ -		if (!uid_eq(inode->i_uid, current_fsuid())) -			goto close_fail; -		if (!cprm.file->f_op || !cprm.file->f_op->write) -			goto close_fail; -		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) -			goto close_fail; -	} - -	retval = binfmt->core_dump(&cprm); -	if (retval) -		current->signal->group_exit_code |= 0x80; - -	if (ispipe && core_pipe_limit) -		wait_for_dump_helpers(cprm.file); -close_fail: -	if (cprm.file) -		filp_close(cprm.file, NULL); -fail_dropcount: -	if (ispipe) -		atomic_dec(&core_dump_count); -fail_unlock: -	kfree(cn.corename); -fail_corename: -	coredump_finish(mm); -	revert_creds(old_cred); -fail_creds: -	put_cred(cred); -fail: -	return; -} - -/* - * Core dumping helper functions.  These are the only things you should - * do on a core-file: use only these functions to write out all the - * necessary info. - */ -int dump_write(struct file *file, const void *addr, int nr) -{ -	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} -EXPORT_SYMBOL(dump_write); - -int dump_seek(struct file *file, loff_t off) -{ -	int ret = 1; - -	if (file->f_op->llseek && file->f_op->llseek != no_llseek) { -		if (file->f_op->llseek(file, off, SEEK_CUR) < 0) -			return 0; -	} else { -		char *buf = (char *)get_zeroed_page(GFP_KERNEL); - -		if (!buf) -			return 0; -		while (off > 0) { -			unsigned long n = off; - -			if (n > PAGE_SIZE) -				n = PAGE_SIZE; -			if (!dump_write(file, buf, n)) { -				ret = 0; -				break; -			} -			off -= n; -		} -		free_page((unsigned long)buf); -	} -	return ret; -} -EXPORT_SYMBOL(dump_seek); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index dde41a75c7c..59e3bbfac0b 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -206,6 +206,11 @@ static int init_inodecache(void)   */  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(exofs_inode_cachep);  } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index af74d9e27b7..6c205d0c565 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,6 +206,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ext2_inode_cachep);  } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 09b8455bd7e..bd29894c8fb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -532,6 +532,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ext3_inode_cachep);  } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7f7dad78760..5439d6a56e9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -233,7 +233,7 @@ group_extend_out:  	case EXT4_IOC_MOVE_EXT: {  		struct move_extent me; -		struct file *donor_filp; +		struct fd donor;  		int err;  		if (!(filp->f_mode & FMODE_READ) || @@ -245,11 +245,11 @@ group_extend_out:  			return -EFAULT;  		me.moved_len = 0; -		donor_filp = fget(me.donor_fd); -		if (!donor_filp) +		donor = fdget(me.donor_fd); +		if (!donor.file)  			return -EBADF; -		if (!(donor_filp->f_mode & FMODE_WRITE)) { +		if (!(donor.file->f_mode & FMODE_WRITE)) {  			err = -EBADF;  			goto mext_out;  		} @@ -258,14 +258,15 @@ group_extend_out:  			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {  			ext4_msg(sb, KERN_ERR,  				 "Online defrag not supported with bigalloc"); -			return -EOPNOTSUPP; +			err = -EOPNOTSUPP; +			goto mext_out;  		}  		err = mnt_want_write_file(filp);  		if (err)  			goto mext_out; -		err = ext4_move_extents(filp, donor_filp, me.orig_start, +		err = ext4_move_extents(filp, donor.file, me.orig_start,  					me.donor_start, me.len, &me.moved_len);  		mnt_drop_write_file(filp); @@ -273,7 
+274,7 @@ group_extend_out:  				 &me, sizeof(me)))  			err = -EFAULT;  mext_out: -		fput(donor_filp); +		fdput(donor);  		return err;  	} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1f15cc836fb..69c55d4e462 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1019,6 +1019,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ext4_inode_cachep);  } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 47d9eb0be88..4e5a6ac54eb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -521,6 +521,11 @@ static int __init fat_init_inodecache(void)  static void __exit fat_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(fat_inode_cachep);  } diff --git a/fs/fcntl.c b/fs/fcntl.c index 887b5ba8c9b..8f704291d4e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -26,124 +26,6 @@  #include <asm/siginfo.h>  #include <asm/uaccess.h> -void set_close_on_exec(unsigned int fd, int flag) -{ -	struct files_struct *files = current->files; -	struct fdtable *fdt; -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	if (flag) -		__set_close_on_exec(fd, fdt); -	else -		__clear_close_on_exec(fd, fdt); -	spin_unlock(&files->file_lock); -} - -static bool get_close_on_exec(unsigned int fd) -{ -	struct files_struct *files = current->files; -	struct fdtable *fdt; -	bool res; -	rcu_read_lock(); -	fdt = files_fdtable(files); -	res = close_on_exec(fd, fdt); -	rcu_read_unlock(); -	return res; -} - -SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) -{ -	int err = -EBADF; -	struct file * file, *tofree; -	struct files_struct * files = current->files; -	struct fdtable *fdt; - -	if ((flags & ~O_CLOEXEC) != 0) -		return -EINVAL; - -	if (unlikely(oldfd == newfd)) -		return -EINVAL; - -	
spin_lock(&files->file_lock); -	err = expand_files(files, newfd); -	file = fcheck(oldfd); -	if (unlikely(!file)) -		goto Ebadf; -	if (unlikely(err < 0)) { -		if (err == -EMFILE) -			goto Ebadf; -		goto out_unlock; -	} -	/* -	 * We need to detect attempts to do dup2() over allocated but still -	 * not finished descriptor.  NB: OpenBSD avoids that at the price of -	 * extra work in their equivalent of fget() - they insert struct -	 * file immediately after grabbing descriptor, mark it larval if -	 * more work (e.g. actual opening) is needed and make sure that -	 * fget() treats larval files as absent.  Potentially interesting, -	 * but while extra work in fget() is trivial, locking implications -	 * and amount of surgery on open()-related paths in VFS are not. -	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" -	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of -	 * scope of POSIX or SUS, since neither considers shared descriptor -	 * tables and this condition does not arise without those. 
-	 */ -	err = -EBUSY; -	fdt = files_fdtable(files); -	tofree = fdt->fd[newfd]; -	if (!tofree && fd_is_open(newfd, fdt)) -		goto out_unlock; -	get_file(file); -	rcu_assign_pointer(fdt->fd[newfd], file); -	__set_open_fd(newfd, fdt); -	if (flags & O_CLOEXEC) -		__set_close_on_exec(newfd, fdt); -	else -		__clear_close_on_exec(newfd, fdt); -	spin_unlock(&files->file_lock); - -	if (tofree) -		filp_close(tofree, files); - -	return newfd; - -Ebadf: -	err = -EBADF; -out_unlock: -	spin_unlock(&files->file_lock); -	return err; -} - -SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) -{ -	if (unlikely(newfd == oldfd)) { /* corner case */ -		struct files_struct *files = current->files; -		int retval = oldfd; - -		rcu_read_lock(); -		if (!fcheck_files(files, oldfd)) -			retval = -EBADF; -		rcu_read_unlock(); -		return retval; -	} -	return sys_dup3(oldfd, newfd, 0); -} - -SYSCALL_DEFINE1(dup, unsigned int, fildes) -{ -	int ret = -EBADF; -	struct file *file = fget_raw(fildes); - -	if (file) { -		ret = get_unused_fd(); -		if (ret >= 0) -			fd_install(ret, file); -		else -			fput(file); -	} -	return ret; -} -  #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)  static int setfl(int fd, struct file * filp, unsigned long arg) @@ -267,7 +149,7 @@ pid_t f_getown(struct file *filp)  static int f_setown_ex(struct file *filp, unsigned long arg)  { -	struct f_owner_ex * __user owner_p = (void * __user)arg; +	struct f_owner_ex __user *owner_p = (void __user *)arg;  	struct f_owner_ex owner;  	struct pid *pid;  	int type; @@ -307,7 +189,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)  static int f_getown_ex(struct file *filp, unsigned long arg)  { -	struct f_owner_ex * __user owner_p = (void * __user)arg; +	struct f_owner_ex __user *owner_p = (void __user *)arg;  	struct f_owner_ex owner;  	int ret = 0; @@ -345,7 +227,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)  static int f_getowner_uids(struct file *filp, 
unsigned long arg)  {  	struct user_namespace *user_ns = current_user_ns(); -	uid_t * __user dst = (void * __user)arg; +	uid_t __user *dst = (void __user *)arg;  	uid_t src[2];  	int err; @@ -373,14 +255,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,  	switch (cmd) {  	case F_DUPFD: +		err = f_dupfd(arg, filp, 0); +		break;  	case F_DUPFD_CLOEXEC: -		if (arg >= rlimit(RLIMIT_NOFILE)) -			break; -		err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); -		if (err >= 0) { -			get_file(filp); -			fd_install(err, filp); -		} +		err = f_dupfd(arg, filp, FD_CLOEXEC);  		break;  	case F_GETFD:  		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; @@ -470,25 +348,23 @@ static int check_fcntl_cmd(unsigned cmd)  SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)  {	 -	struct file *filp; -	int fput_needed; +	struct fd f = fdget_raw(fd);  	long err = -EBADF; -	filp = fget_raw_light(fd, &fput_needed); -	if (!filp) +	if (!f.file)  		goto out; -	if (unlikely(filp->f_mode & FMODE_PATH)) { +	if (unlikely(f.file->f_mode & FMODE_PATH)) {  		if (!check_fcntl_cmd(cmd))  			goto out1;  	} -	err = security_file_fcntl(filp, cmd, arg); +	err = security_file_fcntl(f.file, cmd, arg);  	if (!err) -		err = do_fcntl(fd, cmd, arg, filp); +		err = do_fcntl(fd, cmd, arg, f.file);  out1: - 	fput_light(filp, fput_needed); + 	fdput(f);  out:  	return err;  } @@ -497,38 +373,36 @@ out:  SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,  		unsigned long, arg)  {	 -	struct file * filp; +	struct fd f = fdget_raw(fd);  	long err = -EBADF; -	int fput_needed; -	filp = fget_raw_light(fd, &fput_needed); -	if (!filp) +	if (!f.file)  		goto out; -	if (unlikely(filp->f_mode & FMODE_PATH)) { +	if (unlikely(f.file->f_mode & FMODE_PATH)) {  		if (!check_fcntl_cmd(cmd))  			goto out1;  	} -	err = security_file_fcntl(filp, cmd, arg); +	err = security_file_fcntl(f.file, cmd, arg);  	if (err)  		goto out1;  	switch (cmd) {  		case F_GETLK64: -			err = 
fcntl_getlk64(filp, (struct flock64 __user *) arg); +			err = fcntl_getlk64(f.file, (struct flock64 __user *) arg);  			break;  		case F_SETLK64:  		case F_SETLKW64: -			err = fcntl_setlk64(fd, filp, cmd, +			err = fcntl_setlk64(fd, f.file, cmd,  					(struct flock64 __user *) arg);  			break;  		default: -			err = do_fcntl(fd, cmd, arg, filp); +			err = do_fcntl(fd, cmd, arg, f.file);  			break;  	}  out1: -	fput_light(filp, fput_needed); +	fdput(f);  out:  	return err;  } diff --git a/fs/fhandle.c b/fs/fhandle.c index a48e4a139be..f775bfdd6e4 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -113,24 +113,21 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,  static struct vfsmount *get_vfsmount_from_fd(int fd)  { -	struct path path; +	struct vfsmount *mnt;  	if (fd == AT_FDCWD) {  		struct fs_struct *fs = current->fs;  		spin_lock(&fs->lock); -		path = fs->pwd; -		mntget(path.mnt); +		mnt = mntget(fs->pwd.mnt);  		spin_unlock(&fs->lock);  	} else { -		int fput_needed; -		struct file *file = fget_light(fd, &fput_needed); -		if (!file) +		struct fd f = fdget(fd); +		if (!f.file)  			return ERR_PTR(-EBADF); -		path = file->f_path; -		mntget(path.mnt); -		fput_light(file, fput_needed); +		mnt = mntget(f.file->f_path.mnt); +		fdput(f);  	} -	return path.mnt; +	return mnt;  }  static int vfs_dentry_acceptable(void *context, struct dentry *dentry) diff --git a/fs/file.c b/fs/file.c index ba3f6053025..0f1bda4bebf 100644 --- a/fs/file.c +++ b/fs/file.c @@ -6,6 +6,7 @@   *  Manage the dynamic fd arrays in the process files_struct.   
*/ +#include <linux/syscalls.h>  #include <linux/export.h>  #include <linux/fs.h>  #include <linux/mm.h> @@ -84,22 +85,14 @@ static void free_fdtable_work(struct work_struct *work)  	}  } -void free_fdtable_rcu(struct rcu_head *rcu) +static void free_fdtable_rcu(struct rcu_head *rcu)  {  	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);  	struct fdtable_defer *fddef;  	BUG_ON(!fdt); +	BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); -	if (fdt->max_fds <= NR_OPEN_DEFAULT) { -		/* -		 * This fdtable is embedded in the files structure and that -		 * structure itself is getting destroyed. -		 */ -		kmem_cache_free(files_cachep, -				container_of(fdt, struct files_struct, fdtab)); -		return; -	}  	if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {  		kfree(fdt->fd);  		kfree(fdt->open_fds); @@ -229,7 +222,7 @@ static int expand_fdtable(struct files_struct *files, int nr)  		copy_fdtable(new_fdt, cur_fdt);  		rcu_assign_pointer(files->fdt, new_fdt);  		if (cur_fdt->max_fds > NR_OPEN_DEFAULT) -			free_fdtable(cur_fdt); +			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);  	} else {  		/* Somebody else expanded, so undo our attempt */  		__free_fdtable(new_fdt); @@ -245,19 +238,12 @@ static int expand_fdtable(struct files_struct *files, int nr)   * expanded and execution may have blocked.   * The files->file_lock should be held on entry, and will be held on exit.   */ -int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, int nr)  {  	struct fdtable *fdt;  	fdt = files_fdtable(files); -	/* -	 * N.B. For clone tasks sharing a files structure, this test -	 * will limit the total number of files that can be opened. -	 */ -	if (nr >= rlimit(RLIMIT_NOFILE)) -		return -EMFILE; -  	/* Do we need to expand? 
*/  	if (nr < fdt->max_fds)  		return 0; @@ -270,6 +256,26 @@ int expand_files(struct files_struct *files, int nr)  	return expand_fdtable(files, nr);  } +static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +{ +	__set_bit(fd, fdt->close_on_exec); +} + +static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +{ +	__clear_bit(fd, fdt->close_on_exec); +} + +static inline void __set_open_fd(int fd, struct fdtable *fdt) +{ +	__set_bit(fd, fdt->open_fds); +} + +static inline void __clear_open_fd(int fd, struct fdtable *fdt) +{ +	__clear_bit(fd, fdt->open_fds); +} +  static int count_open_files(struct fdtable *fdt)  {  	int size = fdt->max_fds; @@ -395,6 +401,95 @@ out:  	return NULL;  } +static void close_files(struct files_struct * files) +{ +	int i, j; +	struct fdtable *fdt; + +	j = 0; + +	/* +	 * It is safe to dereference the fd table without RCU or +	 * ->file_lock because this is the last reference to the +	 * files structure.  But use RCU to shut RCU-lockdep up. 
+	 */ +	rcu_read_lock(); +	fdt = files_fdtable(files); +	rcu_read_unlock(); +	for (;;) { +		unsigned long set; +		i = j * BITS_PER_LONG; +		if (i >= fdt->max_fds) +			break; +		set = fdt->open_fds[j++]; +		while (set) { +			if (set & 1) { +				struct file * file = xchg(&fdt->fd[i], NULL); +				if (file) { +					filp_close(file, files); +					cond_resched(); +				} +			} +			i++; +			set >>= 1; +		} +	} +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ +	struct files_struct *files; + +	task_lock(task); +	files = task->files; +	if (files) +		atomic_inc(&files->count); +	task_unlock(task); + +	return files; +} + +void put_files_struct(struct files_struct *files) +{ +	struct fdtable *fdt; + +	if (atomic_dec_and_test(&files->count)) { +		close_files(files); +		/* not really needed, since nobody can see us */ +		rcu_read_lock(); +		fdt = files_fdtable(files); +		rcu_read_unlock(); +		/* free the arrays if they are not embedded */ +		if (fdt != &files->fdtab) +			__free_fdtable(fdt); +		kmem_cache_free(files_cachep, files); +	} +} + +void reset_files_struct(struct files_struct *files) +{ +	struct task_struct *tsk = current; +	struct files_struct *old; + +	old = tsk->files; +	task_lock(tsk); +	tsk->files = files; +	task_unlock(tsk); +	put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ +	struct files_struct * files = tsk->files; + +	if (files) { +		task_lock(tsk); +		tsk->files = NULL; +		task_unlock(tsk); +		put_files_struct(files); +	} +} +  static void __devinit fdtable_defer_list_init(int cpu)  {  	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); @@ -424,12 +519,18 @@ struct files_struct init_files = {  	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),  }; +void daemonize_descriptors(void) +{ +	atomic_inc(&init_files.count); +	reset_files_struct(&init_files); +} +  /*   * allocate a file descriptor, mark it busy.   
*/ -int alloc_fd(unsigned start, unsigned flags) +int __alloc_fd(struct files_struct *files, +	       unsigned start, unsigned end, unsigned flags)  { -	struct files_struct *files = current->files;  	unsigned int fd;  	int error;  	struct fdtable *fdt; @@ -444,6 +545,14 @@ repeat:  	if (fd < fdt->max_fds)  		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); +	/* +	 * N.B. For clone tasks sharing a files structure, this test +	 * will limit the total number of files that can be opened. +	 */ +	error = -EMFILE; +	if (fd >= end) +		goto out; +  	error = expand_files(files, fd);  	if (error < 0)  		goto out; @@ -477,8 +586,424 @@ out:  	return error;  } -int get_unused_fd(void) +static int alloc_fd(unsigned start, unsigned flags) +{ +	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); +} + +int get_unused_fd_flags(unsigned flags) +{ +	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); +} +EXPORT_SYMBOL(get_unused_fd_flags); + +static void __put_unused_fd(struct files_struct *files, unsigned int fd) +{ +	struct fdtable *fdt = files_fdtable(files); +	__clear_open_fd(fd, fdt); +	if (fd < files->next_fd) +		files->next_fd = fd; +} + +void put_unused_fd(unsigned int fd) +{ +	struct files_struct *files = current->files; +	spin_lock(&files->file_lock); +	__put_unused_fd(files, fd); +	spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(put_unused_fd); + +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array.  At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us.  We need to detect this and + * fput() the struct file we are about to overwrite in this case. + * + * It should never happen - if we allow dup2() do it, _really_ bad things + * will follow. 
+ * + * NOTE: __fd_install() variant is really, really low-level; don't + * use it unless you are forced to by truly lousy API shoved down + * your throat.  'files' *MUST* be either current->files or obtained + * by get_files_struct(current) done by whoever had given it to you, + * or really bad things will happen.  Normally you want to use + * fd_install() instead. + */ + +void __fd_install(struct files_struct *files, unsigned int fd, +		struct file *file) +{ +	struct fdtable *fdt; +	spin_lock(&files->file_lock); +	fdt = files_fdtable(files); +	BUG_ON(fdt->fd[fd] != NULL); +	rcu_assign_pointer(fdt->fd[fd], file); +	spin_unlock(&files->file_lock); +} + +void fd_install(unsigned int fd, struct file *file) +{ +	__fd_install(current->files, fd, file); +} + +EXPORT_SYMBOL(fd_install); + +/* + * The same warnings as for __alloc_fd()/__fd_install() apply here... + */ +int __close_fd(struct files_struct *files, unsigned fd) +{ +	struct file *file; +	struct fdtable *fdt; + +	spin_lock(&files->file_lock); +	fdt = files_fdtable(files); +	if (fd >= fdt->max_fds) +		goto out_unlock; +	file = fdt->fd[fd]; +	if (!file) +		goto out_unlock; +	rcu_assign_pointer(fdt->fd[fd], NULL); +	__clear_close_on_exec(fd, fdt); +	__put_unused_fd(files, fd); +	spin_unlock(&files->file_lock); +	return filp_close(file, files); + +out_unlock: +	spin_unlock(&files->file_lock); +	return -EBADF; +} + +void do_close_on_exec(struct files_struct *files) +{ +	unsigned i; +	struct fdtable *fdt; + +	/* exec unshares first */ +	BUG_ON(atomic_read(&files->count) != 1); +	spin_lock(&files->file_lock); +	for (i = 0; ; i++) { +		unsigned long set; +		unsigned fd = i * BITS_PER_LONG; +		fdt = files_fdtable(files); +		if (fd >= fdt->max_fds) +			break; +		set = fdt->close_on_exec[i]; +		if (!set) +			continue; +		fdt->close_on_exec[i] = 0; +		for ( ; set ; fd++, set >>= 1) { +			struct file *file; +			if (!(set & 1)) +				continue; +			file = fdt->fd[fd]; +			if (!file) +				continue; +			
rcu_assign_pointer(fdt->fd[fd], NULL); +			__put_unused_fd(files, fd); +			spin_unlock(&files->file_lock); +			filp_close(file, files); +			cond_resched(); +			spin_lock(&files->file_lock); +		} + +	} +	spin_unlock(&files->file_lock); +} + +struct file *fget(unsigned int fd) +{ +	struct file *file; +	struct files_struct *files = current->files; + +	rcu_read_lock(); +	file = fcheck_files(files, fd); +	if (file) { +		/* File object ref couldn't be taken */ +		if (file->f_mode & FMODE_PATH || +		    !atomic_long_inc_not_zero(&file->f_count)) +			file = NULL; +	} +	rcu_read_unlock(); + +	return file; +} + +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ +	struct file *file; +	struct files_struct *files = current->files; + +	rcu_read_lock(); +	file = fcheck_files(files, fd); +	if (file) { +		/* File object ref couldn't be taken */ +		if (!atomic_long_inc_not_zero(&file->f_count)) +			file = NULL; +	} +	rcu_read_unlock(); + +	return file; +} + +EXPORT_SYMBOL(fget_raw); + +/* + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + *    to userspace (i.e. you cannot remember the returned struct file * after + *    returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + *    calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + *    and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. 
+ */ +struct file *fget_light(unsigned int fd, int *fput_needed) +{ +	struct file *file; +	struct files_struct *files = current->files; + +	*fput_needed = 0; +	if (atomic_read(&files->count) == 1) { +		file = fcheck_files(files, fd); +		if (file && (file->f_mode & FMODE_PATH)) +			file = NULL; +	} else { +		rcu_read_lock(); +		file = fcheck_files(files, fd); +		if (file) { +			if (!(file->f_mode & FMODE_PATH) && +			    atomic_long_inc_not_zero(&file->f_count)) +				*fput_needed = 1; +			else +				/* Didn't get the reference, someone's freed */ +				file = NULL; +		} +		rcu_read_unlock(); +	} + +	return file; +} +EXPORT_SYMBOL(fget_light); + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ +	struct file *file; +	struct files_struct *files = current->files; + +	*fput_needed = 0; +	if (atomic_read(&files->count) == 1) { +		file = fcheck_files(files, fd); +	} else { +		rcu_read_lock(); +		file = fcheck_files(files, fd); +		if (file) { +			if (atomic_long_inc_not_zero(&file->f_count)) +				*fput_needed = 1; +			else +				/* Didn't get the reference, someone's freed */ +				file = NULL; +		} +		rcu_read_unlock(); +	} + +	return file; +} + +void set_close_on_exec(unsigned int fd, int flag) +{ +	struct files_struct *files = current->files; +	struct fdtable *fdt; +	spin_lock(&files->file_lock); +	fdt = files_fdtable(files); +	if (flag) +		__set_close_on_exec(fd, fdt); +	else +		__clear_close_on_exec(fd, fdt); +	spin_unlock(&files->file_lock); +} + +bool get_close_on_exec(unsigned int fd) +{ +	struct files_struct *files = current->files; +	struct fdtable *fdt; +	bool res; +	rcu_read_lock(); +	fdt = files_fdtable(files); +	res = close_on_exec(fd, fdt); +	rcu_read_unlock(); +	return res; +} + +static int do_dup2(struct files_struct *files, +	struct file *file, unsigned fd, unsigned flags) +{ +	struct file *tofree; +	struct fdtable *fdt; + +	/* +	 * We need to detect attempts to do dup2() over allocated but still +	 * not finished descriptor.  
NB: OpenBSD avoids that at the price of +	 * extra work in their equivalent of fget() - they insert struct +	 * file immediately after grabbing descriptor, mark it larval if +	 * more work (e.g. actual opening) is needed and make sure that +	 * fget() treats larval files as absent.  Potentially interesting, +	 * but while extra work in fget() is trivial, locking implications +	 * and amount of surgery on open()-related paths in VFS are not. +	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" +	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of +	 * scope of POSIX or SUS, since neither considers shared descriptor +	 * tables and this condition does not arise without those. +	 */ +	fdt = files_fdtable(files); +	tofree = fdt->fd[fd]; +	if (!tofree && fd_is_open(fd, fdt)) +		goto Ebusy; +	get_file(file); +	rcu_assign_pointer(fdt->fd[fd], file); +	__set_open_fd(fd, fdt); +	if (flags & O_CLOEXEC) +		__set_close_on_exec(fd, fdt); +	else +		__clear_close_on_exec(fd, fdt); +	spin_unlock(&files->file_lock); + +	if (tofree) +		filp_close(tofree, files); + +	return fd; + +Ebusy: +	spin_unlock(&files->file_lock); +	return -EBUSY; +} + +int replace_fd(unsigned fd, struct file *file, unsigned flags)  { -	return alloc_fd(0, 0); +	int err; +	struct files_struct *files = current->files; + +	if (!file) +		return __close_fd(files, fd); + +	if (fd >= rlimit(RLIMIT_NOFILE)) +		return -EMFILE; + +	spin_lock(&files->file_lock); +	err = expand_files(files, fd); +	if (unlikely(err < 0)) +		goto out_unlock; +	return do_dup2(files, file, fd, flags); + +out_unlock: +	spin_unlock(&files->file_lock); +	return err; +} + +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ +	int err = -EBADF; +	struct file *file; +	struct files_struct *files = current->files; + +	if ((flags & ~O_CLOEXEC) != 0) +		return -EINVAL; + +	if (newfd >= rlimit(RLIMIT_NOFILE)) +		return -EMFILE; + +	spin_lock(&files->file_lock); +	err = expand_files(files, 
newfd); +	file = fcheck(oldfd); +	if (unlikely(!file)) +		goto Ebadf; +	if (unlikely(err < 0)) { +		if (err == -EMFILE) +			goto Ebadf; +		goto out_unlock; +	} +	return do_dup2(files, file, newfd, flags); + +Ebadf: +	err = -EBADF; +out_unlock: +	spin_unlock(&files->file_lock); +	return err; +} + +SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) +{ +	if (unlikely(newfd == oldfd)) { /* corner case */ +		struct files_struct *files = current->files; +		int retval = oldfd; + +		rcu_read_lock(); +		if (!fcheck_files(files, oldfd)) +			retval = -EBADF; +		rcu_read_unlock(); +		return retval; +	} +	return sys_dup3(oldfd, newfd, 0); +} + +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ +	int ret = -EBADF; +	struct file *file = fget_raw(fildes); + +	if (file) { +		ret = get_unused_fd(); +		if (ret >= 0) +			fd_install(ret, file); +		else +			fput(file); +	} +	return ret; +} + +int f_dupfd(unsigned int from, struct file *file, unsigned flags) +{ +	int err; +	if (from >= rlimit(RLIMIT_NOFILE)) +		return -EINVAL; +	err = alloc_fd(from, flags); +	if (err >= 0) { +		get_file(file); +		fd_install(err, file); +	} +	return err; +} + +int iterate_fd(struct files_struct *files, unsigned n, +		int (*f)(const void *, struct file *, unsigned), +		const void *p) +{ +	struct fdtable *fdt; +	struct file *file; +	int res = 0; +	if (!files) +		return 0; +	spin_lock(&files->file_lock); +	fdt = files_fdtable(files); +	while (!res && n < fdt->max_fds) { +		file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); +		if (file) +			res = f(p, file, n); +	} +	spin_unlock(&files->file_lock); +	return res;  } -EXPORT_SYMBOL(get_unused_fd); +EXPORT_SYMBOL(iterate_fd); diff --git a/fs/file_table.c b/fs/file_table.c index 701985e4ccd..c6780163bf3 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -339,112 +339,6 @@ void __fput_sync(struct file *file)  EXPORT_SYMBOL(fput); -struct file *fget(unsigned int fd) -{ -	struct file *file; -	struct files_struct *files = current->files; - -	
rcu_read_lock(); -	file = fcheck_files(files, fd); -	if (file) { -		/* File object ref couldn't be taken */ -		if (file->f_mode & FMODE_PATH || -		    !atomic_long_inc_not_zero(&file->f_count)) -			file = NULL; -	} -	rcu_read_unlock(); - -	return file; -} - -EXPORT_SYMBOL(fget); - -struct file *fget_raw(unsigned int fd) -{ -	struct file *file; -	struct files_struct *files = current->files; - -	rcu_read_lock(); -	file = fcheck_files(files, fd); -	if (file) { -		/* File object ref couldn't be taken */ -		if (!atomic_long_inc_not_zero(&file->f_count)) -			file = NULL; -	} -	rcu_read_unlock(); - -	return file; -} - -EXPORT_SYMBOL(fget_raw); - -/* - * Lightweight file lookup - no refcnt increment if fd table isn't shared. - * - * You can use this instead of fget if you satisfy all of the following - * conditions: - * 1) You must call fput_light before exiting the syscall and returning control - *    to userspace (i.e. you cannot remember the returned struct file * after - *    returning to userspace). - * 2) You must not call filp_close on the returned struct file * in between - *    calls to fget_light and fput_light. - * 3) You must not clone the current task in between the calls to fget_light - *    and fput_light. - * - * The fput_needed flag returned by fget_light should be passed to the - * corresponding fput_light. 
- */ -struct file *fget_light(unsigned int fd, int *fput_needed) -{ -	struct file *file; -	struct files_struct *files = current->files; - -	*fput_needed = 0; -	if (atomic_read(&files->count) == 1) { -		file = fcheck_files(files, fd); -		if (file && (file->f_mode & FMODE_PATH)) -			file = NULL; -	} else { -		rcu_read_lock(); -		file = fcheck_files(files, fd); -		if (file) { -			if (!(file->f_mode & FMODE_PATH) && -			    atomic_long_inc_not_zero(&file->f_count)) -				*fput_needed = 1; -			else -				/* Didn't get the reference, someone's freed */ -				file = NULL; -		} -		rcu_read_unlock(); -	} - -	return file; -} - -struct file *fget_raw_light(unsigned int fd, int *fput_needed) -{ -	struct file *file; -	struct files_struct *files = current->files; - -	*fput_needed = 0; -	if (atomic_read(&files->count) == 1) { -		file = fcheck_files(files, fd); -	} else { -		rcu_read_lock(); -		file = fcheck_files(files, fd); -		if (file) { -			if (atomic_long_inc_not_zero(&file->f_count)) -				*fput_needed = 1; -			else -				/* Didn't get the reference, someone's freed */ -				file = NULL; -		} -		rcu_read_unlock(); -	} - -	return file; -} -  void put_filp(struct file *file)  {  	if (atomic_long_dec_and_test(&file->f_count)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index d4fabd26084..fed2c8afb3a 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -279,6 +279,11 @@ static void __exit  vxfs_cleanup(void)  {  	unregister_filesystem(&vxfs_fs_type); +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(vxfs_inode_cachep);  } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f4246cfc8d8..8c23fa7a91e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -148,8 +148,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,  		if (ff->reserved_req) {  			req = ff->reserved_req;  			ff->reserved_req = NULL; -			get_file(file); -			req->stolen_file = file; +			req->stolen_file = get_file(file);  		}  		spin_unlock(&fc->lock);  	} while (!req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fca222dabe3..f0eda124cff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1197,6 +1197,12 @@ static void fuse_fs_cleanup(void)  {  	unregister_filesystem(&fuse_fs_type);  	unregister_fuseblk(); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(fuse_inode_cachep);  } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 0b63d135a09..e93ddaadfd1 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -492,6 +492,12 @@ static int __init init_hfs_fs(void)  static void __exit exit_hfs_fs(void)  {  	unregister_filesystem(&hfs_fs_type); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(hfs_inode_cachep);  } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index fdafb2d7165..811a84d2d96 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -635,6 +635,12 @@ static int __init init_hfsplus_fs(void)  static void __exit exit_hfsplus_fs(void)  {  	unregister_filesystem(&hfsplus_fs_type); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(hfsplus_inode_cachep);  } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a152783602d..bc28bf077a6 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -210,6 +210,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(hpfs_inode_cachep);  } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6e572c4fbf6..9460120a517 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1048,6 +1048,11 @@ static int __init init_hugetlbfs_fs(void)  static void __exit exit_hugetlbfs_fs(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(hugetlbfs_inode_cachep);  	kern_unmount(hugetlbfs_vfsmount);  	unregister_filesystem(&hugetlbfs_fs_type); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29167bebe87..3bdad6d1f26 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -603,21 +603,14 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)  { -	struct file *filp; -	int error = -EBADF; -	int fput_needed; - -	filp = fget_light(fd, &fput_needed); -	if (!filp) -		goto out; - -	error = security_file_ioctl(filp, cmd, arg); -	if (error) -		goto out_fput; +	int error; +	struct fd f = fdget(fd); -	error = do_vfs_ioctl(filp, fd, cmd, arg); - out_fput: -	fput_light(filp, fput_needed); - out: +	if (!f.file) +		return -EBADF; +	error = security_file_ioctl(f.file, cmd, arg); +	if (!error) +		error = do_vfs_ioctl(f.file, fd, cmd, arg); +	fdput(f);  	return error;  } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a7d8e6cc5e0..67ce52507d7 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -115,6 +115,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make 
sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(isofs_inode_cachep);  } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 61ea41389f9..ff487954cd9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -418,6 +418,12 @@ static void __exit exit_jffs2_fs(void)  	unregister_filesystem(&jffs2_fs_type);  	jffs2_destroy_slab_caches();  	jffs2_compressors_exit(); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(jffs2_inode_cachep);  } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 706692f2403..efdf8835dfc 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -911,6 +911,12 @@ static void __exit exit_jfs_fs(void)  	jfs_proc_clean();  #endif  	unregister_filesystem(&jfs_fs_type); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(jfs_inode_cachep);  } diff --git a/fs/locks.c b/fs/locks.c index 7e81bfc7516..abc7dc6c490 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1625,15 +1625,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);   */  SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)  { -	struct file *filp; -	int fput_needed; +	struct fd f = fdget(fd);  	struct file_lock *lock;  	int can_sleep, unlock;  	int error;  	error = -EBADF; -	filp = fget_light(fd, &fput_needed); -	if (!filp) +	if (!f.file)  		goto out;  	can_sleep = !(cmd & LOCK_NB); @@ -1641,31 +1639,31 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)  	unlock = (cmd == LOCK_UN);  	if (!unlock && !(cmd & LOCK_MAND) && -	    !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) +	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))  		goto out_putf; -	error = flock_make_lock(filp, &lock, cmd); +	error = flock_make_lock(f.file, &lock, cmd);  	if (error)  		goto out_putf;  	if (can_sleep)  		lock->fl_flags |= FL_SLEEP; -	error = security_file_lock(filp, lock->fl_type); +	
error = security_file_lock(f.file, lock->fl_type);  	if (error)  		goto out_free; -	if (filp->f_op && filp->f_op->flock) -		error = filp->f_op->flock(filp, +	if (f.file->f_op && f.file->f_op->flock) +		error = f.file->f_op->flock(f.file,  					  (can_sleep) ? F_SETLKW : F_SETLK,  					  lock);  	else -		error = flock_lock_file_wait(filp, lock); +		error = flock_lock_file_wait(f.file, lock);   out_free:  	locks_free_lock(lock);   out_putf: -	fput_light(filp, fput_needed); +	fdput(f);   out:  	return error;  } diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index bda39085309..adb90116d36 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -417,5 +417,10 @@ int logfs_init_inode_cache(void)  void logfs_destroy_inode_cache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(logfs_inode_cache);  } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d0e42c67892..4fc5f8ab1c4 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -100,6 +100,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(minix_inode_cachep);  } diff --git a/fs/namei.c b/fs/namei.c index a856e7f7b6e..aa30d19e9ed 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1797,8 +1797,6 @@ static int path_init(int dfd, const char *name, unsigned int flags,  		     struct nameidata *nd, struct file **fp)  {  	int retval = 0; -	int fput_needed; -	struct file *file;  	nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/  	nd->flags = flags | LOOKUP_JUMPED; @@ -1850,44 +1848,41 @@ static int path_init(int dfd, const char *name, unsigned int flags,  			get_fs_pwd(current->fs, &nd->path);  		}  	} else { +		struct fd f = fdget_raw(dfd);  		struct dentry *dentry; -		file = fget_raw_light(dfd, &fput_needed); -		retval = -EBADF; -		if (!file) -			goto out_fail; +		if (!f.file) +			return -EBADF; -		dentry = file->f_path.dentry; +		dentry = f.file->f_path.dentry;  		if (*name) { -			retval = -ENOTDIR; -			if (!S_ISDIR(dentry->d_inode->i_mode)) -				goto fput_fail; +			if (!S_ISDIR(dentry->d_inode->i_mode)) { +				fdput(f); +				return -ENOTDIR; +			}  			retval = inode_permission(dentry->d_inode, MAY_EXEC); -			if (retval) -				goto fput_fail; +			if (retval) { +				fdput(f); +				return retval; +			}  		} -		nd->path = file->f_path; +		nd->path = f.file->f_path;  		if (flags & LOOKUP_RCU) { -			if (fput_needed) -				*fp = file; +			if (f.need_put) +				*fp = f.file;  			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);  			lock_rcu_walk();  		} else { -			path_get(&file->f_path); -			fput_light(file, fput_needed); +			path_get(&nd->path); +			fdput(f);  		}  	}  	nd->inode = nd->path.dentry->d_inode;  	return 0; - -fput_fail: -	fput_light(file, fput_needed); -out_fail: -	return retval;  }  static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -3971,7 +3966,7 @@ EXPORT_SYMBOL(user_path_at);  EXPORT_SYMBOL(follow_down_one);  EXPORT_SYMBOL(follow_down);  EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ +EXPORT_SYMBOL(get_write_access); /* nfsd */  EXPORT_SYMBOL(getname);  EXPORT_SYMBOL(lock_rename);  EXPORT_SYMBOL(lookup_one_len); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index eaa74323663..d7e9fe77188 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -89,6 +89,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * 
destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ncp_inode_cachep);  } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9b47610338f..e4c716d374a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1571,6 +1571,11 @@ static int __init nfs_init_inodecache(void)  static void nfs_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(nfs_inode_cachep);  } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cc894eda385..48a1bad3733 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2837,8 +2837,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)  		return -ENOMEM;  	}  	fp->fi_lease = fl; -	fp->fi_deleg_file = fl->fl_file; -	get_file(fp->fi_deleg_file); +	fp->fi_deleg_file = get_file(fl->fl_file);  	atomic_set(&fp->fi_delegees, 1);  	list_add(&dp->dl_perfile, &fp->fi_delegations);  	return 0; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6a10812711c..3c991dc84f2 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1382,6 +1382,12 @@ static void nilfs_segbuf_init_once(void *obj)  static void nilfs_destroy_cachep(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier(); +  	if (nilfs_inode_cachep)  		kmem_cache_destroy(nilfs_inode_cachep);  	if (nilfs_transaction_cachep) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d4380366973..721d692fa8d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -58,7 +58,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,  	return fsnotify_remove_notify_event(group);  } -static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) +static int create_fd(struct fsnotify_group *group, +			struct fsnotify_event *event, +			struct file **file)  {  	int client_fd;  	struct file *new_file; @@ -98,7 +100,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)  		put_unused_fd(client_fd);  		client_fd = PTR_ERR(new_file);  	} else { -		fd_install(client_fd, new_file); +		*file = new_file;  	}  	return client_fd; @@ -106,13 +108,15 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)  static int fill_event_metadata(struct fsnotify_group *group,  				   struct fanotify_event_metadata *metadata, -				   struct fsnotify_event *event) +				   struct fsnotify_event *event, +				   struct file **file)  {  	int ret = 0;  	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,  		 group, metadata, event); +	*file = NULL;  	metadata->event_len = FAN_EVENT_METADATA_LEN;  	metadata->metadata_len = FAN_EVENT_METADATA_LEN;  	metadata->vers = FANOTIFY_METADATA_VERSION; @@ -121,7 +125,7 @@ static int fill_event_metadata(struct fsnotify_group *group,  	if (unlikely(event->mask & FAN_Q_OVERFLOW))  		metadata->fd = FAN_NOFD;  	else { -		metadata->fd = create_fd(group, event); +		metadata->fd = create_fd(group, event, file);  		if (metadata->fd < 0)  			ret = metadata->fd;  	} @@ -220,25 +224,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,  	return 0;  } -static void 
remove_access_response(struct fsnotify_group *group, -				   struct fsnotify_event *event, -				   __s32 fd) -{ -	struct fanotify_response_event *re; - -	if (!(event->mask & FAN_ALL_PERM_EVENTS)) -		return; - -	re = dequeue_re(group, fd); -	if (!re) -		return; - -	BUG_ON(re->event != event); - -	kmem_cache_free(fanotify_response_event_cache, re); - -	return; -}  #else  static int prepare_for_access_response(struct fsnotify_group *group,  				       struct fsnotify_event *event, @@ -247,12 +232,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,  	return 0;  } -static void remove_access_response(struct fsnotify_group *group, -				   struct fsnotify_event *event, -				   __s32 fd) -{ -	return; -}  #endif  static ssize_t copy_event_to_user(struct fsnotify_group *group, @@ -260,31 +239,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,  				  char __user *buf)  {  	struct fanotify_event_metadata fanotify_event_metadata; +	struct file *f;  	int fd, ret;  	pr_debug("%s: group=%p event=%p\n", __func__, group, event); -	ret = fill_event_metadata(group, &fanotify_event_metadata, event); +	ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);  	if (ret < 0)  		goto out;  	fd = fanotify_event_metadata.fd; -	ret = prepare_for_access_response(group, event, fd); -	if (ret) -		goto out_close_fd; -  	ret = -EFAULT;  	if (copy_to_user(buf, &fanotify_event_metadata,  			 fanotify_event_metadata.event_len)) -		goto out_kill_access_response; +		goto out_close_fd; +	ret = prepare_for_access_response(group, event, fd); +	if (ret) +		goto out_close_fd; + +	fd_install(fd, f);  	return fanotify_event_metadata.event_len; -out_kill_access_response: -	remove_access_response(group, event, fd);  out_close_fd: -	if (fd != FAN_NOFD) -		sys_close(fd); +	if (fd != FAN_NOFD) { +		put_unused_fd(fd); +		fput(f); +	}  out:  #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS  	if (event->mask & FAN_ALL_PERM_EVENTS) { @@ -470,24 +451,22 @@ static int 
fanotify_find_path(int dfd, const char __user *filename,  		 dfd, filename, flags);  	if (filename == NULL) { -		struct file *file; -		int fput_needed; +		struct fd f = fdget(dfd);  		ret = -EBADF; -		file = fget_light(dfd, &fput_needed); -		if (!file) +		if (!f.file)  			goto out;  		ret = -ENOTDIR;  		if ((flags & FAN_MARK_ONLYDIR) && -		    !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) { -			fput_light(file, fput_needed); +		    !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) { +			fdput(f);  			goto out;  		} -		*path = file->f_path; +		*path = f.file->f_path;  		path_get(path); -		fput_light(file, fput_needed); +		fdput(f);  	} else {  		unsigned int lookup_flags = 0; @@ -767,9 +746,9 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,  	struct inode *inode = NULL;  	struct vfsmount *mnt = NULL;  	struct fsnotify_group *group; -	struct file *filp; +	struct fd f;  	struct path path; -	int ret, fput_needed; +	int ret;  	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",  		 __func__, fanotify_fd, flags, dfd, pathname, mask); @@ -803,15 +782,15 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,  #endif  		return -EINVAL; -	filp = fget_light(fanotify_fd, &fput_needed); -	if (unlikely(!filp)) +	f = fdget(fanotify_fd); +	if (unlikely(!f.file))  		return -EBADF;  	/* verify that this is indeed an fanotify instance */  	ret = -EINVAL; -	if (unlikely(filp->f_op != &fanotify_fops)) +	if (unlikely(f.file->f_op != &fanotify_fops))  		goto fput_and_out; -	group = filp->private_data; +	group = f.file->private_data;  	/*  	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  
These are not @@ -858,7 +837,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,  	path_put(&path);  fput_and_out: -	fput_light(filp, fput_needed); +	fdput(f);  	return ret;  } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 8445fbc8985..c311dda054a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -757,16 +757,16 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,  	struct fsnotify_group *group;  	struct inode *inode;  	struct path path; -	struct file *filp; -	int ret, fput_needed; +	struct fd f; +	int ret;  	unsigned flags = 0; -	filp = fget_light(fd, &fput_needed); -	if (unlikely(!filp)) +	f = fdget(fd); +	if (unlikely(!f.file))  		return -EBADF;  	/* verify that this is indeed an inotify instance */ -	if (unlikely(filp->f_op != &inotify_fops)) { +	if (unlikely(f.file->f_op != &inotify_fops)) {  		ret = -EINVAL;  		goto fput_and_out;  	} @@ -782,13 +782,13 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,  	/* inode held in place by reference to path; group by fget on fd */  	inode = path.dentry->d_inode; -	group = filp->private_data; +	group = f.file->private_data;  	/* create/update an inode mark */  	ret = inotify_update_watch(group, inode, mask);  	path_put(&path);  fput_and_out: -	fput_light(filp, fput_needed); +	fdput(f);  	return ret;  } @@ -796,19 +796,19 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)  {  	struct fsnotify_group *group;  	struct inotify_inode_mark *i_mark; -	struct file *filp; -	int ret = 0, fput_needed; +	struct fd f; +	int ret = 0; -	filp = fget_light(fd, &fput_needed); -	if (unlikely(!filp)) +	f = fdget(fd); +	if (unlikely(!f.file))  		return -EBADF;  	/* verify that this is indeed an inotify instance */  	ret = -EINVAL; -	if (unlikely(filp->f_op != &inotify_fops)) +	if (unlikely(f.file->f_op != &inotify_fops))  		goto out; -	group = filp->private_data; +	group = 
f.file->private_data;  	ret = -EINVAL;  	i_mark = inotify_idr_find(group, wd); @@ -823,7 +823,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)  	fsnotify_put_mark(&i_mark->fsn_mark);  out: -	fput_light(filp, fput_needed); +	fdput(f);  	return ret;  } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index da01c165067..4a8289f8b16 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3193,6 +3193,12 @@ static void __exit exit_ntfs_fs(void)  	ntfs_debug("Unregistering NTFS driver.");  	unregister_filesystem(&ntfs_fs_type); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ntfs_big_inode_cache);  	kmem_cache_destroy(ntfs_inode_cache);  	kmem_cache_destroy(ntfs_name_cache); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a4e855e3690..f7c648d7d6b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1746,8 +1746,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	long fd;  	int sectsize;  	char *p = (char *)page; -	struct file *filp = NULL; -	struct inode *inode = NULL; +	struct fd f; +	struct inode *inode;  	ssize_t ret = -EINVAL;  	int live_threshold; @@ -1766,26 +1766,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	if (fd < 0 || fd >= INT_MAX)  		goto out; -	filp = fget(fd); -	if (filp == NULL) +	f = fdget(fd); +	if (f.file == NULL)  		goto out;  	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||  	    reg->hr_block_bytes == 0) -		goto out; +		goto out2; -	inode = igrab(filp->f_mapping->host); +	inode = igrab(f.file->f_mapping->host);  	if (inode == NULL) -		goto out; +		goto out2;  	if (!S_ISBLK(inode->i_mode)) -		goto out; +		goto out3; -	reg->hr_bdev = I_BDEV(filp->f_mapping->host); +	reg->hr_bdev = I_BDEV(f.file->f_mapping->host);  	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);  	if (ret) {  		reg->hr_bdev = NULL; -		goto out; +		goto out3;  	}  	inode 
= NULL; @@ -1797,7 +1797,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  		     "blocksize %u incorrect for device, expected %d",  		     reg->hr_block_bytes, sectsize);  		ret = -EINVAL; -		goto out; +		goto out3;  	}  	o2hb_init_region_params(reg); @@ -1811,13 +1811,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	ret = o2hb_map_slot_data(reg);  	if (ret) {  		mlog_errno(ret); -		goto out; +		goto out3;  	}  	ret = o2hb_populate_slot_data(reg);  	if (ret) {  		mlog_errno(ret); -		goto out; +		goto out3;  	}  	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout); @@ -1847,7 +1847,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	if (IS_ERR(hb_task)) {  		ret = PTR_ERR(hb_task);  		mlog_errno(ret); -		goto out; +		goto out3;  	}  	spin_lock(&o2hb_live_lock); @@ -1863,7 +1863,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	if (reg->hr_aborted_start) {  		ret = -EIO; -		goto out; +		goto out3;  	}  	/* Ok, we were woken.  Make sure it wasn't by drop_item() */ @@ -1882,11 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",  		       config_item_name(&reg->hr_item), reg->hr_dev_name); +out3: +	iput(inode); +out2: +	fdput(f);  out: -	if (filp) -		fput(filp); -	if (inode) -		iput(inode);  	if (ret < 0) {  		if (reg->hr_bdev) {  			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 83b6f98e066..16b712d260d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -691,6 +691,11 @@ static void __exit exit_dlmfs_fs(void)  	flush_workqueue(user_dlm_worker);  	destroy_workqueue(user_dlm_worker); +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(dlmfs_inode_cache);  	bdi_destroy(&dlmfs_backing_dev_info); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 68f4541c2db..0e91ec22a94 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1818,6 +1818,11 @@ static int ocfs2_initialize_mem_caches(void)  static void ocfs2_free_mem_caches(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	if (ocfs2_inode_cachep)  		kmem_cache_destroy(ocfs2_inode_cachep);  	ocfs2_inode_cachep = NULL; diff --git a/fs/open.c b/fs/open.c index b0bae3a4182..44da0feeca2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -132,27 +132,27 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)  static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)  { -	struct inode * inode; +	struct inode *inode;  	struct dentry *dentry; -	struct file * file; +	struct fd f;  	int error;  	error = -EINVAL;  	if (length < 0)  		goto out;  	error = -EBADF; -	file = fget(fd); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		goto out;  	/* explicitly opened as large or we are on 64-bit box */ -	if (file->f_flags & O_LARGEFILE) +	if (f.file->f_flags & O_LARGEFILE)  		small = 0; -	dentry = file->f_path.dentry; +	dentry = f.file->f_path.dentry;  	inode = dentry->d_inode;  	error = -EINVAL; -	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) +	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))  		goto out_putf;  	error = -EINVAL; @@ -165,14 +165,14 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)  		goto out_putf;  	sb_start_write(inode->i_sb); -	error = locks_verify_truncate(inode, file, length); +	error = locks_verify_truncate(inode, f.file, length);  	if (!error) -		error = security_path_truncate(&file->f_path); +		error = security_path_truncate(&f.file->f_path);  	if (!error) -		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); +		error = 
do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);  	sb_end_write(inode->i_sb);  out_putf: -	fput(file); +	fdput(f);  out:  	return error;  } @@ -276,15 +276,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)  { -	struct file *file; +	struct fd f = fdget(fd);  	int error = -EBADF; -	file = fget(fd); -	if (file) { -		error = do_fallocate(file, mode, offset, len); -		fput(file); +	if (f.file) { +		error = do_fallocate(f.file, mode, offset, len); +		fdput(f);  	} -  	return error;  } @@ -400,16 +398,15 @@ out:  SYSCALL_DEFINE1(fchdir, unsigned int, fd)  { -	struct file *file; +	struct fd f = fdget_raw(fd);  	struct inode *inode; -	int error, fput_needed; +	int error = -EBADF;  	error = -EBADF; -	file = fget_raw_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		goto out; -	inode = file->f_path.dentry->d_inode; +	inode = f.file->f_path.dentry->d_inode;  	error = -ENOTDIR;  	if (!S_ISDIR(inode->i_mode)) @@ -417,9 +414,9 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)  	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);  	if (!error) -		set_fs_pwd(current->fs, &file->f_path); +		set_fs_pwd(current->fs, &f.file->f_path);  out_putf: -	fput_light(file, fput_needed); +	fdput(f);  out:  	return error;  } @@ -582,23 +579,20 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group  SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)  { -	struct file * file; +	struct fd f = fdget(fd);  	int error = -EBADF; -	struct dentry * dentry; -	file = fget(fd); -	if (!file) +	if (!f.file)  		goto out; -	error = mnt_want_write_file(file); +	error = mnt_want_write_file(f.file);  	if (error)  		goto out_fput; -	dentry = file->f_path.dentry; -	audit_inode(NULL, dentry); -	error = chown_common(&file->f_path, user, group); -	mnt_drop_write_file(file); +	audit_inode(NULL, f.file->f_path.dentry); +	error = chown_common(&f.file->f_path, user, 
group); +	mnt_drop_write_file(f.file);  out_fput: -	fput(file); +	fdput(f);  out:  	return error;  } @@ -803,50 +797,6 @@ struct file *dentry_open(const struct path *path, int flags,  }  EXPORT_SYMBOL(dentry_open); -static void __put_unused_fd(struct files_struct *files, unsigned int fd) -{ -	struct fdtable *fdt = files_fdtable(files); -	__clear_open_fd(fd, fdt); -	if (fd < files->next_fd) -		files->next_fd = fd; -} - -void put_unused_fd(unsigned int fd) -{ -	struct files_struct *files = current->files; -	spin_lock(&files->file_lock); -	__put_unused_fd(files, fd); -	spin_unlock(&files->file_lock); -} - -EXPORT_SYMBOL(put_unused_fd); - -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array.  At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us.  We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. - */ - -void fd_install(unsigned int fd, struct file *file) -{ -	struct files_struct *files = current->files; -	struct fdtable *fdt; -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	BUG_ON(fdt->fd[fd] != NULL); -	rcu_assign_pointer(fdt->fd[fd], file); -	spin_unlock(&files->file_lock); -} - -EXPORT_SYMBOL(fd_install); -  static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)  {  	int lookup_flags = 0; @@ -858,7 +808,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o  		op->mode = 0;  	/* Must never be set by userspace */ -	flags &= ~FMODE_NONOTIFY; +	flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;  	/*  	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  
As many places only @@ -1038,23 +988,7 @@ EXPORT_SYMBOL(filp_close);   */  SYSCALL_DEFINE1(close, unsigned int, fd)  { -	struct file * filp; -	struct files_struct *files = current->files; -	struct fdtable *fdt; -	int retval; - -	spin_lock(&files->file_lock); -	fdt = files_fdtable(files); -	if (fd >= fdt->max_fds) -		goto out_unlock; -	filp = fdt->fd[fd]; -	if (!filp) -		goto out_unlock; -	rcu_assign_pointer(fdt->fd[fd], NULL); -	__clear_close_on_exec(fd, fdt); -	__put_unused_fd(files, fd); -	spin_unlock(&files->file_lock); -	retval = filp_close(filp, files); +	int retval = __close_fd(current->files, fd);  	/* can't restart close syscall because file table entry was cleared */  	if (unlikely(retval == -ERESTARTSYS || @@ -1064,10 +998,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)  		retval = -EINTR;  	return retval; - -out_unlock: -	spin_unlock(&files->file_lock); -	return -EBADF;  }  EXPORT_SYMBOL(sys_close); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 4a3477949bc..2ad080faca3 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -463,6 +463,11 @@ static int __init init_openprom_fs(void)  static void __exit exit_openprom_fs(void)  {  	unregister_filesystem(&openprom_fs_type); +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(op_inode_cachep);  } diff --git a/fs/pipe.c b/fs/pipe.c index 8d85d7068c1..bd3479db4b6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1064,9 +1064,8 @@ err_inode:  	return err;  } -int do_pipe_flags(int *fd, int flags) +static int __do_pipe_flags(int *fd, struct file **files, int flags)  { -	struct file *files[2];  	int error;  	int fdw, fdr; @@ -1088,11 +1087,8 @@ int do_pipe_flags(int *fd, int flags)  	fdw = error;  	audit_fd_pair(fdr, fdw); -	fd_install(fdr, files[0]); -	fd_install(fdw, files[1]);  	fd[0] = fdr;  	fd[1] = fdw; -  	return 0;   err_fdr: @@ -1103,21 +1099,38 @@ int do_pipe_flags(int *fd, int flags)  	return error;  } +int do_pipe_flags(int *fd, int flags) +{ +	struct file *files[2]; +	int error = __do_pipe_flags(fd, files, flags); +	if (!error) { +		fd_install(fd[0], files[0]); +		fd_install(fd[1], files[1]); +	} +	return error; +} +  /*   * sys_pipe() is the normal C calling standard for creating   * a pipe. It's not the way Unix traditionally does this, though.   
*/  SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)  { +	struct file *files[2];  	int fd[2];  	int error; -	error = do_pipe_flags(fd, flags); +	error = __do_pipe_flags(fd, files, flags);  	if (!error) { -		if (copy_to_user(fildes, fd, sizeof(fd))) { -			sys_close(fd[0]); -			sys_close(fd[1]); +		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { +			fput(files[0]); +			fput(files[1]); +			put_unused_fd(fd[0]); +			put_unused_fd(fd[1]);  			error = -EFAULT; +		} else { +			fd_install(fd[0], files[0]); +			fd_install(fd[1], files[1]);  		}  	}  	return error; diff --git a/fs/proc/Makefile b/fs/proc/Makefile index c1c72933592..99349efbbc2 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,7 @@ proc-y			:= nommu.o task_nommu.o  proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o  proc-y       += inode.o root.o base.o generic.o array.o \ -		proc_tty.o +		proc_tty.o fd.o  proc-y	+= cmdline.o  proc-y	+= consoles.o  proc-y	+= cpuinfo.o diff --git a/fs/proc/base.c b/fs/proc/base.c index acd1960c28a..d295af99367 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -90,6 +90,7 @@  #endif  #include <trace/events/oom.h>  #include "internal.h" +#include "fd.h"  /* NOTE:   *	Implementing inode permission operations in /proc is almost @@ -136,8 +137,6 @@ struct pid_entry {  		NULL, &proc_single_file_operations,	\  		{ .proc_show = show } ) -static int proc_fd_permission(struct inode *inode, int mask); -  /*   * Count the number of hardlinks for the pid_entry table, excluding the .   * and .. links. 
@@ -1500,7 +1499,7 @@ out:  	return error;  } -static const struct inode_operations proc_pid_link_inode_operations = { +const struct inode_operations proc_pid_link_inode_operations = {  	.readlink	= proc_pid_readlink,  	.follow_link	= proc_pid_follow_link,  	.setattr	= proc_setattr, @@ -1509,21 +1508,6 @@ static const struct inode_operations proc_pid_link_inode_operations = {  /* building an inode */ -static int task_dumpable(struct task_struct *task) -{ -	int dumpable = 0; -	struct mm_struct *mm; - -	task_lock(task); -	mm = task->mm; -	if (mm) -		dumpable = get_dumpable(mm); -	task_unlock(task); -	if(dumpable == 1) -		return 1; -	return 0; -} -  struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)  {  	struct inode * inode; @@ -1649,15 +1633,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)  	return 0;  } -static int pid_delete_dentry(const struct dentry * dentry) -{ -	/* Is the task we represent dead? -	 * If so, then don't put the dentry on the lru list, -	 * kill it immediately. 
-	 */ -	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} -  const struct dentry_operations pid_dentry_operations =  {  	.d_revalidate	= pid_revalidate, @@ -1720,289 +1695,6 @@ end_instantiate:  	return filldir(dirent, name, len, filp->f_pos, ino, type);  } -static unsigned name_to_int(struct dentry *dentry) -{ -	const char *name = dentry->d_name.name; -	int len = dentry->d_name.len; -	unsigned n = 0; - -	if (len > 1 && *name == '0') -		goto out; -	while (len-- > 0) { -		unsigned c = *name++ - '0'; -		if (c > 9) -			goto out; -		if (n >= (~0U-9)/10) -			goto out; -		n *= 10; -		n += c; -	} -	return n; -out: -	return ~0U; -} - -#define PROC_FDINFO_MAX 64 - -static int proc_fd_info(struct inode *inode, struct path *path, char *info) -{ -	struct task_struct *task = get_proc_task(inode); -	struct files_struct *files = NULL; -	struct file *file; -	int fd = proc_fd(inode); - -	if (task) { -		files = get_files_struct(task); -		put_task_struct(task); -	} -	if (files) { -		/* -		 * We are not taking a ref to the file structure, so we must -		 * hold ->file_lock. 
-		 */ -		spin_lock(&files->file_lock); -		file = fcheck_files(files, fd); -		if (file) { -			unsigned int f_flags; -			struct fdtable *fdt; - -			fdt = files_fdtable(files); -			f_flags = file->f_flags & ~O_CLOEXEC; -			if (close_on_exec(fd, fdt)) -				f_flags |= O_CLOEXEC; - -			if (path) { -				*path = file->f_path; -				path_get(&file->f_path); -			} -			if (info) -				snprintf(info, PROC_FDINFO_MAX, -					 "pos:\t%lli\n" -					 "flags:\t0%o\n", -					 (long long) file->f_pos, -					 f_flags); -			spin_unlock(&files->file_lock); -			put_files_struct(files); -			return 0; -		} -		spin_unlock(&files->file_lock); -		put_files_struct(files); -	} -	return -ENOENT; -} - -static int proc_fd_link(struct dentry *dentry, struct path *path) -{ -	return proc_fd_info(dentry->d_inode, path, NULL); -} - -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) -{ -	struct inode *inode; -	struct task_struct *task; -	int fd; -	struct files_struct *files; -	const struct cred *cred; - -	if (flags & LOOKUP_RCU) -		return -ECHILD; - -	inode = dentry->d_inode; -	task = get_proc_task(inode); -	fd = proc_fd(inode); - -	if (task) { -		files = get_files_struct(task); -		if (files) { -			struct file *file; -			rcu_read_lock(); -			file = fcheck_files(files, fd); -			if (file) { -				unsigned f_mode = file->f_mode; - -				rcu_read_unlock(); -				put_files_struct(files); - -				if (task_dumpable(task)) { -					rcu_read_lock(); -					cred = __task_cred(task); -					inode->i_uid = cred->euid; -					inode->i_gid = cred->egid; -					rcu_read_unlock(); -				} else { -					inode->i_uid = GLOBAL_ROOT_UID; -					inode->i_gid = GLOBAL_ROOT_GID; -				} - -				if (S_ISLNK(inode->i_mode)) { -					unsigned i_mode = S_IFLNK; -					if (f_mode & FMODE_READ) -						i_mode |= S_IRUSR | S_IXUSR; -					if (f_mode & FMODE_WRITE) -						i_mode |= S_IWUSR | S_IXUSR; -					inode->i_mode = i_mode; -				} - -				security_task_to_inode(task, inode); -				put_task_struct(task); -				return 1; -			} -			
rcu_read_unlock(); -			put_files_struct(files); -		} -		put_task_struct(task); -	} -	d_drop(dentry); -	return 0; -} - -static const struct dentry_operations tid_fd_dentry_operations = -{ -	.d_revalidate	= tid_fd_revalidate, -	.d_delete	= pid_delete_dentry, -}; - -static struct dentry *proc_fd_instantiate(struct inode *dir, -	struct dentry *dentry, struct task_struct *task, const void *ptr) -{ -	unsigned fd = (unsigned long)ptr; - 	struct inode *inode; - 	struct proc_inode *ei; -	struct dentry *error = ERR_PTR(-ENOENT); - -	inode = proc_pid_make_inode(dir->i_sb, task); -	if (!inode) -		goto out; -	ei = PROC_I(inode); -	ei->fd = fd; - -	inode->i_mode = S_IFLNK; -	inode->i_op = &proc_pid_link_inode_operations; -	inode->i_size = 64; -	ei->op.proc_get_link = proc_fd_link; -	d_set_d_op(dentry, &tid_fd_dentry_operations); -	d_add(dentry, inode); -	/* Close the race of the process dying before we return the dentry */ -	if (tid_fd_revalidate(dentry, 0)) -		error = NULL; - - out: -	return error; -} - -static struct dentry *proc_lookupfd_common(struct inode *dir, -					   struct dentry *dentry, -					   instantiate_t instantiate) -{ -	struct task_struct *task = get_proc_task(dir); -	unsigned fd = name_to_int(dentry); -	struct dentry *result = ERR_PTR(-ENOENT); - -	if (!task) -		goto out_no_task; -	if (fd == ~0U) -		goto out; - -	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); -out: -	put_task_struct(task); -out_no_task: -	return result; -} - -static int proc_readfd_common(struct file * filp, void * dirent, -			      filldir_t filldir, instantiate_t instantiate) -{ -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; -	struct task_struct *p = get_proc_task(inode); -	unsigned int fd, ino; -	int retval; -	struct files_struct * files; - -	retval = -ENOENT; -	if (!p) -		goto out_no_task; -	retval = 0; - -	fd = filp->f_pos; -	switch (fd) { -		case 0: -			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) -				goto out; -	
		filp->f_pos++; -		case 1: -			ino = parent_ino(dentry); -			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) -				goto out; -			filp->f_pos++; -		default: -			files = get_files_struct(p); -			if (!files) -				goto out; -			rcu_read_lock(); -			for (fd = filp->f_pos-2; -			     fd < files_fdtable(files)->max_fds; -			     fd++, filp->f_pos++) { -				char name[PROC_NUMBUF]; -				int len; -				int rv; - -				if (!fcheck_files(files, fd)) -					continue; -				rcu_read_unlock(); - -				len = snprintf(name, sizeof(name), "%d", fd); -				rv = proc_fill_cache(filp, dirent, filldir, -						     name, len, instantiate, p, -						     (void *)(unsigned long)fd); -				if (rv < 0) -					goto out_fd_loop; -				rcu_read_lock(); -			} -			rcu_read_unlock(); -out_fd_loop: -			put_files_struct(files); -	} -out: -	put_task_struct(p); -out_no_task: -	return retval; -} - -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, -				    unsigned int flags) -{ -	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); -} - -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) -{ -	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); -} - -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, -				      size_t len, loff_t *ppos) -{ -	char tmp[PROC_FDINFO_MAX]; -	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); -	if (!err) -		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); -	return err; -} - -static const struct file_operations proc_fdinfo_file_operations = { -	.open           = nonseekable_open, -	.read		= proc_fdinfo_read, -	.llseek		= no_llseek, -}; - -static const struct file_operations proc_fd_operations = { -	.read		= generic_read_dir, -	.readdir	= proc_readfd, -	.llseek		= default_llseek, -}; -  #ifdef CONFIG_CHECKPOINT_RESTORE  /* @@ -2121,7 +1813,7 @@ out:  }  struct map_files_info { -	struct file	*file; +	fmode_t		mode;  	unsigned long	len;  	unsigned char	
name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */  }; @@ -2130,13 +1822,10 @@ static struct dentry *  proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,  			   struct task_struct *task, const void *ptr)  { -	const struct file *file = ptr; +	fmode_t mode = (fmode_t)(unsigned long)ptr;  	struct proc_inode *ei;  	struct inode *inode; -	if (!file) -		return ERR_PTR(-ENOENT); -  	inode = proc_pid_make_inode(dir->i_sb, task);  	if (!inode)  		return ERR_PTR(-ENOENT); @@ -2148,9 +1837,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,  	inode->i_size = 64;  	inode->i_mode = S_IFLNK; -	if (file->f_mode & FMODE_READ) +	if (mode & FMODE_READ)  		inode->i_mode |= S_IRUSR; -	if (file->f_mode & FMODE_WRITE) +	if (mode & FMODE_WRITE)  		inode->i_mode |= S_IWUSR;  	d_set_d_op(dentry, &tid_map_files_dentry_operations); @@ -2194,7 +1883,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,  	if (!vma)  		goto out_no_vma; -	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); +	result = proc_map_files_instantiate(dir, dentry, task, +			(void *)(unsigned long)vma->vm_file->f_mode);  out_no_vma:  	up_read(&mm->mmap_sem); @@ -2295,8 +1985,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)  				if (++pos <= filp->f_pos)  					continue; -				get_file(vma->vm_file); -				info.file = vma->vm_file; +				info.mode = vma->vm_file->f_mode;  				info.len = snprintf(info.name,  						sizeof(info.name), "%lx-%lx",  						vma->vm_start, vma->vm_end); @@ -2311,19 +2000,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)  			ret = proc_fill_cache(filp, dirent, filldir,  					      p->name, p->len,  					      proc_map_files_instantiate, -					      task, p->file); +					      task, +					      (void *)(unsigned long)p->mode);  			if (ret)  				break;  			filp->f_pos++; -			fput(p->file); -		} -		for (; i < nr_files; i++) { -			/* -			 * In case of error don't forget -			 * to 
put rest of file refs. -			 */ -			p = flex_array_get(fa, i); -			fput(p->file);  		}  		if (fa)  			flex_array_free(fa); @@ -2345,82 +2026,6 @@ static const struct file_operations proc_map_files_operations = {  #endif /* CONFIG_CHECKPOINT_RESTORE */ -/* - * /proc/pid/fd needs a special permission handler so that a process can still - * access /proc/self/fd after it has executed a setuid(). - */ -static int proc_fd_permission(struct inode *inode, int mask) -{ -	int rv = generic_permission(inode, mask); -	if (rv == 0) -		return 0; -	if (task_pid(current) == proc_pid(inode)) -		rv = 0; -	return rv; -} - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fd_inode_operations = { -	.lookup		= proc_lookupfd, -	.permission	= proc_fd_permission, -	.setattr	= proc_setattr, -}; - -static struct dentry *proc_fdinfo_instantiate(struct inode *dir, -	struct dentry *dentry, struct task_struct *task, const void *ptr) -{ -	unsigned fd = (unsigned long)ptr; - 	struct inode *inode; - 	struct proc_inode *ei; -	struct dentry *error = ERR_PTR(-ENOENT); - -	inode = proc_pid_make_inode(dir->i_sb, task); -	if (!inode) -		goto out; -	ei = PROC_I(inode); -	ei->fd = fd; -	inode->i_mode = S_IFREG | S_IRUSR; -	inode->i_fop = &proc_fdinfo_file_operations; -	d_set_d_op(dentry, &tid_fd_dentry_operations); -	d_add(dentry, inode); -	/* Close the race of the process dying before we return the dentry */ -	if (tid_fd_revalidate(dentry, 0)) -		error = NULL; - - out: -	return error; -} - -static struct dentry *proc_lookupfdinfo(struct inode *dir, -					struct dentry *dentry, -					unsigned int flags) -{ -	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); -} - -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) -{ -	return proc_readfd_common(filp, dirent, filldir, -				  proc_fdinfo_instantiate); -} - -static const struct file_operations proc_fdinfo_operations = { -	.read		= generic_read_dir, -	.readdir	= 
proc_readfdinfo, -	.llseek		= default_llseek, -}; - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fdinfo_inode_operations = { -	.lookup		= proc_lookupfdinfo, -	.setattr	= proc_setattr, -}; - -  static struct dentry *proc_pident_instantiate(struct inode *dir,  	struct dentry *dentry, struct task_struct *task, const void *ptr)  { diff --git a/fs/proc/fd.c b/fs/proc/fd.c new file mode 100644 index 00000000000..f28a875f877 --- /dev/null +++ b/fs/proc/fd.c @@ -0,0 +1,367 @@ +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/dcache.h> +#include <linux/path.h> +#include <linux/fdtable.h> +#include <linux/namei.h> +#include <linux/pid.h> +#include <linux/security.h> +#include <linux/file.h> +#include <linux/seq_file.h> + +#include <linux/proc_fs.h> + +#include "internal.h" +#include "fd.h" + +static int seq_show(struct seq_file *m, void *v) +{ +	struct files_struct *files = NULL; +	int f_flags = 0, ret = -ENOENT; +	struct file *file = NULL; +	struct task_struct *task; + +	task = get_proc_task(m->private); +	if (!task) +		return -ENOENT; + +	files = get_files_struct(task); +	put_task_struct(task); + +	if (files) { +		int fd = proc_fd(m->private); + +		spin_lock(&files->file_lock); +		file = fcheck_files(files, fd); +		if (file) { +			struct fdtable *fdt = files_fdtable(files); + +			f_flags = file->f_flags; +			if (close_on_exec(fd, fdt)) +				f_flags |= O_CLOEXEC; + +			get_file(file); +			ret = 0; +		} +		spin_unlock(&files->file_lock); +		put_files_struct(files); +	} + +	if (!ret) { +                seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", +			   (long long)file->f_pos, f_flags); +		fput(file); +	} + +	return ret; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ +	return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { +	.open		= seq_fdinfo_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= 
single_release, +}; + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ +	struct files_struct *files; +	struct task_struct *task; +	const struct cred *cred; +	struct inode *inode; +	int fd; + +	if (flags & LOOKUP_RCU) +		return -ECHILD; + +	inode = dentry->d_inode; +	task = get_proc_task(inode); +	fd = proc_fd(inode); + +	if (task) { +		files = get_files_struct(task); +		if (files) { +			struct file *file; + +			rcu_read_lock(); +			file = fcheck_files(files, fd); +			if (file) { +				unsigned f_mode = file->f_mode; + +				rcu_read_unlock(); +				put_files_struct(files); + +				if (task_dumpable(task)) { +					rcu_read_lock(); +					cred = __task_cred(task); +					inode->i_uid = cred->euid; +					inode->i_gid = cred->egid; +					rcu_read_unlock(); +				} else { +					inode->i_uid = GLOBAL_ROOT_UID; +					inode->i_gid = GLOBAL_ROOT_GID; +				} + +				if (S_ISLNK(inode->i_mode)) { +					unsigned i_mode = S_IFLNK; +					if (f_mode & FMODE_READ) +						i_mode |= S_IRUSR | S_IXUSR; +					if (f_mode & FMODE_WRITE) +						i_mode |= S_IWUSR | S_IXUSR; +					inode->i_mode = i_mode; +				} + +				security_task_to_inode(task, inode); +				put_task_struct(task); +				return 1; +			} +			rcu_read_unlock(); +			put_files_struct(files); +		} +		put_task_struct(task); +	} + +	d_drop(dentry); +	return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { +	.d_revalidate	= tid_fd_revalidate, +	.d_delete	= pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ +	struct files_struct *files = NULL; +	struct task_struct *task; +	int ret = -ENOENT; + +	task = get_proc_task(dentry->d_inode); +	if (task) { +		files = get_files_struct(task); +		put_task_struct(task); +	} + +	if (files) { +		int fd = proc_fd(dentry->d_inode); +		struct file *fd_file; + +		spin_lock(&files->file_lock); +		fd_file = fcheck_files(files, fd); +		if (fd_file) { +			*path = fd_file->f_path; +			path_get(&fd_file->f_path); +			
ret = 0; +		} +		spin_unlock(&files->file_lock); +		put_files_struct(files); +	} + +	return ret; +} + +static struct dentry * +proc_fd_instantiate(struct inode *dir, struct dentry *dentry, +		    struct task_struct *task, const void *ptr) +{ +	struct dentry *error = ERR_PTR(-ENOENT); +	unsigned fd = (unsigned long)ptr; +	struct proc_inode *ei; +	struct inode *inode; + +	inode = proc_pid_make_inode(dir->i_sb, task); +	if (!inode) +		goto out; + +	ei = PROC_I(inode); +	ei->fd = fd; + +	inode->i_mode = S_IFLNK; +	inode->i_op = &proc_pid_link_inode_operations; +	inode->i_size = 64; + +	ei->op.proc_get_link = proc_fd_link; + +	d_set_d_op(dentry, &tid_fd_dentry_operations); +	d_add(dentry, inode); + +	/* Close the race of the process dying before we return the dentry */ +	if (tid_fd_revalidate(dentry, 0)) +		error = NULL; + out: +	return error; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, +					   struct dentry *dentry, +					   instantiate_t instantiate) +{ +	struct task_struct *task = get_proc_task(dir); +	struct dentry *result = ERR_PTR(-ENOENT); +	unsigned fd = name_to_int(dentry); + +	if (!task) +		goto out_no_task; +	if (fd == ~0U) +		goto out; + +	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); +out: +	put_task_struct(task); +out_no_task: +	return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, +			      filldir_t filldir, instantiate_t instantiate) +{ +	struct dentry *dentry = filp->f_path.dentry; +	struct inode *inode = dentry->d_inode; +	struct task_struct *p = get_proc_task(inode); +	struct files_struct *files; +	unsigned int fd, ino; +	int retval; + +	retval = -ENOENT; +	if (!p) +		goto out_no_task; +	retval = 0; + +	fd = filp->f_pos; +	switch (fd) { +		case 0: +			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) +				goto out; +			filp->f_pos++; +		case 1: +			ino = parent_ino(dentry); +			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) +				goto out; +			filp->f_pos++; +		
default: +			files = get_files_struct(p); +			if (!files) +				goto out; +			rcu_read_lock(); +			for (fd = filp->f_pos - 2; +			     fd < files_fdtable(files)->max_fds; +			     fd++, filp->f_pos++) { +				char name[PROC_NUMBUF]; +				int len; +				int rv; + +				if (!fcheck_files(files, fd)) +					continue; +				rcu_read_unlock(); + +				len = snprintf(name, sizeof(name), "%d", fd); +				rv = proc_fill_cache(filp, dirent, filldir, +						     name, len, instantiate, p, +						     (void *)(unsigned long)fd); +				if (rv < 0) +					goto out_fd_loop; +				rcu_read_lock(); +			} +			rcu_read_unlock(); +out_fd_loop: +			put_files_struct(files); +	} +out: +	put_task_struct(p); +out_no_task: +	return retval; +} + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ +	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { +	.read		= generic_read_dir, +	.readdir	= proc_readfd, +	.llseek		= default_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, +				    unsigned int flags) +{ +	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). 
+ */ +int proc_fd_permission(struct inode *inode, int mask) +{ +	int rv = generic_permission(inode, mask); +	if (rv == 0) +		return 0; +	if (task_pid(current) == proc_pid(inode)) +		rv = 0; +	return rv; +} + +const struct inode_operations proc_fd_inode_operations = { +	.lookup		= proc_lookupfd, +	.permission	= proc_fd_permission, +	.setattr	= proc_setattr, +}; + +static struct dentry * +proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, +			struct task_struct *task, const void *ptr) +{ +	struct dentry *error = ERR_PTR(-ENOENT); +	unsigned fd = (unsigned long)ptr; +	struct proc_inode *ei; +	struct inode *inode; + +	inode = proc_pid_make_inode(dir->i_sb, task); +	if (!inode) +		goto out; + +	ei = PROC_I(inode); +	ei->fd = fd; + +	inode->i_mode = S_IFREG | S_IRUSR; +	inode->i_fop = &proc_fdinfo_file_operations; + +	d_set_d_op(dentry, &tid_fd_dentry_operations); +	d_add(dentry, inode); + +	/* Close the race of the process dying before we return the dentry */ +	if (tid_fd_revalidate(dentry, 0)) +		error = NULL; + out: +	return error; +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ +	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ +	return proc_readfd_common(filp, dirent, filldir, +				  proc_fdinfo_instantiate); +} + +const struct inode_operations proc_fdinfo_inode_operations = { +	.lookup		= proc_lookupfdinfo, +	.setattr	= proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { +	.read		= generic_read_dir, +	.readdir	= proc_readfdinfo, +	.llseek		= default_llseek, +}; diff --git a/fs/proc/fd.h b/fs/proc/fd.h new file mode 100644 index 00000000000..cbb1d47deda --- /dev/null +++ b/fs/proc/fd.h @@ -0,0 +1,14 @@ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include <linux/fs.h> + +extern const struct file_operations proc_fd_operations; +extern const struct 
inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct inode *inode, int mask); + +#endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e1167a1c912..67925a7bd8c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -9,6 +9,7 @@   * 2 of the License, or (at your option) any later version.   */ +#include <linux/sched.h>  #include <linux/proc_fs.h>  struct  ctl_table_header; @@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations;  extern const struct file_operations proc_pagemap_operations;  extern const struct file_operations proc_net_operations;  extern const struct inode_operations proc_net_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations;  struct proc_maps_private {  	struct pid *pid; @@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode)  	return PROC_I(inode)->fd;  } +static inline int task_dumpable(struct task_struct *task) +{ +	int dumpable = 0; +	struct mm_struct *mm; + +	task_lock(task); +	mm = task->mm; +	if (mm) +		dumpable = get_dumpable(mm); +	task_unlock(task); +	if(dumpable == 1) +		return 1; +	return 0; +} + +static inline int pid_delete_dentry(const struct dentry * dentry) +{ +	/* Is the task we represent dead? +	 * If so, then don't put the dentry on the lru list, +	 * kill it immediately. 
+	 */ +	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +static inline unsigned name_to_int(struct dentry *dentry) +{ +	const char *name = dentry->d_name.name; +	int len = dentry->d_name.len; +	unsigned n = 0; + +	if (len > 1 && *name == '0') +		goto out; +	while (len-- > 0) { +		unsigned c = *name++ - '0'; +		if (c > 9) +			goto out; +		if (n >= (~0U-9)/10) +			goto out; +		n *= 10; +		n += c; +	} +	return n; +out: +	return ~0U; +} +  struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,  		struct dentry *dentry);  int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 5c3c7b02e17..43098bb5723 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -391,6 +391,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(qnx4_inode_cachep);  } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index f4eef0b5e7b..b6addf56048 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -651,6 +651,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(qnx6_inode_cachep);  } diff --git a/fs/read_write.c b/fs/read_write.c index 1adfb691e4f..d06534857e9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -232,23 +232,18 @@ EXPORT_SYMBOL(vfs_llseek);  SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)  {  	off_t retval; -	struct file * file; -	int fput_needed; - -	retval = -EBADF; -	file = fget_light(fd, &fput_needed); -	if (!file) -		goto bad; +	struct fd f = fdget(fd); +	if (!f.file) +		return -EBADF;  	retval = -EINVAL;  	if (origin <= SEEK_MAX) { -		loff_t res = vfs_llseek(file, offset, origin); +		loff_t res = vfs_llseek(f.file, offset, origin);  		retval = res;  		if (res != (loff_t)retval)  			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */  	} -	fput_light(file, fput_needed); -bad: +	fdput(f);  	return retval;  } @@ -258,20 +253,17 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,  		unsigned int, origin)  {  	int retval; -	struct file * file; +	struct fd f = fdget(fd);  	loff_t offset; -	int fput_needed; -	retval = -EBADF; -	file = fget_light(fd, &fput_needed); -	if (!file) -		goto bad; +	if (!f.file) +		return -EBADF;  	retval = -EINVAL;  	if (origin > SEEK_MAX)  		goto out_putf; -	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, +	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,  			origin);  	retval = (int)offset; @@ -281,8 +273,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,  			retval = 0;  	}  out_putf: -	fput_light(file, fput_needed); -bad: +	fdput(f);  	return retval;  }  #endif @@ -461,34 +452,29 @@ static inline void file_pos_write(struct file *file, loff_t pos)  SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)  { -	struct file *file; +	struct fd f = fdget(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = 
file_pos_read(file); -		ret = vfs_read(file, buf, count, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_read(f.file, buf, count, &pos); +		file_pos_write(f.file, pos); +		fdput(f);  	} -  	return ret;  }  SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,  		size_t, count)  { -	struct file *file; +	struct fd f = fdget(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_write(file, buf, count, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_write(f.file, buf, count, &pos); +		file_pos_write(f.file, pos); +		fdput(f);  	}  	return ret; @@ -497,19 +483,18 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,  SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,  			size_t count, loff_t pos)  { -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PREAD) -			ret = vfs_read(file, buf, count, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PREAD) +			ret = vfs_read(f.file, buf, count, &pos); +		fdput(f);  	}  	return ret; @@ -526,19 +511,18 @@ SYSCALL_ALIAS(sys_pread64, SyS_pread64);  SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,  			 size_t count, loff_t pos)  { -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PWRITE)   -			ret = vfs_write(file, buf, count, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PWRITE) 
  +			ret = vfs_write(f.file, buf, count, &pos); +		fdput(f);  	}  	return ret; @@ -789,16 +773,14 @@ EXPORT_SYMBOL(vfs_writev);  SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen)  { -	struct file *file; +	struct fd f = fdget(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_readv(file, vec, vlen, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_readv(f.file, vec, vlen, &pos); +		file_pos_write(f.file, pos); +		fdput(f);  	}  	if (ret > 0) @@ -810,16 +792,14 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,  SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen)  { -	struct file *file; +	struct fd f = fdget(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_writev(file, vec, vlen, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_writev(f.file, vec, vlen, &pos); +		file_pos_write(f.file, pos); +		fdput(f);  	}  	if (ret > 0) @@ -838,19 +818,18 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)  {  	loff_t pos = pos_from_hilo(pos_h, pos_l); -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PREAD) -			ret = vfs_readv(file, vec, vlen, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PREAD) +			ret = vfs_readv(f.file, vec, vlen, &pos); +		fdput(f);  	}  	if (ret > 
0) @@ -863,19 +842,18 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)  {  	loff_t pos = pos_from_hilo(pos_h, pos_l); -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PWRITE) -			ret = vfs_writev(file, vec, vlen, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PWRITE) +			ret = vfs_writev(f.file, vec, vlen, &pos); +		fdput(f);  	}  	if (ret > 0) @@ -884,31 +862,31 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,  	return ret;  } -static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, -			   size_t count, loff_t max) +ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, +		    loff_t max)  { -	struct file * in_file, * out_file; -	struct inode * in_inode, * out_inode; +	struct fd in, out; +	struct inode *in_inode, *out_inode;  	loff_t pos;  	ssize_t retval; -	int fput_needed_in, fput_needed_out, fl; +	int fl;  	/*  	 * Get input file, and verify that it is ok..  	 */  	retval = -EBADF; -	in_file = fget_light(in_fd, &fput_needed_in); -	if (!in_file) +	in = fdget(in_fd); +	if (!in.file)  		goto out; -	if (!(in_file->f_mode & FMODE_READ)) +	if (!(in.file->f_mode & FMODE_READ))  		goto fput_in;  	retval = -ESPIPE;  	if (!ppos) -		ppos = &in_file->f_pos; +		ppos = &in.file->f_pos;  	else -		if (!(in_file->f_mode & FMODE_PREAD)) +		if (!(in.file->f_mode & FMODE_PREAD))  			goto fput_in; -	retval = rw_verify_area(READ, in_file, ppos, count); +	retval = rw_verify_area(READ, in.file, ppos, count);  	if (retval < 0)  		goto fput_in;  	count = retval; @@ -917,15 +895,15 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  	 * Get output file, and verify that it is ok..  	 
*/  	retval = -EBADF; -	out_file = fget_light(out_fd, &fput_needed_out); -	if (!out_file) +	out = fdget(out_fd); +	if (!out.file)  		goto fput_in; -	if (!(out_file->f_mode & FMODE_WRITE)) +	if (!(out.file->f_mode & FMODE_WRITE))  		goto fput_out;  	retval = -EINVAL; -	in_inode = in_file->f_path.dentry->d_inode; -	out_inode = out_file->f_path.dentry->d_inode; -	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); +	in_inode = in.file->f_path.dentry->d_inode; +	out_inode = out.file->f_path.dentry->d_inode; +	retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);  	if (retval < 0)  		goto fput_out;  	count = retval; @@ -949,10 +927,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  	 * and the application is arguably buggy if it doesn't expect  	 * EAGAIN on a non-blocking file descriptor.  	 */ -	if (in_file->f_flags & O_NONBLOCK) +	if (in.file->f_flags & O_NONBLOCK)  		fl = SPLICE_F_NONBLOCK;  #endif -	retval = do_splice_direct(in_file, ppos, out_file, count, fl); +	retval = do_splice_direct(in.file, ppos, out.file, count, fl);  	if (retval > 0) {  		add_rchar(current, retval); @@ -965,9 +943,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  		retval = -EOVERFLOW;  fput_out: -	fput_light(out_file, fput_needed_out); +	fdput(out);  fput_in: -	fput_light(in_file, fput_needed_in); +	fdput(in);  out:  	return retval;  } diff --git a/fs/read_write.h b/fs/read_write.h index d07b954c6e0..d3e00ef6742 100644 --- a/fs/read_write.h +++ b/fs/read_write.h @@ -12,3 +12,5 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,  		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);  ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,  		unsigned long nr_segs, loff_t *ppos, io_fn_t fn); +ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, +		    loff_t max); diff --git a/fs/readdir.c b/fs/readdir.c index 39e3370d79c..5e69ef533b7 100644 --- a/fs/readdir.c 
+++ b/fs/readdir.c @@ -106,22 +106,20 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,  		struct old_linux_dirent __user *, dirent, unsigned int, count)  {  	int error; -	struct file * file; +	struct fd f = fdget(fd);  	struct readdir_callback buf; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF;  	buf.result = 0;  	buf.dirent = dirent; -	error = vfs_readdir(file, fillonedir, &buf); +	error = vfs_readdir(f.file, fillonedir, &buf);  	if (buf.result)  		error = buf.result; -	fput_light(file, fput_needed); +	fdput(f);  	return error;  } @@ -191,17 +189,16 @@ efault:  SYSCALL_DEFINE3(getdents, unsigned int, fd,  		struct linux_dirent __user *, dirent, unsigned int, count)  { -	struct file * file; +	struct fd f;  	struct linux_dirent __user * lastdirent;  	struct getdents_callback buf; -	int fput_needed;  	int error;  	if (!access_ok(VERIFY_WRITE, dirent, count))  		return -EFAULT; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	buf.current_dir = dirent; @@ -209,17 +206,17 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, filldir, &buf); +	error = vfs_readdir(f.file, filldir, &buf);  	if (error >= 0)  		error = buf.error;  	lastdirent = buf.previous;  	if (lastdirent) { -		if (put_user(file->f_pos, &lastdirent->d_off)) +		if (put_user(f.file->f_pos, &lastdirent->d_off))  			error = -EFAULT;  		else  			error = count - buf.count;  	} -	fput_light(file, fput_needed); +	fdput(f);  	return error;  } @@ -272,17 +269,16 @@ efault:  SYSCALL_DEFINE3(getdents64, unsigned int, fd,  		struct linux_dirent64 __user *, dirent, unsigned int, count)  { -	struct file * file; +	struct fd f;  	struct linux_dirent64 __user * lastdirent;  	struct getdents_callback64 buf; -	int fput_needed;  	int error;  	if (!access_ok(VERIFY_WRITE, dirent, count))  		return -EFAULT; -	file = fget_light(fd, &fput_needed); -	if (!file) 
+	f = fdget(fd); +	if (!f.file)  		return -EBADF;  	buf.current_dir = dirent; @@ -290,17 +286,17 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,  	buf.count = count;  	buf.error = 0; -	error = vfs_readdir(file, filldir64, &buf); +	error = vfs_readdir(f.file, filldir64, &buf);  	if (error >= 0)  		error = buf.error;  	lastdirent = buf.previous;  	if (lastdirent) { -		typeof(lastdirent->d_off) d_off = file->f_pos; +		typeof(lastdirent->d_off) d_off = f.file->f_pos;  		if (__put_user(d_off, &lastdirent->d_off))  			error = -EFAULT;  		else  			error = count - buf.count;  	} -	fput_light(file, fput_needed); +	fdput(f);  	return error;  } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a37dabf5a9..1078ae17999 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -608,6 +608,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(reiserfs_inode_cachep);  } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 77c5f217398..fd7c5f60b46 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -648,6 +648,11 @@ error_register:  static void __exit exit_romfs_fs(void)  {  	unregister_filesystem(&romfs_fs_type); +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(romfs_inode_cachep);  } diff --git a/fs/select.c b/fs/select.c index db14c781335..2ef72d96503 100644 --- a/fs/select.c +++ b/fs/select.c @@ -220,8 +220,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,  	struct poll_table_entry *entry = poll_get_entry(pwq);  	if (!entry)  		return; -	get_file(filp); -	entry->filp = filp; +	entry->filp = get_file(filp);  	entry->wait_address = wait_address;  	entry->key = p->_key;  	init_waitqueue_func_entry(&entry->wait, pollwake); @@ -429,8 +428,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)  		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {  			unsigned long in, out, ex, all_bits, bit = 1, mask, j;  			unsigned long res_in = 0, res_out = 0, res_ex = 0; -			const struct file_operations *f_op = NULL; -			struct file *file = NULL;  			in = *inp++; out = *outp++; ex = *exp++;  			all_bits = in | out | ex; @@ -440,20 +437,21 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)  			}  			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { -				int fput_needed; +				struct fd f;  				if (i >= n)  					break;  				if (!(bit & all_bits))  					continue; -				file = fget_light(i, &fput_needed); -				if (file) { -					f_op = file->f_op; +				f = fdget(i); +				if (f.file) { +					const struct file_operations *f_op; +					f_op = f.file->f_op;  					mask = DEFAULT_POLLMASK;  					if (f_op && f_op->poll) {  						wait_key_set(wait, in, out, bit); -						mask = (*f_op->poll)(file, wait); +						mask = (*f_op->poll)(f.file, wait);  					} -					fput_light(file, fput_needed); +					fdput(f);  					if ((mask & POLLIN_SET) && (in & bit)) {  						res_in |= bit;  						retval++; @@ -726,20 +724,17 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)  	mask = 0;  	fd = pollfd->fd;  	if (fd >= 0) { -		int fput_needed; -		struct file * file; - -		file = fget_light(fd, &fput_needed); +		struct fd f = fdget(fd);  	
	mask = POLLNVAL; -		if (file != NULL) { +		if (f.file) {  			mask = DEFAULT_POLLMASK; -			if (file->f_op && file->f_op->poll) { +			if (f.file->f_op && f.file->f_op->poll) {  				pwait->_key = pollfd->events|POLLERR|POLLHUP; -				mask = file->f_op->poll(file, pwait); +				mask = f.file->f_op->poll(f.file, pwait);  			}  			/* Mask out unneeded events. */  			mask &= pollfd->events | POLLERR | POLLHUP; -			fput_light(file, fput_needed); +			fdput(f);  		}  	}  	pollfd->revents = mask; diff --git a/fs/signalfd.c b/fs/signalfd.c index 9f35a37173d..8bee4e57091 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -269,13 +269,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,  		if (ufd < 0)  			kfree(ctx);  	} else { -		int fput_needed; -		struct file *file = fget_light(ufd, &fput_needed); -		if (!file) +		struct fd f = fdget(ufd); +		if (!f.file)  			return -EBADF; -		ctx = file->private_data; -		if (file->f_op != &signalfd_fops) { -			fput_light(file, fput_needed); +		ctx = f.file->private_data; +		if (f.file->f_op != &signalfd_fops) { +			fdput(f);  			return -EINVAL;  		}  		spin_lock_irq(&current->sighand->siglock); @@ -283,7 +282,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,  		spin_unlock_irq(&current->sighand->siglock);  		wake_up(&current->sighand->signalfd_wqh); -		fput_light(file, fput_needed); +		fdput(f);  	}  	return ufd; diff --git a/fs/splice.c b/fs/splice.c index 41514dd8946..13e5b4776e7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1666,9 +1666,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,  SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,  		unsigned long, nr_segs, unsigned int, flags)  { -	struct file *file; +	struct fd f;  	long error; -	int fput;  	if (unlikely(nr_segs > UIO_MAXIOV))  		return -EINVAL; @@ -1676,14 +1675,14 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,  		return 0;  	error = -EBADF; -	file = fget_light(fd, &fput); -	if 
(file) { -		if (file->f_mode & FMODE_WRITE) -			error = vmsplice_to_pipe(file, iov, nr_segs, flags); -		else if (file->f_mode & FMODE_READ) -			error = vmsplice_to_user(file, iov, nr_segs, flags); +	f = fdget(fd); +	if (f.file) { +		if (f.file->f_mode & FMODE_WRITE) +			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); +		else if (f.file->f_mode & FMODE_READ) +			error = vmsplice_to_user(f.file, iov, nr_segs, flags); -		fput_light(file, fput); +		fdput(f);  	}  	return error; @@ -1693,30 +1692,27 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,  		int, fd_out, loff_t __user *, off_out,  		size_t, len, unsigned int, flags)  { +	struct fd in, out;  	long error; -	struct file *in, *out; -	int fput_in, fput_out;  	if (unlikely(!len))  		return 0;  	error = -EBADF; -	in = fget_light(fd_in, &fput_in); -	if (in) { -		if (in->f_mode & FMODE_READ) { -			out = fget_light(fd_out, &fput_out); -			if (out) { -				if (out->f_mode & FMODE_WRITE) -					error = do_splice(in, off_in, -							  out, off_out, +	in = fdget(fd_in); +	if (in.file) { +		if (in.file->f_mode & FMODE_READ) { +			out = fdget(fd_out); +			if (out.file) { +				if (out.file->f_mode & FMODE_WRITE) +					error = do_splice(in.file, off_in, +							  out.file, off_out,  							  len, flags); -				fput_light(out, fput_out); +				fdput(out);  			}  		} - -		fput_light(in, fput_in); +		fdput(in);  	} -  	return error;  } @@ -2027,26 +2023,25 @@ static long do_tee(struct file *in, struct file *out, size_t len,  SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)  { -	struct file *in; -	int error, fput_in; +	struct fd in; +	int error;  	if (unlikely(!len))  		return 0;  	error = -EBADF; -	in = fget_light(fdin, &fput_in); -	if (in) { -		if (in->f_mode & FMODE_READ) { -			int fput_out; -			struct file *out = fget_light(fdout, &fput_out); - -			if (out) { -				if (out->f_mode & FMODE_WRITE) -					error = do_tee(in, out, len, flags); -				fput_light(out, fput_out); +	in = 
fdget(fdin); +	if (in.file) { +		if (in.file->f_mode & FMODE_READ) { +			struct fd out = fdget(fdout); +			if (out.file) { +				if (out.file->f_mode & FMODE_WRITE) +					error = do_tee(in.file, out.file, +							len, flags); +				fdput(out);  			}  		} - 		fput_light(in, fput_in); + 		fdput(in);   	}  	return error; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 29cd014ed3a..260e3928d4f 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -425,6 +425,11 @@ static int __init init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(squashfs_inode_cachep);  } diff --git a/fs/stat.c b/fs/stat.c index 208039eec6c..eae494630a3 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);  int vfs_fstat(unsigned int fd, struct kstat *stat)  { -	int fput_needed; -	struct file *f = fget_raw_light(fd, &fput_needed); +	struct fd f = fdget_raw(fd);  	int error = -EBADF; -	if (f) { -		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); -		fput_light(f, fput_needed); +	if (f.file) { +		error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry, +				    stat); +		fdput(f);  	}  	return error;  } diff --git a/fs/statfs.c b/fs/statfs.c index 95ad5c0e586..f8e832e6f0a 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -87,12 +87,11 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)  int fd_statfs(int fd, struct kstatfs *st)  { -	int fput_needed; -	struct file *file = fget_light(fd, &fput_needed); +	struct fd f = fdget(fd);  	int error = -EBADF; -	if (file) { -		error = vfs_statfs(&file->f_path, st); -		fput_light(file, fput_needed); +	if (f.file) { +		error = vfs_statfs(&f.file->f_path, st); +		fdput(f);  	}  	return error;  } diff --git a/fs/super.c b/fs/super.c index 0902cfa6a12..5fdf7ff32c4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -307,12 +307,6 @@ void 
deactivate_locked_super(struct super_block *s)  		/* caches are now gone, we can safely kill the shrinker now */  		unregister_shrinker(&s->s_shrink); - -		/* -		 * We need to call rcu_barrier so all the delayed rcu free -		 * inodes are flushed before we release the fs module. -		 */ -		rcu_barrier();  		put_filesystem(fs);  		put_super(s);  	} else { diff --git a/fs/sync.c b/fs/sync.c index eb8722dc556..14eefeb4463 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -148,21 +148,19 @@ void emergency_sync(void)   */  SYSCALL_DEFINE1(syncfs, int, fd)  { -	struct file *file; +	struct fd f = fdget(fd);  	struct super_block *sb;  	int ret; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF; -	sb = file->f_dentry->d_sb; +	sb = f.file->f_dentry->d_sb;  	down_read(&sb->s_umount);  	ret = sync_filesystem(sb);  	up_read(&sb->s_umount); -	fput_light(file, fput_needed); +	fdput(f);  	return ret;  } @@ -201,14 +199,12 @@ EXPORT_SYMBOL(vfs_fsync);  static int do_fsync(unsigned int fd, int datasync)  { -	struct file *file; +	struct fd f = fdget(fd);  	int ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		ret = vfs_fsync(file, datasync); -		fput_light(file, fput_needed); +	if (f.file) { +		ret = vfs_fsync(f.file, datasync); +		fdput(f);  	}  	return ret;  } @@ -291,10 +287,9 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,  				unsigned int flags)  {  	int ret; -	struct file *file; +	struct fd f;  	struct address_space *mapping;  	loff_t endbyte;			/* inclusive */ -	int fput_needed;  	umode_t i_mode;  	ret = -EINVAL; @@ -333,17 +328,17 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,  		endbyte--;		/* inclusive */  	ret = -EBADF; -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		goto out; -	i_mode = file->f_path.dentry->d_inode->i_mode; +	i_mode = f.file->f_path.dentry->d_inode->i_mode;  	ret = -ESPIPE;  	if 
(!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&  			!S_ISLNK(i_mode))  		goto out_put; -	mapping = file->f_mapping; +	mapping = f.file->f_mapping;  	if (!mapping) {  		ret = -EINVAL;  		goto out_put; @@ -366,7 +361,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,  		ret = filemap_fdatawait_range(mapping, offset, endbyte);  out_put: -	fput_light(file, fput_needed); +	fdput(f);  out:  	return ret;  } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index b23ab736685..d33e506c1ea 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -360,5 +360,10 @@ int __init sysv_init_icache(void)  void sysv_destroy_icache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(sysv_inode_cachep);  } diff --git a/fs/timerfd.c b/fs/timerfd.c index dffeb3795af..d03822bbf19 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -234,19 +234,17 @@ static const struct file_operations timerfd_fops = {  	.llseek		= noop_llseek,  }; -static struct file *timerfd_fget(int fd) +static int timerfd_fget(int fd, struct fd *p)  { -	struct file *file; - -	file = fget(fd); -	if (!file) -		return ERR_PTR(-EBADF); -	if (file->f_op != &timerfd_fops) { -		fput(file); -		return ERR_PTR(-EINVAL); +	struct fd f = fdget(fd); +	if (!f.file) +		return -EBADF; +	if (f.file->f_op != &timerfd_fops) { +		fdput(f); +		return -EINVAL;  	} - -	return file; +	*p = f; +	return 0;  }  SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) @@ -284,7 +282,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,  		const struct itimerspec __user *, utmr,  		struct itimerspec __user *, otmr)  { -	struct file *file; +	struct fd f;  	struct timerfd_ctx *ctx;  	struct itimerspec ktmr, kotmr;  	int ret; @@ -297,10 +295,10 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,  	    !timespec_valid(&ktmr.it_interval))  		return -EINVAL; -	file = timerfd_fget(ufd); -	if (IS_ERR(file)) -		return 
PTR_ERR(file); -	ctx = file->private_data; +	ret = timerfd_fget(ufd, &f); +	if (ret) +		return ret; +	ctx = f.file->private_data;  	timerfd_setup_cancel(ctx, flags); @@ -334,7 +332,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,  	ret = timerfd_setup(ctx, flags, &ktmr);  	spin_unlock_irq(&ctx->wqh.lock); -	fput(file); +	fdput(f);  	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))  		return -EFAULT; @@ -343,14 +341,13 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,  SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)  { -	struct file *file; +	struct fd f;  	struct timerfd_ctx *ctx;  	struct itimerspec kotmr; - -	file = timerfd_fget(ufd); -	if (IS_ERR(file)) -		return PTR_ERR(file); -	ctx = file->private_data; +	int ret = timerfd_fget(ufd, &f); +	if (ret) +		return ret; +	ctx = f.file->private_data;  	spin_lock_irq(&ctx->wqh.lock);  	if (ctx->expired && ctx->tintv.tv64) { @@ -362,7 +359,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)  	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));  	kotmr.it_interval = ktime_to_timespec(ctx->tintv);  	spin_unlock_irq(&ctx->wqh.lock); -	fput(file); +	fdput(f);  	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;  } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 681f3a94244..49825427a0e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2298,6 +2298,12 @@ static void __exit ubifs_exit(void)  	dbg_debugfs_exit();  	ubifs_compressors_exit();  	unregister_shrinker(&ubifs_shrinker_info); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(ubifs_inode_slab);  	unregister_filesystem(&ubifs_fs_type);  } diff --git a/fs/udf/super.c b/fs/udf/super.c index 862741dddf2..d44fb568abe 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -171,6 +171,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(udf_inode_cachep);  } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 444927e5706..f7cfecfe1ca 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1466,6 +1466,11 @@ static int init_inodecache(void)  static void destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(ufs_inode_cachep);  } diff --git a/fs/utimes.c b/fs/utimes.c index fa4dbe451e2..bb0696a4173 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -140,19 +140,18 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,  		goto out;  	if (filename == NULL && dfd != AT_FDCWD) { -		int fput_needed; -		struct file *file; +		struct fd f;  		if (flags & AT_SYMLINK_NOFOLLOW)  			goto out; -		file = fget_light(dfd, &fput_needed); +		f = fdget(dfd);  		error = -EBADF; -		if (!file) +		if (!f.file)  			goto out; -		error = utimes_common(&file->f_path, times); -		fput_light(file, fput_needed); +		error = utimes_common(&f.file->f_path, times); +		fdput(f);  	} else {  		struct path path;  		int lookup_flags = 0; diff --git a/fs/xattr.c b/fs/xattr.c index f7f7f09b0b4..ca15fbd391c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -403,22 +403,20 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,  SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,  		const void __user *,value, size_t, size, int, flags)  { -	int fput_needed; -	struct file *f; +	struct fd f = fdget(fd);  	struct dentry *dentry;  	int error = -EBADF; -	f = 
fget_light(fd, &fput_needed); -	if (!f) +	if (!f.file)  		return error; -	dentry = f->f_path.dentry; +	dentry = f.file->f_path.dentry;  	audit_inode(NULL, dentry); -	error = mnt_want_write_file(f); +	error = mnt_want_write_file(f.file);  	if (!error) {  		error = setxattr(dentry, name, value, size, flags); -		mnt_drop_write_file(f); +		mnt_drop_write_file(f.file);  	} -	fput_light(f, fput_needed); +	fdput(f);  	return error;  } @@ -502,16 +500,14 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,  SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,  		void __user *, value, size_t, size)  { -	int fput_needed; -	struct file *f; +	struct fd f = fdget(fd);  	ssize_t error = -EBADF; -	f = fget_light(fd, &fput_needed); -	if (!f) +	if (!f.file)  		return error; -	audit_inode(NULL, f->f_path.dentry); -	error = getxattr(f->f_path.dentry, name, value, size); -	fput_light(f, fput_needed); +	audit_inode(NULL, f.file->f_path.dentry); +	error = getxattr(f.file->f_path.dentry, name, value, size); +	fdput(f);  	return error;  } @@ -583,16 +579,14 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,  SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)  { -	int fput_needed; -	struct file *f; +	struct fd f = fdget(fd);  	ssize_t error = -EBADF; -	f = fget_light(fd, &fput_needed); -	if (!f) +	if (!f.file)  		return error; -	audit_inode(NULL, f->f_path.dentry); -	error = listxattr(f->f_path.dentry, list, size); -	fput_light(f, fput_needed); +	audit_inode(NULL, f.file->f_path.dentry); +	error = listxattr(f.file->f_path.dentry, list, size); +	fdput(f);  	return error;  } @@ -652,22 +646,20 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,  SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)  { -	int fput_needed; -	struct file *f; +	struct fd f = fdget(fd);  	struct dentry *dentry;  	int error = -EBADF; -	f = fget_light(fd, &fput_needed); -	if (!f) +	if (!f.file)  		return error; -	dentry = 
f->f_path.dentry; +	dentry = f.file->f_path.dentry;  	audit_inode(NULL, dentry); -	error = mnt_want_write_file(f); +	error = mnt_want_write_file(f.file);  	if (!error) {  		error = removexattr(dentry, name); -		mnt_drop_write_file(f); +		mnt_drop_write_file(f.file);  	} -	fput_light(f, fput_needed); +	fdput(f);  	return error;  } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e00de08dc8a..b9b8646e62d 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -48,44 +48,44 @@ xfs_swapext(  	xfs_swapext_t	*sxp)  {  	xfs_inode_t     *ip, *tip; -	struct file	*file, *tmp_file; +	struct fd	f, tmp;  	int		error = 0;  	/* Pull information for the target fd */ -	file = fget((int)sxp->sx_fdtarget); -	if (!file) { +	f = fdget((int)sxp->sx_fdtarget); +	if (!f.file) {  		error = XFS_ERROR(EINVAL);  		goto out;  	} -	if (!(file->f_mode & FMODE_WRITE) || -	    !(file->f_mode & FMODE_READ) || -	    (file->f_flags & O_APPEND)) { +	if (!(f.file->f_mode & FMODE_WRITE) || +	    !(f.file->f_mode & FMODE_READ) || +	    (f.file->f_flags & O_APPEND)) {  		error = XFS_ERROR(EBADF);  		goto out_put_file;  	} -	tmp_file = fget((int)sxp->sx_fdtmp); -	if (!tmp_file) { +	tmp = fdget((int)sxp->sx_fdtmp); +	if (!tmp.file) {  		error = XFS_ERROR(EINVAL);  		goto out_put_file;  	} -	if (!(tmp_file->f_mode & FMODE_WRITE) || -	    !(tmp_file->f_mode & FMODE_READ) || -	    (tmp_file->f_flags & O_APPEND)) { +	if (!(tmp.file->f_mode & FMODE_WRITE) || +	    !(tmp.file->f_mode & FMODE_READ) || +	    (tmp.file->f_flags & O_APPEND)) {  		error = XFS_ERROR(EBADF);  		goto out_put_tmp_file;  	} -	if (IS_SWAPFILE(file->f_path.dentry->d_inode) || -	    IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { +	if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || +	    IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {  		error = XFS_ERROR(EINVAL);  		goto out_put_tmp_file;  	} -	ip = XFS_I(file->f_path.dentry->d_inode); -	tip = XFS_I(tmp_file->f_path.dentry->d_inode); +	ip = 
XFS_I(f.file->f_path.dentry->d_inode); +	tip = XFS_I(tmp.file->f_path.dentry->d_inode);  	if (ip->i_mount != tip->i_mount) {  		error = XFS_ERROR(EINVAL); @@ -105,9 +105,9 @@ xfs_swapext(  	error = xfs_swap_extents(ip, tip, sxp);   out_put_tmp_file: -	fput(tmp_file); +	fdput(tmp);   out_put_file: -	fput(file); +	fdput(f);   out:  	return error;  } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e0232c3b6d..8305f2ac677 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,16 +70,16 @@ xfs_find_handle(  	int			hsize;  	xfs_handle_t		handle;  	struct inode		*inode; -	struct file		*file = NULL; +	struct fd		f;  	struct path		path;  	int			error;  	struct xfs_inode	*ip;  	if (cmd == XFS_IOC_FD_TO_HANDLE) { -		file = fget(hreq->fd); -		if (!file) +		f = fdget(hreq->fd); +		if (!f.file)  			return -EBADF; -		inode = file->f_path.dentry->d_inode; +		inode = f.file->f_path.dentry->d_inode;  	} else {  		error = user_lpath((const char __user *)hreq->path, &path);  		if (error) @@ -134,7 +134,7 @@ xfs_find_handle(   out_put:  	if (cmd == XFS_IOC_FD_TO_HANDLE) -		fput(file); +		fdput(f);  	else  		path_put(&path);  	return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 001537f92ca..e0fd2734189 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1506,6 +1506,11 @@ xfs_init_zones(void)  STATIC void  xfs_destroy_zones(void)  { +	/* +	 * Make sure all delayed rcu free are flushed before we +	 * destroy caches. 
+	 */ +	rcu_barrier();  	kmem_zone_destroy(xfs_ili_zone);  	kmem_zone_destroy(xfs_inode_zone);  	kmem_zone_destroy(xfs_efi_zone); diff --git a/include/linux/compat.h b/include/linux/compat.h index 09b28b7369d..fd4e29956d1 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -590,6 +590,9 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,  		unsigned long liovcnt, const struct compat_iovec __user *rvec,  		unsigned long riovcnt, unsigned long flags); +asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, +				    compat_off_t __user *offset, compat_size_t count); +  #else  #define is_compat_task() (0) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 158a41eed31..45052aa814c 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -30,31 +30,11 @@ struct fdtable {  	struct fdtable *next;  }; -static inline void __set_close_on_exec(int fd, struct fdtable *fdt) -{ -	__set_bit(fd, fdt->close_on_exec); -} - -static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) -{ -	__clear_bit(fd, fdt->close_on_exec); -} -  static inline bool close_on_exec(int fd, const struct fdtable *fdt)  {  	return test_bit(fd, fdt->close_on_exec);  } -static inline void __set_open_fd(int fd, struct fdtable *fdt) -{ -	__set_bit(fd, fdt->open_fds); -} - -static inline void __clear_open_fd(int fd, struct fdtable *fdt) -{ -	__clear_bit(fd, fdt->open_fds); -} -  static inline bool fd_is_open(int fd, const struct fdtable *fdt)  {  	return test_bit(fd, fdt->open_fds); @@ -93,15 +73,8 @@ struct file_operations;  struct vfsmount;  struct dentry; -extern int expand_files(struct files_struct *, int nr); -extern void free_fdtable_rcu(struct rcu_head *rcu);  extern void __init files_defer_init(void); -static inline void free_fdtable(struct fdtable *fdt) -{ -	call_rcu(&fdt->rcu, free_fdtable_rcu); -} -  static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)  {  	struct file * file = NULL; @@ 
-122,8 +95,20 @@ struct task_struct;  struct files_struct *get_files_struct(struct task_struct *);  void put_files_struct(struct files_struct *fs);  void reset_files_struct(struct files_struct *); +void daemonize_descriptors(void);  int unshare_files(struct files_struct **);  struct files_struct *dup_fd(struct files_struct *, int *); +void do_close_on_exec(struct files_struct *); +int iterate_fd(struct files_struct *, unsigned, +		int (*)(const void *, struct file *, unsigned), +		const void *); + +extern int __alloc_fd(struct files_struct *files, +		      unsigned start, unsigned end, unsigned flags); +extern void __fd_install(struct files_struct *files, +		      unsigned int fd, struct file *file); +extern int __close_fd(struct files_struct *files, +		      unsigned int fd);  extern struct kmem_cache *files_cachep; diff --git a/include/linux/file.h b/include/linux/file.h index a22408bac0d..cbacf4faf44 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -26,15 +26,44 @@ static inline void fput_light(struct file *file, int fput_needed)  		fput(file);  } +struct fd { +	struct file *file; +	int need_put; +}; + +static inline void fdput(struct fd fd) +{ +	if (fd.need_put) +		fput(fd.file); +} +  extern struct file *fget(unsigned int fd);  extern struct file *fget_light(unsigned int fd, int *fput_needed); + +static inline struct fd fdget(unsigned int fd) +{ +	int b; +	struct file *f = fget_light(fd, &b); +	return (struct fd){f,b}; +} +  extern struct file *fget_raw(unsigned int fd);  extern struct file *fget_raw_light(unsigned int fd, int *fput_needed); + +static inline struct fd fdget_raw(unsigned int fd) +{ +	int b; +	struct file *f = fget_raw_light(fd, &b); +	return (struct fd){f,b}; +} + +extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); +extern int replace_fd(unsigned fd, struct file *file, unsigned flags);  extern void set_close_on_exec(unsigned int fd, int flag); +extern bool get_close_on_exec(unsigned int fd);  extern void 
put_filp(struct file *); -extern int alloc_fd(unsigned start, unsigned flags); -extern int get_unused_fd(void); -#define get_unused_fd_flags(flags) alloc_fd(0, (flags)) +extern int get_unused_fd_flags(unsigned flags); +#define get_unused_fd() get_unused_fd_flags(0)  extern void put_unused_fd(unsigned int fd);  extern void fd_install(unsigned int fd, struct file *file); diff --git a/include/linux/fs.h b/include/linux/fs.h index aa110476a95..ca6d8c806f4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1074,7 +1074,11 @@ struct file_handle {  	unsigned char f_handle[0];  }; -#define get_file(x)	atomic_long_inc(&(x)->f_count) +static inline struct file *get_file(struct file *f) +{ +	atomic_long_inc(&f->f_count); +	return f; +}  #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)  #define file_count(x)	atomic_long_read(&(x)->f_count) @@ -1126,9 +1130,9 @@ static inline int file_check_writeable(struct file *filp)  /* Page cache limit. The filesystems should put that into their s_maxbytes      limits, otherwise bad things can happen in VM. 
*/   #if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE	(((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)  +#define MAX_LFS_FILESIZE	(((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)   #elif BITS_PER_LONG==64 -#define MAX_LFS_FILESIZE 	0x7fffffffffffffffUL +#define MAX_LFS_FILESIZE 	((loff_t)0x7fffffffffffffff)  #endif  #define FL_POSIX	1 diff --git a/include/linux/net.h b/include/linux/net.h index 99276c3dc89..6ab31cabef7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -65,6 +65,7 @@ typedef enum {  struct poll_table_struct;  struct pipe_inode_info;  struct inode; +struct file;  struct net;  #define SOCK_ASYNC_NOSPACE	0 @@ -246,7 +247,7 @@ extern int   	     sock_sendmsg(struct socket *sock, struct msghdr *msg,  				  size_t len);  extern int	     sock_recvmsg(struct socket *sock, struct msghdr *msg,  				  size_t size, int flags); -extern int 	     sock_map_fd(struct socket *sock, int flags); +extern struct file  *sock_alloc_file(struct socket *sock, int flags, const char *dname);  extern struct socket *sockfd_lookup(int fd, int *err);  extern struct socket *sock_from_file(struct file *file, int *err);  #define		     sockfd_put(sock) fput(sock->file) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c5612f0374..9d51e260bde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -405,6 +405,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}  extern void set_dumpable(struct mm_struct *mm, int value);  extern int get_dumpable(struct mm_struct *mm); +extern int __get_dumpable(unsigned long mm_flags);  /* get/set_dumpable() values */  #define SUID_DUMPABLE_DISABLED	0 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9a08acc9e64..6d255e535d0 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -944,7 +944,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,  		size_t, msg_len, unsigned int, msg_prio,  		const struct timespec __user *, u_abs_timeout)  { -	struct file *filp; +	struct fd f;  	
struct inode *inode;  	struct ext_wait_queue wait;  	struct ext_wait_queue *receiver; @@ -967,21 +967,21 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,  	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL); -	filp = fget(mqdes); -	if (unlikely(!filp)) { +	f = fdget(mqdes); +	if (unlikely(!f.file)) {  		ret = -EBADF;  		goto out;  	} -	inode = filp->f_path.dentry->d_inode; -	if (unlikely(filp->f_op != &mqueue_file_operations)) { +	inode = f.file->f_path.dentry->d_inode; +	if (unlikely(f.file->f_op != &mqueue_file_operations)) {  		ret = -EBADF;  		goto out_fput;  	}  	info = MQUEUE_I(inode); -	audit_inode(NULL, filp->f_path.dentry); +	audit_inode(NULL, f.file->f_path.dentry); -	if (unlikely(!(filp->f_mode & FMODE_WRITE))) { +	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {  		ret = -EBADF;  		goto out_fput;  	} @@ -1023,7 +1023,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,  	}  	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { -		if (filp->f_flags & O_NONBLOCK) { +		if (f.file->f_flags & O_NONBLOCK) {  			ret = -EAGAIN;  		} else {  			wait.task = current; @@ -1056,7 +1056,7 @@ out_free:  	if (ret)  		free_msg(msg_ptr);  out_fput: -	fput(filp); +	fdput(f);  out:  	return ret;  } @@ -1067,7 +1067,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  {  	ssize_t ret;  	struct msg_msg *msg_ptr; -	struct file *filp; +	struct fd f;  	struct inode *inode;  	struct mqueue_inode_info *info;  	struct ext_wait_queue wait; @@ -1084,21 +1084,21 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? 
&ts : NULL); -	filp = fget(mqdes); -	if (unlikely(!filp)) { +	f = fdget(mqdes); +	if (unlikely(!f.file)) {  		ret = -EBADF;  		goto out;  	} -	inode = filp->f_path.dentry->d_inode; -	if (unlikely(filp->f_op != &mqueue_file_operations)) { +	inode = f.file->f_path.dentry->d_inode; +	if (unlikely(f.file->f_op != &mqueue_file_operations)) {  		ret = -EBADF;  		goto out_fput;  	}  	info = MQUEUE_I(inode); -	audit_inode(NULL, filp->f_path.dentry); +	audit_inode(NULL, f.file->f_path.dentry); -	if (unlikely(!(filp->f_mode & FMODE_READ))) { +	if (unlikely(!(f.file->f_mode & FMODE_READ))) {  		ret = -EBADF;  		goto out_fput;  	} @@ -1130,7 +1130,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  	}  	if (info->attr.mq_curmsgs == 0) { -		if (filp->f_flags & O_NONBLOCK) { +		if (f.file->f_flags & O_NONBLOCK) {  			spin_unlock(&info->lock);  			ret = -EAGAIN;  		} else { @@ -1160,7 +1160,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  		free_msg(msg_ptr);  	}  out_fput: -	fput(filp); +	fdput(f);  out:  	return ret;  } @@ -1174,7 +1174,7 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,  		const struct sigevent __user *, u_notification)  {  	int ret; -	struct file *filp; +	struct fd f;  	struct sock *sock;  	struct inode *inode;  	struct sigevent notification; @@ -1220,13 +1220,13 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,  			skb_put(nc, NOTIFY_COOKIE_LEN);  			/* and attach it to the socket */  retry: -			filp = fget(notification.sigev_signo); -			if (!filp) { +			f = fdget(notification.sigev_signo); +			if (!f.file) {  				ret = -EBADF;  				goto out;  			} -			sock = netlink_getsockbyfilp(filp); -			fput(filp); +			sock = netlink_getsockbyfilp(f.file); +			fdput(f);  			if (IS_ERR(sock)) {  				ret = PTR_ERR(sock);  				sock = NULL; @@ -1245,14 +1245,14 @@ retry:  		}  	} -	filp = fget(mqdes); -	if (!filp) { +	f = fdget(mqdes); +	if (!f.file) {  		ret = -EBADF;  		goto out;  	} -	inode = filp->f_path.dentry->d_inode; -	if 
(unlikely(filp->f_op != &mqueue_file_operations)) { +	inode = f.file->f_path.dentry->d_inode; +	if (unlikely(f.file->f_op != &mqueue_file_operations)) {  		ret = -EBADF;  		goto out_fput;  	} @@ -1292,7 +1292,7 @@ retry:  	}  	spin_unlock(&info->lock);  out_fput: -	fput(filp); +	fdput(f);  out:  	if (sock) {  		netlink_detachskb(sock, nc); @@ -1308,7 +1308,7 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,  {  	int ret;  	struct mq_attr mqstat, omqstat; -	struct file *filp; +	struct fd f;  	struct inode *inode;  	struct mqueue_inode_info *info; @@ -1319,14 +1319,14 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,  			return -EINVAL;  	} -	filp = fget(mqdes); -	if (!filp) { +	f = fdget(mqdes); +	if (!f.file) {  		ret = -EBADF;  		goto out;  	} -	inode = filp->f_path.dentry->d_inode; -	if (unlikely(filp->f_op != &mqueue_file_operations)) { +	inode = f.file->f_path.dentry->d_inode; +	if (unlikely(f.file->f_op != &mqueue_file_operations)) {  		ret = -EBADF;  		goto out_fput;  	} @@ -1335,15 +1335,15 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,  	spin_lock(&info->lock);  	omqstat = info->attr; -	omqstat.mq_flags = filp->f_flags & O_NONBLOCK; +	omqstat.mq_flags = f.file->f_flags & O_NONBLOCK;  	if (u_mqstat) {  		audit_mq_getsetattr(mqdes, &mqstat); -		spin_lock(&filp->f_lock); +		spin_lock(&f.file->f_lock);  		if (mqstat.mq_flags & O_NONBLOCK) -			filp->f_flags |= O_NONBLOCK; +			f.file->f_flags |= O_NONBLOCK;  		else -			filp->f_flags &= ~O_NONBLOCK; -		spin_unlock(&filp->f_lock); +			f.file->f_flags &= ~O_NONBLOCK; +		spin_unlock(&f.file->f_lock);  		inode->i_atime = inode->i_ctime = CURRENT_TIME;  	} @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,  		ret = -EFAULT;  out_fput: -	fput(filp); +	fdput(f);  out:  	return ret;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index deec4e50eb3..f16f3c58f11 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -468,14 +468,13 @@ static inline int perf_cgroup_connect(int fd, struct 
perf_event *event,  {  	struct perf_cgroup *cgrp;  	struct cgroup_subsys_state *css; -	struct file *file; -	int ret = 0, fput_needed; +	struct fd f = fdget(fd); +	int ret = 0; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF; -	css = cgroup_css_from_dir(file, perf_subsys_id); +	css = cgroup_css_from_dir(f.file, perf_subsys_id);  	if (IS_ERR(css)) {  		ret = PTR_ERR(css);  		goto out; @@ -501,7 +500,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  		ret = -EINVAL;  	}  out: -	fput_light(file, fput_needed); +	fdput(f);  	return ret;  } @@ -3234,21 +3233,18 @@ unlock:  static const struct file_operations perf_fops; -static struct file *perf_fget_light(int fd, int *fput_needed) +static inline int perf_fget_light(int fd, struct fd *p)  { -	struct file *file; - -	file = fget_light(fd, fput_needed); -	if (!file) -		return ERR_PTR(-EBADF); +	struct fd f = fdget(fd); +	if (!f.file) +		return -EBADF; -	if (file->f_op != &perf_fops) { -		fput_light(file, *fput_needed); -		*fput_needed = 0; -		return ERR_PTR(-EBADF); +	if (f.file->f_op != &perf_fops) { +		fdput(f); +		return -EBADF;  	} - -	return file; +	*p = f; +	return 0;  }  static int perf_event_set_output(struct perf_event *event, @@ -3280,22 +3276,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  	case PERF_EVENT_IOC_SET_OUTPUT:  	{ -		struct file *output_file = NULL; -		struct perf_event *output_event = NULL; -		int fput_needed = 0;  		int ret; -  		if (arg != -1) { -			output_file = perf_fget_light(arg, &fput_needed); -			if (IS_ERR(output_file)) -				return PTR_ERR(output_file); -			output_event = output_file->private_data; +			struct perf_event *output_event; +			struct fd output; +			ret = perf_fget_light(arg, &output); +			if (ret) +				return ret; +			output_event = output.file->private_data; +			ret = perf_event_set_output(event, output_event); +			fdput(output); +		} else { +			ret = perf_event_set_output(event, 
NULL);  		} - -		ret = perf_event_set_output(event, output_event); -		if (output_event) -			fput_light(output_file, fput_needed); -  		return ret;  	} @@ -6443,12 +6436,11 @@ SYSCALL_DEFINE5(perf_event_open,  	struct perf_event_attr attr;  	struct perf_event_context *ctx;  	struct file *event_file = NULL; -	struct file *group_file = NULL; +	struct fd group = {NULL, 0};  	struct task_struct *task = NULL;  	struct pmu *pmu;  	int event_fd;  	int move_group = 0; -	int fput_needed = 0;  	int err;  	/* for future expandability... */ @@ -6478,17 +6470,15 @@ SYSCALL_DEFINE5(perf_event_open,  	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))  		return -EINVAL; -	event_fd = get_unused_fd_flags(O_RDWR); +	event_fd = get_unused_fd();  	if (event_fd < 0)  		return event_fd;  	if (group_fd != -1) { -		group_file = perf_fget_light(group_fd, &fput_needed); -		if (IS_ERR(group_file)) { -			err = PTR_ERR(group_file); +		err = perf_fget_light(group_fd, &group); +		if (err)  			goto err_fd; -		} -		group_leader = group_file->private_data; +		group_leader = group.file->private_data;  		if (flags & PERF_FLAG_FD_OUTPUT)  			output_event = group_leader;  		if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6664,7 +6654,7 @@ SYSCALL_DEFINE5(perf_event_open,  	 * of the group leader will find the pointer to itself in  	 * perf_group_detach().  	 */ -	fput_light(group_file, fput_needed); +	fdput(group);  	fd_install(event_fd, event_file);  	return event_fd; @@ -6678,7 +6668,7 @@ err_task:  	if (task)  		put_task_struct(task);  err_group_fd: -	fput_light(group_file, fput_needed); +	fdput(group);  err_fd:  	put_unused_fd(event_fd);  	return err; diff --git a/kernel/exit.c b/kernel/exit.c index 42f25952edd..346616c0092 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -457,108 +457,13 @@ void daemonize(const char *name, ...)  	
/* Become as one with the init task */  	daemonize_fs_struct(); -	exit_files(current); -	current->files = init_task.files; -	atomic_inc(&current->files->count); +	daemonize_descriptors();  	reparent_to_kthreadd();  }  EXPORT_SYMBOL(daemonize); -static void close_files(struct files_struct * files) -{ -	int i, j; -	struct fdtable *fdt; - -	j = 0; - -	/* -	 * It is safe to dereference the fd table without RCU or -	 * ->file_lock because this is the last reference to the -	 * files structure.  But use RCU to shut RCU-lockdep up. -	 */ -	rcu_read_lock(); -	fdt = files_fdtable(files); -	rcu_read_unlock(); -	for (;;) { -		unsigned long set; -		i = j * BITS_PER_LONG; -		if (i >= fdt->max_fds) -			break; -		set = fdt->open_fds[j++]; -		while (set) { -			if (set & 1) { -				struct file * file = xchg(&fdt->fd[i], NULL); -				if (file) { -					filp_close(file, files); -					cond_resched(); -				} -			} -			i++; -			set >>= 1; -		} -	} -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ -	struct files_struct *files; - -	task_lock(task); -	files = task->files; -	if (files) -		atomic_inc(&files->count); -	task_unlock(task); - -	return files; -} - -void put_files_struct(struct files_struct *files) -{ -	struct fdtable *fdt; - -	if (atomic_dec_and_test(&files->count)) { -		close_files(files); -		/* -		 * Free the fd and fdset arrays if we expanded them. -		 * If the fdtable was embedded, pass files for freeing -		 * at the end of the RCU grace period. Otherwise, -		 * you can free files immediately. 
-		 */ -		rcu_read_lock(); -		fdt = files_fdtable(files); -		if (fdt != &files->fdtab) -			kmem_cache_free(files_cachep, files); -		free_fdtable(fdt); -		rcu_read_unlock(); -	} -} - -void reset_files_struct(struct files_struct *files) -{ -	struct task_struct *tsk = current; -	struct files_struct *old; - -	old = tsk->files; -	task_lock(tsk); -	tsk->files = files; -	task_unlock(tsk); -	put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ -	struct files_struct * files = tsk->files; - -	if (files) { -		task_lock(tsk); -		tsk->files = NULL; -		task_unlock(tsk); -		put_files_struct(files); -	} -} -  #ifdef CONFIG_MM_OWNER  /*   * A task is exiting.   If it owned this mm, find a new owner for the mm. diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23ec..f9492284e5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,15 +1788,15 @@ SYSCALL_DEFINE1(umask, int, mask)  #ifdef CONFIG_CHECKPOINT_RESTORE  static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  { -	struct file *exe_file; +	struct fd exe;  	struct dentry *dentry;  	int err; -	exe_file = fget(fd); -	if (!exe_file) +	exe = fdget(fd); +	if (!exe.file)  		return -EBADF; -	dentry = exe_file->f_path.dentry; +	dentry = exe.file->f_path.dentry;  	/*  	 * Because the original mm->exe_file points to executable file, make @@ -1805,7 +1805,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  	 */  	err = -EACCES;  	if (!S_ISREG(dentry->d_inode->i_mode)	|| -	    exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) +	    exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)  		goto exit;  	err = inode_permission(dentry->d_inode, MAY_EXEC); @@ -1839,12 +1839,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  		goto exit_unlock;  	err = 0; -	set_mm_exe_file(mm, exe_file); +	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */  exit_unlock:  	up_write(&mm->mmap_sem);  exit: -	fput(exe_file); +	fdput(exe);  	return err;  } diff 
--git a/kernel/taskstats.c b/kernel/taskstats.c index 5eab1f3edfa..610f0838d55 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -424,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	struct nlattr *na;  	size_t size;  	u32 fd; -	struct file *file; -	int fput_needed; +	struct fd f;  	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];  	if (!na)  		return -EINVAL;  	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return 0;  	size = nla_total_size(sizeof(struct cgroupstats)); @@ -453,7 +452,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	stats = nla_data(na);  	memset(stats, 0, sizeof(*stats)); -	rc = cgroupstats_build(stats, file->f_dentry); +	rc = cgroupstats_build(stats, f.file->f_dentry);  	if (rc < 0) {  		nlmsg_free(rep_skb);  		goto err; @@ -462,7 +461,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	rc = send_reply(rep_skb, info);  err: -	fput_light(file, fput_needed); +	fdput(f);  	return rc;  } diff --git a/mm/fadvise.c b/mm/fadvise.c index 9b75a045dbf..a47f0f50c89 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,7 +26,7 @@   */  SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)  { -	struct file *file = fget(fd); +	struct fd f = fdget(fd);  	struct address_space *mapping;  	struct backing_dev_info *bdi;  	loff_t endbyte;			/* inclusive */ @@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)  	unsigned long nrpages;  	int ret = 0; -	if (!file) +	if (!f.file)  		return -EBADF; -	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { +	if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {  		ret = -ESPIPE;  		goto out;  	} -	mapping = file->f_mapping; +	mapping = f.file->f_mapping;  	if (!mapping || len < 0) {  		ret = -EINVAL;  		goto out; @@ -76,21 +76,21 @@ 
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)  	switch (advice) {  	case POSIX_FADV_NORMAL: -		file->f_ra.ra_pages = bdi->ra_pages; -		spin_lock(&file->f_lock); -		file->f_mode &= ~FMODE_RANDOM; -		spin_unlock(&file->f_lock); +		f.file->f_ra.ra_pages = bdi->ra_pages; +		spin_lock(&f.file->f_lock); +		f.file->f_mode &= ~FMODE_RANDOM; +		spin_unlock(&f.file->f_lock);  		break;  	case POSIX_FADV_RANDOM: -		spin_lock(&file->f_lock); -		file->f_mode |= FMODE_RANDOM; -		spin_unlock(&file->f_lock); +		spin_lock(&f.file->f_lock); +		f.file->f_mode |= FMODE_RANDOM; +		spin_unlock(&f.file->f_lock);  		break;  	case POSIX_FADV_SEQUENTIAL: -		file->f_ra.ra_pages = bdi->ra_pages * 2; -		spin_lock(&file->f_lock); -		file->f_mode &= ~FMODE_RANDOM; -		spin_unlock(&file->f_lock); +		f.file->f_ra.ra_pages = bdi->ra_pages * 2; +		spin_lock(&f.file->f_lock); +		f.file->f_mode &= ~FMODE_RANDOM; +		spin_unlock(&f.file->f_lock);  		break;  	case POSIX_FADV_WILLNEED:  		/* First and last PARTIAL page! 
*/ @@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)  		 * Ignore return value because fadvise() shall return  		 * success even if filesystem can't retrieve a hint,  		 */ -		force_page_cache_readahead(mapping, file, start_index, +		force_page_cache_readahead(mapping, f.file, start_index,  					   nrpages);  		break;  	case POSIX_FADV_NOREUSE: @@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)  		ret = -EINVAL;  	}  out: -	fput(file); +	fdput(f);  	return ret;  }  #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd43246..048659c0c03 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -195,10 +195,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,  		 */  		if (mapping_cap_account_dirty(mapping)) {  			unsigned long addr; -			struct file *file = vma->vm_file; +			struct file *file = get_file(vma->vm_file);  			flags &= MAP_NONBLOCK; -			get_file(file);  			addr = mmap_region(file, start, size,  					flags, vma->vm_flags, pgoff);  			fput(file); diff --git a/mm/mmap.c b/mm/mmap.c index ae18a48e7e4..872441e8191 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1301,8 +1301,7 @@ munmap_back:  				goto free_vma;  			correct_wcount = 1;  		} -		vma->vm_file = file; -		get_file(file); +		vma->vm_file = get_file(file);  		error = file->f_op->mmap(file, vma);  		if (error)  			goto unmap_and_free_vma; diff --git a/mm/nommu.c b/mm/nommu.c index d4b0c10872d..dee2ff89fd5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1282,10 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file,  	vma->vm_pgoff = pgoff;  	if (file) { -		region->vm_file = file; -		get_file(file); -		vma->vm_file = file; -		get_file(file); +		region->vm_file = get_file(file); +		vma->vm_file = get_file(file);  		if (vm_flags & VM_EXECUTABLE) {  			added_exe_file_vma(current->mm);  			vma->vm_mm = current->mm; diff --git a/mm/readahead.c b/mm/readahead.c index 
ea8f8fa2164..7963f239123 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,  SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)  {  	ssize_t ret; -	struct file *file; +	struct fd f;  	ret = -EBADF; -	file = fget(fd); -	if (file) { -		if (file->f_mode & FMODE_READ) { -			struct address_space *mapping = file->f_mapping; +	f = fdget(fd); +	if (f.file) { +		if (f.file->f_mode & FMODE_READ) { +			struct address_space *mapping = f.file->f_mapping;  			pgoff_t start = offset >> PAGE_CACHE_SHIFT;  			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;  			unsigned long len = end - start + 1; -			ret = do_readahead(mapping, file, start, len); +			ret = do_readahead(mapping, f.file, start, len);  		} -		fput(file); +		fdput(f);  	}  	return ret;  } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 505f0ce3f10..15656b8573f 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -793,30 +793,28 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd)  static int p9_socket_open(struct p9_client *client, struct socket *csocket)  {  	struct p9_trans_fd *p; -	int ret, fd; +	struct file *file; +	int ret;  	p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);  	if (!p)  		return -ENOMEM;  	csocket->sk->sk_allocation = GFP_NOIO; -	fd = sock_map_fd(csocket, 0); -	if (fd < 0) { +	file = sock_alloc_file(csocket, 0, NULL); +	if (IS_ERR(file)) {  		pr_err("%s (%d): failed to map fd\n",  		       __func__, task_pid_nr(current));  		sock_release(csocket);  		kfree(p); -		return fd; +		return PTR_ERR(file);  	} -	get_file(csocket->file); -	get_file(csocket->file); -	p->wr = p->rd = csocket->file; +	get_file(file); +	p->wr = p->rd = file;  	client->trans = p;  	client->status = Connected; -	sys_close(fd);	/* still racy */ -  	p->rd->f_flags |= O_NONBLOCK;  	p->conn = p9_conn_create(client); diff --git a/net/compat.c b/net/compat.c index 74ed1d7a84a..79ae8848500 100644 --- 
a/net/compat.c +++ b/net/compat.c @@ -301,8 +301,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)  			break;  		}  		/* Bump the usage count and install the file. */ -		get_file(fp[i]); -		fd_install(new_fd, fp[i]); +		fd_install(new_fd, get_file(fp[i]));  	}  	if (i > 0) { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 4a83fb3c8e8..79285a36035 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -239,38 +239,24 @@ out_free_devname:  	return ret;  } +static int update_netprio(const void *v, struct file *file, unsigned n) +{ +	int err; +	struct socket *sock = sock_from_file(file, &err); +	if (sock) +		sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; +	return 0; +} +  void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct task_struct *p; +	void *v;  	cgroup_taskset_for_each(p, cgrp, tset) { -		unsigned int fd; -		struct fdtable *fdt; -		struct files_struct *files; -  		task_lock(p); -		files = p->files; -		if (!files) { -			task_unlock(p); -			continue; -		} - -		spin_lock(&files->file_lock); -		fdt = files_fdtable(files); -		for (fd = 0; fd < fdt->max_fds; fd++) { -			struct file *file; -			struct socket *sock; -			int err; - -			file = fcheck_files(files, fd); -			if (!file) -				continue; - -			sock = sock_from_file(file, &err); -			if (sock) -				sock_update_netprioidx(sock->sk, p); -		} -		spin_unlock(&files->file_lock); +		v = (void *)(unsigned long)task_netprioidx(p); +		iterate_fd(p->files, 0, update_netprio, v);  		task_unlock(p);  	}  } diff --git a/net/core/scm.c b/net/core/scm.c index 9c1c63da3ca..ab570841a53 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -301,11 +301,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)  			break;  		}  		/* Bump the usage count and install the file. 
*/ -		get_file(fp[i]);  		sock = sock_from_file(fp[i], &err);  		if (sock)  			sock_update_netprioidx(sock->sk, current); -		fd_install(new_fd, fp[i]); +		fd_install(new_fd, get_file(fp[i]));  	}  	if (i > 0) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d37d24ff197..59d16ea927f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -70,6 +70,7 @@  #include <linux/init.h>  #include <linux/crypto.h>  #include <linux/slab.h> +#include <linux/file.h>  #include <net/ip.h>  #include <net/icmp.h> @@ -4292,6 +4293,7 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval  {  	sctp_peeloff_arg_t peeloff;  	struct socket *newsock; +	struct file *newfile;  	int retval = 0;  	if (len < sizeof(sctp_peeloff_arg_t)) @@ -4305,22 +4307,35 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval  		goto out;  	/* Map the socket to an unused fd that can be returned to the user.  */ -	retval = sock_map_fd(newsock, 0); +	retval = get_unused_fd();  	if (retval < 0) {  		sock_release(newsock);  		goto out;  	} +	newfile = sock_alloc_file(newsock, 0, NULL); +	if (unlikely(IS_ERR(newfile))) { +		put_unused_fd(retval); +		sock_release(newsock); +		return PTR_ERR(newfile); +	} +  	SCTP_DEBUG_PRINTK("%s: sk: %p newsk: %p sd: %d\n",  			  __func__, sk, newsock->sk, retval);  	/* Return the fd mapped to the new socket.  
*/ +	if (put_user(len, optlen)) { +		fput(newfile); +		put_unused_fd(retval); +		return -EFAULT; +	}  	peeloff.sd = retval; -	if (put_user(len, optlen)) +	if (copy_to_user(optval, &peeloff, len)) { +		fput(newfile); +		put_unused_fd(retval);  		return -EFAULT; -	if (copy_to_user(optval, &peeloff, len)) -		retval = -EFAULT; - +	} +	fd_install(retval, newfile);  out:  	return retval;  } diff --git a/net/socket.c b/net/socket.c index 80dc7e84b04..d92c490e66f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -347,17 +347,11 @@ static struct file_system_type sock_fs_type = {   *	but we take care of internal coherence yet.   */ -static int sock_alloc_file(struct socket *sock, struct file **f, int flags, -			   const char *dname) +struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)  {  	struct qstr name = { .name = "" };  	struct path path;  	struct file *file; -	int fd; - -	fd = get_unused_fd_flags(flags); -	if (unlikely(fd < 0)) -		return fd;  	if (dname) {  		name.name = dname; @@ -367,10 +361,8 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags,  		name.len = strlen(name.name);  	}  	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); -	if (unlikely(!path.dentry)) { -		put_unused_fd(fd); -		return -ENOMEM; -	} +	if (unlikely(!path.dentry)) +		return ERR_PTR(-ENOMEM);  	path.mnt = mntget(sock_mnt);  	d_instantiate(path.dentry, SOCK_INODE(sock)); @@ -382,30 +374,33 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags,  		/* drop dentry, keep inode */  		ihold(path.dentry->d_inode);  		path_put(&path); -		put_unused_fd(fd); -		return -ENFILE; +		return ERR_PTR(-ENFILE);  	}  	sock->file = file;  	file->f_flags = O_RDWR | (flags & O_NONBLOCK);  	file->f_pos = 0;  	file->private_data = sock; - -	*f = file; -	return fd; +	return file;  } +EXPORT_SYMBOL(sock_alloc_file); -int sock_map_fd(struct socket *sock, int flags) +static int sock_map_fd(struct socket *sock, int flags)  {  	struct file 
*newfile; -	int fd = sock_alloc_file(sock, &newfile, flags, NULL); +	int fd = get_unused_fd_flags(flags); +	if (unlikely(fd < 0)) +		return fd; -	if (likely(fd >= 0)) +	newfile = sock_alloc_file(sock, flags, NULL); +	if (likely(!IS_ERR(newfile))) {  		fd_install(fd, newfile); +		return fd; +	} -	return fd; +	put_unused_fd(fd); +	return PTR_ERR(newfile);  } -EXPORT_SYMBOL(sock_map_fd);  struct socket *sock_from_file(struct file *file, int *err)  { @@ -1466,17 +1461,32 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,  	if (err < 0)  		goto out_release_both; -	fd1 = sock_alloc_file(sock1, &newfile1, flags, NULL); +	fd1 = get_unused_fd_flags(flags);  	if (unlikely(fd1 < 0)) {  		err = fd1;  		goto out_release_both;  	} - -	fd2 = sock_alloc_file(sock2, &newfile2, flags, NULL); +	fd2 = get_unused_fd_flags(flags);  	if (unlikely(fd2 < 0)) {  		err = fd2; +		put_unused_fd(fd1); +		goto out_release_both; +	} + +	newfile1 = sock_alloc_file(sock1, flags, NULL); +	if (unlikely(IS_ERR(newfile1))) { +		err = PTR_ERR(newfile1); +		put_unused_fd(fd1); +		put_unused_fd(fd2); +		goto out_release_both; +	} + +	newfile2 = sock_alloc_file(sock2, flags, NULL); +	if (IS_ERR(newfile2)) { +		err = PTR_ERR(newfile2);  		fput(newfile1);  		put_unused_fd(fd1); +		put_unused_fd(fd2);  		sock_release(sock2);  		goto out;  	} @@ -1608,13 +1618,19 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,  	 */  	__module_get(newsock->ops->owner); -	newfd = sock_alloc_file(newsock, &newfile, flags, -				sock->sk->sk_prot_creator->name); +	newfd = get_unused_fd_flags(flags);  	if (unlikely(newfd < 0)) {  		err = newfd;  		sock_release(newsock);  		goto out_put;  	} +	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); +	if (unlikely(IS_ERR(newfile))) { +		err = PTR_ERR(newfile); +		put_unused_fd(newfd); +		sock_release(newsock); +		goto out_put; +	}  	err = security_socket_accept(sock, newsock);  	if (err) diff --git 
a/security/selinux/hooks.c b/security/selinux/hooks.c index 6c77f63c759..651d8456611 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2088,15 +2088,19 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm)  	return (atsecure || cap_bprm_secureexec(bprm));  } +static int match_file(const void *p, struct file *file, unsigned fd) +{ +	return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0; +} +  /* Derived from fs/exec.c:flush_old_files. */  static inline void flush_unauthorized_files(const struct cred *cred,  					    struct files_struct *files)  {  	struct file *file, *devnull = NULL;  	struct tty_struct *tty; -	struct fdtable *fdt; -	long j = -1;  	int drop_tty = 0; +	unsigned n;  	tty = get_current_tty();  	if (tty) { @@ -2123,58 +2127,23 @@ static inline void flush_unauthorized_files(const struct cred *cred,  		no_tty();  	/* Revalidate access to inherited open files. */ -	spin_lock(&files->file_lock); -	for (;;) { -		unsigned long set, i; -		int fd; - -		j++; -		i = j * BITS_PER_LONG; -		fdt = files_fdtable(files); -		if (i >= fdt->max_fds) -			break; -		set = fdt->open_fds[j]; -		if (!set) -			continue; -		spin_unlock(&files->file_lock); -		for ( ; set ; i++, set >>= 1) { -			if (set & 1) { -				file = fget(i); -				if (!file) -					continue; -				if (file_has_perm(cred, -						  file, -						  file_to_av(file))) { -					sys_close(i); -					fd = get_unused_fd(); -					if (fd != i) { -						if (fd >= 0) -							put_unused_fd(fd); -						fput(file); -						continue; -					} -					if (devnull) { -						get_file(devnull); -					} else { -						devnull = dentry_open( -							&selinux_null, -							O_RDWR, cred); -						if (IS_ERR(devnull)) { -							devnull = NULL; -							put_unused_fd(fd); -							fput(file); -							continue; -						} -					} -					fd_install(fd, devnull); -				} -				fput(file); -			} -		} -		spin_lock(&files->file_lock); +	n = iterate_fd(files, 0, match_file, cred); +	if (!n) /* none found? 
*/ +		return; +	devnull = dentry_open(&selinux_null, O_RDWR, cred); +	if (!IS_ERR(devnull)) { +		/* replace all the matching ones with this */ +		do { +			replace_fd(n - 1, get_file(devnull), 0); +		} while ((n = iterate_fd(files, n, match_file, cred)) != 0); +		fput(devnull); +	} else { +		/* just close all the matching ones */ +		do { +			replace_fd(n - 1, NULL, 0); +		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);  	} -	spin_unlock(&files->file_lock);  }  /* diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 53b5ada8f7c..20554eff5a2 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1563,25 +1563,25 @@ static int snd_pcm_drop(struct snd_pcm_substream *substream)  /* WARNING: Don't forget to fput back the file */ -static struct file *snd_pcm_file_fd(int fd) +static struct file *snd_pcm_file_fd(int fd, int *fput_needed)  {  	struct file *file;  	struct inode *inode;  	unsigned int minor; -	file = fget(fd); +	file = fget_light(fd, fput_needed);  	if (!file)  		return NULL;  	inode = file->f_path.dentry->d_inode;  	if (!S_ISCHR(inode->i_mode) ||  	    imajor(inode) != snd_major) { -		fput(file); +		fput_light(file, *fput_needed);  		return NULL;  	}  	minor = iminor(inode);  	if (!snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_PLAYBACK) &&  	    !snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_CAPTURE)) { -		fput(file); +		fput_light(file, *fput_needed);  		return NULL;  	}  	return file; @@ -1597,8 +1597,9 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)  	struct snd_pcm_file *pcm_file;  	struct snd_pcm_substream *substream1;  	struct snd_pcm_group *group; +	int fput_needed; -	file = snd_pcm_file_fd(fd); +	file = snd_pcm_file_fd(fd, &fput_needed);  	if (!file)  		return -EBADFD;  	pcm_file = file->private_data; @@ -1633,7 +1634,7 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)  	write_unlock_irq(&snd_pcm_link_rwlock);  	up_write(&snd_pcm_link_rwsem);   _nolock: 
-	fput(file); +	fput_light(file, fput_needed);  	if (res < 0)  		kfree(group);  	return res; |