diff options
84 files changed, 1326 insertions, 639 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7f647e17830..0f103e39b4f 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -138,8 +138,8 @@ evict_inode:  put_super:		write  write_super:		read  sync_fs:		read -freeze_fs:		read -unfreeze_fs:		read +freeze_fs:		write +unfreeze_fs:		write  statfs:			maybe(read)	(see below)  remount_fs:		write  umount_begin:		no diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 8c235b6e424..88152f214f4 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs:  - nr_open  - overflowuid  - overflowgid +- protected_hardlinks +- protected_symlinks  - suid_dumpable  - super-max  - super-nr @@ -157,6 +159,46 @@ The default is 65534.  ============================================================== +protected_hardlinks: + +A long-standing class of security issues is the hardlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given hardlink (i.e. a +root process follows a hardlink created by another user). Additionally, +on systems without separated partitions, this stops unauthorized users +from "pinning" vulnerable setuid/setgid files against being upgraded by +the administrator, or linking to special files. + +When set to "0", hardlink creation behavior is unrestricted. + +When set to "1" hardlinks cannot be created by users if they do not +already own the source file, or do not have read/write access to it. + +This protection is based on the restrictions in Openwall and grsecurity. 
+ +============================================================== + +protected_symlinks: + +A long-standing class of security issues is the symlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given symlink (i.e. a +root process follows a symlink belonging to another user). For a likely +incomplete list of hundreds of examples across the years, please see: +http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp + +When set to "0", symlink following behavior is unrestricted. + +When set to "1" symlinks are permitted to be followed only when outside +a sticky world-writable directory, or when the uid of the symlink and +follower match, or when the directory owner matches the symlink's owner. + +This protection is based on the restrictions in Openwall and grsecurity. + +============================================================== +  suid_dumpable:  This value can be used to query and set the core dump mode for setuid diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index d544d7816df..dba1ce235da 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -186,10 +186,13 @@ static void spufs_prune_dir(struct dentry *dir)  static int spufs_rmdir(struct inode *parent, struct dentry *dir)  {  	/* remove all entries */ +	int res;  	spufs_prune_dir(dir);  	d_drop(dir); - -	return simple_rmdir(parent, dir); +	res = simple_rmdir(parent, dir); +	/* We have to give up the mm_struct */ +	spu_forget(SPUFS_I(dir->d_inode)->i_ctx); +	return res;  }  static int spufs_fill_dir(struct dentry *dir, @@ -245,9 +248,6 @@ static int spufs_dir_close(struct inode *inode, struct file *file)  	mutex_unlock(&parent->i_mutex);  	WARN_ON(ret); -	/* We have to give up the mm_struct */ -	spu_forget(ctx); -  	return dcache_dir_close(inode, file);  } @@ 
-450,28 +450,24 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,  	struct spu_context *neighbor;  	struct path path = {.mnt = mnt, .dentry = dentry}; -	ret = -EPERM;  	if ((flags & SPU_CREATE_NOSCHED) &&  	    !capable(CAP_SYS_NICE)) -		goto out_unlock; +		return -EPERM; -	ret = -EINVAL;  	if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE))  	    == SPU_CREATE_ISOLATE) -		goto out_unlock; +		return -EINVAL; -	ret = -ENODEV;  	if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader) -		goto out_unlock; +		return -ENODEV;  	gang = NULL;  	neighbor = NULL;  	affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);  	if (affinity) {  		gang = SPUFS_I(inode)->i_gang; -		ret = -EINVAL;  		if (!gang) -			goto out_unlock; +			return -EINVAL;  		mutex_lock(&gang->aff_mutex);  		neighbor = spufs_assert_affinity(flags, gang, aff_filp);  		if (IS_ERR(neighbor)) { @@ -492,22 +488,12 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,  	}  	ret = spufs_context_open(&path); -	if (ret < 0) { +	if (ret < 0)  		WARN_ON(spufs_rmdir(inode, dentry)); -		if (affinity) -			mutex_unlock(&gang->aff_mutex); -		mutex_unlock(&inode->i_mutex); -		spu_forget(SPUFS_I(dentry->d_inode)->i_ctx); -		goto out; -	}  out_aff_unlock:  	if (affinity)  		mutex_unlock(&gang->aff_mutex); -out_unlock: -	mutex_unlock(&inode->i_mutex); -out: -	dput(dentry);  	return ret;  } @@ -580,18 +566,13 @@ static int spufs_create_gang(struct inode *inode,  	int ret;  	ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO); -	if (ret) -		goto out; - -	ret = spufs_gang_open(&path); -	if (ret < 0) { -		int err = simple_rmdir(inode, dentry); -		WARN_ON(err); +	if (!ret) { +		ret = spufs_gang_open(&path); +		if (ret < 0) { +			int err = simple_rmdir(inode, dentry); +			WARN_ON(err); +		}  	} - -out: -	mutex_unlock(&inode->i_mutex); -	dput(dentry);  	return ret;  } @@ -601,40 +582,32 @@ static struct file_system_type spufs_type;  long spufs_create(struct path *path, struct dentry 
*dentry,  		unsigned int flags, umode_t mode, struct file *filp)  { +	struct inode *dir = path->dentry->d_inode;  	int ret; -	ret = -EINVAL;  	/* check if we are on spufs */  	if (path->dentry->d_sb->s_type != &spufs_type) -		goto out; +		return -EINVAL;  	/* don't accept undefined flags */  	if (flags & (~SPU_CREATE_FLAG_ALL)) -		goto out; +		return -EINVAL;  	/* only threads can be underneath a gang */ -	if (path->dentry != path->dentry->d_sb->s_root) { -		if ((flags & SPU_CREATE_GANG) || -		    !SPUFS_I(path->dentry->d_inode)->i_gang) -			goto out; -	} +	if (path->dentry != path->dentry->d_sb->s_root) +		if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang) +			return -EINVAL;  	mode &= ~current_umask();  	if (flags & SPU_CREATE_GANG) -		ret = spufs_create_gang(path->dentry->d_inode, -					 dentry, path->mnt, mode); +		ret = spufs_create_gang(dir, dentry, path->mnt, mode);  	else -		ret = spufs_create_context(path->dentry->d_inode, -					    dentry, path->mnt, flags, mode, +		ret = spufs_create_context(dir, dentry, path->mnt, flags, mode,  					    filp);  	if (ret >= 0) -		fsnotify_mkdir(path->dentry->d_inode, dentry); -	return ret; +		fsnotify_mkdir(dir, dentry); -out: -	mutex_unlock(&path->dentry->d_inode->i_mutex); -	dput(dentry);  	return ret;  } diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 5665dcc382c..5b7d8ffbf89 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,  	ret = PTR_ERR(dentry);  	if (!IS_ERR(dentry)) {  		ret = spufs_create(&path, dentry, flags, mode, neighbor); -		path_put(&path); +		done_path_create(&path, dentry);  	}  	return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index d91a3a0b232..deb4a456cf8 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -156,9 +156,7 @@ static int 
dev_mkdir(const char *name, umode_t mode)  	if (!err)  		/* mark as kernel-created inode */  		dentry->d_inode->i_private = &thread; -	dput(dentry); -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); +	done_path_create(&path, dentry);  	return err;  } @@ -218,10 +216,7 @@ static int handle_create(const char *nodename, umode_t mode, struct device *dev)  		/* mark as kernel-created inode */  		dentry->d_inode->i_private = &thread;  	} -	dput(dentry); - -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); +	done_path_create(&path, dentry);  	return err;  } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c index 57bf1d7ee80..9ab24528f9b 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c @@ -1188,7 +1188,7 @@ exit:  	kfree(buf);  	/* close file before return */  	if (fp) -		filp_close(fp, current->files); +		filp_close(fp, NULL);  	/* restore previous address limit */  	set_fs(old_fs); diff --git a/drivers/staging/bcm/Misc.c b/drivers/staging/bcm/Misc.c index 9a60d4cd218..f545716c666 100644 --- a/drivers/staging/bcm/Misc.c +++ b/drivers/staging/bcm/Misc.c @@ -157,12 +157,7 @@ static int create_worker_threads(struct bcm_mini_adapter *psAdapter)  static struct file *open_firmware_file(struct bcm_mini_adapter *Adapter, const char *path)  { -	struct file *flp = NULL; -	mm_segment_t oldfs; -	oldfs = get_fs(); -	set_fs(get_ds()); -	flp = filp_open(path, O_RDONLY, S_IRWXU); -	set_fs(oldfs); +	struct file *flp = filp_open(path, O_RDONLY, S_IRWXU);  	if (IS_ERR(flp)) {  		pr_err(DRV_NAME "Unable To Open File %s, err %ld", path, PTR_ERR(flp));  		flp = NULL; @@ -183,14 +178,12 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u  {  	int errorno = 0;  	struct file *flp = NULL; -	mm_segment_t oldfs;  	struct timeval tv = {0};  	flp = open_firmware_file(Adapter, path);  	if (!flp) { 
-		errorno = -ENOENT;  		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Unable to Open %s\n", path); -		goto exit_download; +		return -ENOENT;  	}  	BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Opened file is = %s and length =0x%lx to be downloaded at =0x%x", path, (unsigned long)flp->f_dentry->d_inode->i_size, loc);  	do_gettimeofday(&tv); @@ -201,10 +194,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u  		errorno = -EIO;  		goto exit_download;  	} -	oldfs = get_fs(); -	set_fs(get_ds());  	vfs_llseek(flp, 0, 0); -	set_fs(oldfs);  	if (Adapter->bcm_file_readback_from_chip(Adapter->pvInterfaceAdapter, flp, loc)) {  		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Failed to read back firmware!");  		errorno = -EIO; @@ -212,12 +202,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u  	}  exit_download: -	oldfs = get_fs(); -	set_fs(get_ds()); -	if (flp && !(IS_ERR(flp))) -		filp_close(flp, current->files); -	set_fs(oldfs); - +	filp_close(flp, NULL);  	return errorno;  } @@ -1056,10 +1041,8 @@ OUT:  static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter)  {  	struct file *flp = NULL; -	mm_segment_t oldfs = {0};  	char *buff;  	int len = 0; -	loff_t pos = 0;  	buff = kmalloc(BUFFER_1K, GFP_KERNEL);  	if (!buff) @@ -1079,20 +1062,16 @@ static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter)  		Adapter->pstargetparams = NULL;  		return -ENOENT;  	} -	oldfs = get_fs(); -	set_fs(get_ds()); -	len = vfs_read(flp, (void __user __force *)buff, BUFFER_1K, &pos); -	set_fs(oldfs); +	len = kernel_read(flp, 0, buff, BUFFER_1K); +	filp_close(flp, NULL);  	if (len != sizeof(STARGETPARAMS)) {  		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Mismatch in Target Param Structure!\n");  		kfree(buff);  		kfree(Adapter->pstargetparams);  		Adapter->pstargetparams = NULL; -		filp_close(flp, current->files);  		return 
-ENOENT;  	} -	filp_close(flp, current->files);  	/* Check for autolink in config params */  	/* diff --git a/drivers/staging/gdm72xx/sdio_boot.c b/drivers/staging/gdm72xx/sdio_boot.c index 760efee23d4..65624bca8b3 100644 --- a/drivers/staging/gdm72xx/sdio_boot.c +++ b/drivers/staging/gdm72xx/sdio_boot.c @@ -66,9 +66,8 @@ static int download_image(struct sdio_func *func, char *img_name)  		return -ENOENT;  	} -	if (filp->f_dentry) -		inode = filp->f_dentry->d_inode; -	if (!inode || !S_ISREG(inode->i_mode)) { +	inode = filp->f_dentry->d_inode; +	if (!S_ISREG(inode->i_mode)) {  		printk(KERN_ERR "Invalid file type: %s\n", img_name);  		ret = -EINVAL;  		goto out; @@ -123,7 +122,7 @@ static int download_image(struct sdio_func *func, char *img_name)  		pno++;  	}  out: -	filp_close(filp, current->files); +	filp_close(filp, NULL);  	return ret;  } diff --git a/drivers/staging/gdm72xx/usb_boot.c b/drivers/staging/gdm72xx/usb_boot.c index fef290c38db..e3dbd5a552c 100644 --- a/drivers/staging/gdm72xx/usb_boot.c +++ b/drivers/staging/gdm72xx/usb_boot.c @@ -173,14 +173,12 @@ int usb_boot(struct usb_device *usbdev, u16 pid)  	filp = filp_open(img_name, O_RDONLY | O_LARGEFILE, 0);  	if (IS_ERR(filp)) {  		printk(KERN_ERR "Can't find %s.\n", img_name); -		set_fs(fs);  		ret = PTR_ERR(filp);  		goto restore_fs;  	} -	if (filp->f_dentry) -		inode = filp->f_dentry->d_inode; -	if (!inode || !S_ISREG(inode->i_mode)) { +	inode = filp->f_dentry->d_inode; +	if (!S_ISREG(inode->i_mode)) {  		printk(KERN_ERR "Invalid file type: %s\n", img_name);  		ret = -EINVAL;  		goto out; @@ -262,7 +260,7 @@ int usb_boot(struct usb_device *usbdev, u16 pid)  		ret = -EINVAL;  	}  out: -	filp_close(filp, current->files); +	filp_close(filp, NULL);  restore_fs:  	set_fs(fs); @@ -322,13 +320,11 @@ static int em_download_image(struct usb_device *usbdev, char *path,  		goto restore_fs;  	} -	if (filp->f_dentry) { -		inode = filp->f_dentry->d_inode; -		if (!inode || !S_ISREG(inode->i_mode)) { -			
printk(KERN_ERR "Invalid file type: %s\n", path); -			ret = -EINVAL; -			goto out; -		} +	inode = filp->f_dentry->d_inode; +	if (!S_ISREG(inode->i_mode)) { +		printk(KERN_ERR "Invalid file type: %s\n", path); +		ret = -EINVAL; +		goto out;  	}  	buf = kmalloc(DOWNLOAD_CHUCK + pad_size, GFP_KERNEL); @@ -364,7 +360,7 @@ static int em_download_image(struct usb_device *usbdev, char *path,  		goto out;  out: -	filp_close(filp, current->files); +	filp_close(filp, NULL);  restore_fs:  	set_fs(fs); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 9e2100551c7..cbb5aaf3e56 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -109,46 +109,29 @@ static struct se_device *fd_create_virtdevice(  	struct se_subsystem_dev *se_dev,  	void *p)  { -	char *dev_p = NULL;  	struct se_device *dev;  	struct se_dev_limits dev_limits;  	struct queue_limits *limits;  	struct fd_dev *fd_dev = p;  	struct fd_host *fd_host = hba->hba_ptr; -	mm_segment_t old_fs;  	struct file *file;  	struct inode *inode = NULL;  	int dev_flags = 0, flags, ret = -EINVAL;  	memset(&dev_limits, 0, sizeof(struct se_dev_limits)); -	old_fs = get_fs(); -	set_fs(get_ds()); -	dev_p = getname(fd_dev->fd_dev_name); -	set_fs(old_fs); - -	if (IS_ERR(dev_p)) { -		pr_err("getname(%s) failed: %lu\n", -			fd_dev->fd_dev_name, IS_ERR(dev_p)); -		ret = PTR_ERR(dev_p); -		goto fail; -	}  	/*  	 * Use O_DSYNC by default instead of O_SYNC to forgo syncing  	 * of pure timestamp updates.  	 
*/  	flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; -	file = filp_open(dev_p, flags, 0600); +	file = filp_open(fd_dev->fd_dev_name, flags, 0600);  	if (IS_ERR(file)) { -		pr_err("filp_open(%s) failed\n", dev_p); +		pr_err("filp_open(%s) failed\n", fd_dev->fd_dev_name);  		ret = PTR_ERR(file);  		goto fail;  	} -	if (!file || !file->f_dentry) { -		pr_err("filp_open(%s) failed\n", dev_p); -		goto fail; -	}  	fd_dev->fd_file = file;  	/*  	 * If using a block backend with this struct file, we extract @@ -212,14 +195,12 @@ static struct se_device *fd_create_virtdevice(  		" %llu total bytes\n", fd_host->fd_host_id, fd_dev->fd_dev_id,  			fd_dev->fd_dev_name, fd_dev->fd_dev_size); -	putname(dev_p);  	return dev;  fail:  	if (fd_dev->fd_file) {  		filp_close(fd_dev->fd_file, NULL);  		fd_dev->fd_file = NULL;  	} -	putname(dev_p);  	return ERR_PTR(ret);  } @@ -452,14 +433,11 @@ static ssize_t fd_set_configfs_dev_params(  		token = match_token(ptr, tokens, args);  		switch (token) {  		case Opt_fd_dev_name: -			arg_p = match_strdup(&args[0]); -			if (!arg_p) { -				ret = -ENOMEM; +			if (match_strlcpy(fd_dev->fd_dev_name, &args[0], +				FD_MAX_DEV_NAME) == 0) { +				ret = -EINVAL;  				break;  			} -			snprintf(fd_dev->fd_dev_name, FD_MAX_DEV_NAME, -					"%s", arg_p); -			kfree(arg_p);  			pr_debug("FILEIO: Referencing Path: %s\n",  					fd_dev->fd_dev_name);  			fd_dev->fbd_flags |= FBDF_HAS_PATH; diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c index ae8b18869b8..8d9bcd8207c 100644 --- a/drivers/usb/gadget/storage_common.c +++ b/drivers/usb/gadget/storage_common.c @@ -656,9 +656,8 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)  	if (!(filp->f_mode & FMODE_WRITE))  		ro = 1; -	if (filp->f_path.dentry) -		inode = filp->f_path.dentry->d_inode; -	if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { +	inode = filp->f_path.dentry->d_inode; +	if ((!S_ISREG(inode->i_mode) && 
!S_ISBLK(inode->i_mode))) {  		LINFO(curlun, "invalid file type: %s\n", filename);  		goto out;  	} @@ -667,7 +666,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)  	 * If we can't read the file, it's no good.  	 * If we can't write the file, use it read-only.  	 */ -	if (!filp->f_op || !(filp->f_op->read || filp->f_op->aio_read)) { +	if (!(filp->f_op->read || filp->f_op->aio_read)) {  		LINFO(curlun, "file not readable: %s\n", filename);  		goto out;  	} @@ -712,7 +711,6 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)  	if (fsg_lun_is_open(curlun))  		fsg_lun_close(curlun); -	get_file(filp);  	curlun->blksize = blksize;  	curlun->blkbits = blkbits;  	curlun->ro = ro; @@ -720,10 +718,10 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)  	curlun->file_length = size;  	curlun->num_sectors = num_sectors;  	LDBG(curlun, "open backing file: %s\n", filename); -	rc = 0; +	return 0;  out: -	filp_close(filp, current->files); +	fput(filp);  	return rc;  } diff --git a/drivers/usb/gadget/u_uac1.c b/drivers/usb/gadget/u_uac1.c index af989898205..e0c5e88e03e 100644 --- a/drivers/usb/gadget/u_uac1.c +++ b/drivers/usb/gadget/u_uac1.c @@ -275,17 +275,17 @@ static int gaudio_close_snd_dev(struct gaudio *gau)  	/* Close control device */  	snd = &gau->control;  	if (snd->filp) -		filp_close(snd->filp, current->files); +		filp_close(snd->filp, NULL);  	/* Close PCM playback device and setup substream */  	snd = &gau->playback;  	if (snd->filp) -		filp_close(snd->filp, current->files); +		filp_close(snd->filp, NULL);  	/* Close PCM capture device and setup substream */  	snd = &gau->capture;  	if (snd->filp) -		filp_close(snd->filp, current->files); +		filp_close(snd->filp, NULL);  	return 0;  } diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 1ddeb11659d..64cda560c48 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -104,6 +104,8 @@ static int 
fb_deferred_io_mkwrite(struct vm_area_struct *vma,  	deferred framebuffer IO. then if userspace touches a page  	again, we repeat the same scheme */ +	file_update_time(vma->vm_file); +  	/* protect against the workqueue changing the page list */  	mutex_lock(&fbdefio->lock); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fc06fd27065..dd6f7ee1e31 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",  		 page, (unsigned long)filp->private_data); +	/* Update file times before taking page lock */ +	file_update_time(filp); +  	v9inode = V9FS_I(inode);  	/* make sure the cache has finished storing the page */  	v9fs_fscache_wait_on_page_write(inode, page); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fadeba6a5db..62e0cafd6e2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1614,8 +1614,6 @@ static int cleaner_kthread(void *arg)  	struct btrfs_root *root = arg;  	do { -		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); -  		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&  		    mutex_trylock(&root->fs_info->cleaner_mutex)) {  			btrfs_run_delayed_iputs(root); @@ -1647,7 +1645,6 @@ static int transaction_kthread(void *arg)  	do {  		cannot_commit = false;  		delay = HZ * 30; -		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);  		mutex_lock(&root->fs_info->transaction_kthread_mutex);  		spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138..5caf285c6e4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  	ssize_t err = 0;  	size_t count, ocount; -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); +	sb_start_write(inode->i_sb);  	mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  			num_written = err;  	}  out: +	
sb_end_write(inode->i_sb);  	current->backing_dev_info = NULL;  	return num_written ? num_written : err;  } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48bdfd2591c..83baec24946 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6629,6 +6629,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	u64 page_start;  	u64 page_end; +	sb_start_pagefault(inode->i_sb);  	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);  	if (!ret) {  		ret = file_update_time(vma->vm_file); @@ -6718,12 +6719,15 @@ again:  	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);  out_unlock: -	if (!ret) +	if (!ret) { +		sb_end_pagefault(inode->i_sb);  		return VM_FAULT_LOCKED; +	}  	unlock_page(page);  out:  	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);  out_noreserve: +	sb_end_pagefault(inode->i_sb);  	return ret;  } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 43f0012016e..bc2f6ffff3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  	if (!inode_owner_or_capable(inode))  		return -EACCES; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; +  	mutex_lock(&inode->i_mutex);  	ip_oldflags = ip->flags; @@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  		}  	} -	ret = mnt_want_write_file(file); -	if (ret) -		goto out_unlock; -  	if (flags & FS_SYNC_FL)  		ip->flags |= BTRFS_INODE_SYNC;  	else @@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  		inode->i_flags = i_oldflags;  	} -	mnt_drop_write_file(file);   out_unlock:  	mutex_unlock(&inode->i_mutex); +	mnt_drop_write_file(file);  	return ret;  } @@ -664,6 +664,10 @@ static noinline int btrfs_mksubvol(struct path *parent,  	struct dentry *dentry;  	int error; +	error = mnt_want_write(parent->mnt); +	if (error) +		return error; +  	mutex_lock_nested(&dir->i_mutex, 
I_MUTEX_PARENT);  	dentry = lookup_one_len(name, parent->dentry, namelen); @@ -699,6 +703,7 @@ out_dput:  	dput(dentry);  out_unlock:  	mutex_unlock(&dir->i_mutex); +	mnt_drop_write(parent->mnt);  	return error;  } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7ac7cdcc294..17be3dedacb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -335,6 +335,8 @@ again:  	if (!h)  		return ERR_PTR(-ENOMEM); +	sb_start_intwrite(root->fs_info->sb); +  	if (may_wait_transaction(root, type))  		wait_current_trans(root); @@ -345,6 +347,7 @@ again:  	} while (ret == -EBUSY);  	if (ret < 0) { +		sb_end_intwrite(root->fs_info->sb);  		kmem_cache_free(btrfs_trans_handle_cachep, h);  		return ERR_PTR(ret);  	} @@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; +	sb_end_intwrite(root->fs_info->sb); +  	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&  	    should_end_transaction(trans, root)) {  		trans->transaction->blocked = 1; @@ -1578,6 +1583,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	put_transaction(cur_trans);  	put_transaction(cur_trans); +	sb_end_intwrite(root->fs_info->sb); +  	trace_btrfs_transaction_commit(root);  	btrfs_scrub_continue(root); diff --git a/fs/buffer.c b/fs/buffer.c index c7062c896d7..9f6d2e41281 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2306,8 +2306,8 @@ EXPORT_SYMBOL(block_commit_write);   * beyond EOF, then the page is guaranteed safe against truncation until we   * unlock the page.   * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. + * Direct callers of this function should protect against filesystem freezing + * using sb_start_write() - sb_end_write() functions.   
*/  int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  			 get_block_t get_block) @@ -2318,6 +2318,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  	loff_t size;  	int ret; +	/* +	 * Update file times before taking page lock. We may end up failing the +	 * fault so this update may be superfluous but who really cares... +	 */ +	file_update_time(vma->vm_file); +  	lock_page(page);  	size = i_size_read(inode);  	if ((page->mapping != inode->i_mapping) || @@ -2339,18 +2345,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  	if (unlikely(ret < 0))  		goto out_unlock; -	/* -	 * Freezing in progress? We check after the page is marked dirty and -	 * with page lock held so if the test here fails, we are sure freezing -	 * code will wait during syncing until the page fault is done - at that -	 * point page will be dirty and unlocked so freezing code will write it -	 * and writeprotect it again. -	 */  	set_page_dirty(page); -	if (inode->i_sb->s_frozen != SB_UNFROZEN) { -		ret = -EAGAIN; -		goto out_unlock; -	}  	wait_on_page_writeback(page);  	return 0;  out_unlock: @@ -2365,12 +2360,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  	int ret;  	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; -	/* -	 * This check is racy but catches the common case. The check in -	 * __block_page_mkwrite() is reliable. 
-	 */ -	vfs_check_frozen(sb, SB_FREEZE_WRITE); +	sb_start_pagefault(sb);  	ret = __block_page_mkwrite(vma, vmf, get_block); +	sb_end_pagefault(sb);  	return block_page_mkwrite_return(ret);  }  EXPORT_SYMBOL(block_page_mkwrite); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b67304e4b8..452e71a1b75 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	loff_t size, len;  	int ret; +	/* Update time before taking page lock */ +	file_update_time(vma->vm_file); +  	size = i_size_read(inode);  	if (off + PAGE_CACHE_SIZE <= size)  		len = PAGE_CACHE_SIZE; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ffa2be57804..c3ca12c33ca 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -318,21 +318,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,  	struct vfsmount *lower_mnt;  	int rc = 0; -	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); -	fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); -	BUG_ON(!lower_dentry->d_count); -  	dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); -	ecryptfs_set_dentry_private(dentry, dentry_info);  	if (!dentry_info) {  		printk(KERN_ERR "%s: Out of memory whilst attempting "  		       "to allocate ecryptfs_dentry_info struct\n",  			__func__);  		dput(lower_dentry); -		mntput(lower_mnt); -		d_drop(dentry);  		return -ENOMEM;  	} + +	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); +	fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); +	BUG_ON(!lower_dentry->d_count); + +	ecryptfs_set_dentry_private(dentry, dentry_info);  	ecryptfs_set_dentry_lower(dentry, lower_dentry);  	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); @@ -381,12 +380,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,  	struct dentry *lower_dir_dentry, *lower_dentry;  	int rc = 0; -	if ((ecryptfs_dentry->d_name.len == 1 
-	     && !strcmp(ecryptfs_dentry->d_name.name, ".")) -	    || (ecryptfs_dentry->d_name.len == 2 -		&& !strcmp(ecryptfs_dentry->d_name.name, ".."))) { -		goto out_d_drop; -	}  	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);  	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);  	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, @@ -397,8 +390,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,  		rc = PTR_ERR(lower_dentry);  		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "  				"[%d] on lower_dentry = [%s]\n", __func__, rc, -				encrypted_and_encoded_name); -		goto out_d_drop; +				ecryptfs_dentry->d_name.name); +		goto out;  	}  	if (lower_dentry->d_inode)  		goto interpose; @@ -415,7 +408,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,  	if (rc) {  		printk(KERN_ERR "%s: Error attempting to encrypt and encode "  		       "filename; rc = [%d]\n", __func__, rc); -		goto out_d_drop; +		goto out;  	}  	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);  	lower_dentry = lookup_one_len(encrypted_and_encoded_name, @@ -427,14 +420,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,  		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "  				"[%d] on lower_dentry = [%s]\n", __func__, rc,  				encrypted_and_encoded_name); -		goto out_d_drop; +		goto out;  	}  interpose:  	rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,  				       ecryptfs_dir_inode); -	goto out; -out_d_drop: -	d_drop(ecryptfs_dentry);  out:  	kfree(encrypted_and_encoded_name);  	return ERR_PTR(rc); diff --git a/fs/exec.c b/fs/exec.c index 3684353ebd5..574cf4de4ec 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file)   */  static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)  { -	struct file *rp, *wp; +	struct file *files[2];  	struct fdtable *fdt;  	struct coredump_params *cp = 
(struct coredump_params *)info->data;  	struct files_struct *cf = current->files; +	int err = create_pipe_files(files, 0); +	if (err) +		return err; -	wp = create_write_pipe(0); -	if (IS_ERR(wp)) -		return PTR_ERR(wp); - -	rp = create_read_pipe(wp, 0); -	if (IS_ERR(rp)) { -		free_write_pipe(wp); -		return PTR_ERR(rp); -	} - -	cp->file = wp; +	cp->file = files[1];  	sys_close(0); -	fd_install(0, rp); +	fd_install(0, files[0]);  	spin_lock(&cf->file_lock);  	fdt = files_fdtable(cf);  	__set_open_fd(0, fdt); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 264d315f6c4..6363ac66faf 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode)  	truncate_inode_pages(&inode->i_data, 0);  	if (want_delete) { +		sb_start_intwrite(inode->i_sb);  		/* set dtime */  		EXT2_I(inode)->i_dtime	= get_seconds();  		mark_inode_dirty(inode); @@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode)  	if (unlikely(rsv))  		kfree(rsv); -	if (want_delete) +	if (want_delete) {  		ext2_free_inode(inode); +		sb_end_intwrite(inode->i_sb); +	}  }  typedef struct { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9f311d27b16..af74d9e27b7 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb,  static int ext2_remount (struct super_block * sb, int * flags, char * data);  static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);  static int ext2_sync_fs(struct super_block *sb, int wait); +static int ext2_freeze(struct super_block *sb); +static int ext2_unfreeze(struct super_block *sb);  void ext2_error(struct super_block *sb, const char *function,  		const char *fmt, ...) 
@@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = {  	.evict_inode	= ext2_evict_inode,  	.put_super	= ext2_put_super,  	.sync_fs	= ext2_sync_fs, +	.freeze_fs	= ext2_freeze, +	.unfreeze_fs	= ext2_unfreeze,  	.statfs		= ext2_statfs,  	.remount_fs	= ext2_remount,  	.show_options	= ext2_show_options, @@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait)  	return 0;  } +static int ext2_freeze(struct super_block *sb) +{ +	struct ext2_sb_info *sbi = EXT2_SB(sb); + +	/* +	 * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared +	 * because we have unattached inodes and thus filesystem is not fully +	 * consistent. +	 */ +	if (atomic_long_read(&sb->s_remove_count)) { +		ext2_sync_fs(sb, 1); +		return 0; +	} +	/* Set EXT2_FS_VALID flag */ +	spin_lock(&sbi->s_lock); +	sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); +	spin_unlock(&sbi->s_lock); +	ext2_sync_super(sb, sbi->s_es, 1); + +	return 0; +} + +static int ext2_unfreeze(struct super_block *sb) +{ +	/* Just write sb to clear EXT2_VALID_FS flag */ +	ext2_write_super(sb); + +	return 0; +}  void ext2_write_super(struct super_block *sb)  { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89b59cb7f9b..6324f74e034 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)  	if (is_bad_inode(inode))  		goto no_delete; +	/* +	 * Protect us against freezing - iput() caller didn't have to have any +	 * protection against it +	 */ +	sb_start_intwrite(inode->i_sb);  	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);  	if (IS_ERR(handle)) {  		ext4_std_error(inode->i_sb, PTR_ERR(handle)); @@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)  		 * cleaned up.  		 
*/  		ext4_orphan_del(NULL, inode); +		sb_end_intwrite(inode->i_sb);  		goto no_delete;  	} @@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)  		stop_handle:  			ext4_journal_stop(handle);  			ext4_orphan_del(NULL, inode); +			sb_end_intwrite(inode->i_sb);  			goto no_delete;  		}  	} @@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)  	else  		ext4_free_inode(handle, inode);  	ext4_journal_stop(handle); +	sb_end_intwrite(inode->i_sb);  	return;  no_delete:  	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */ @@ -4779,11 +4787,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	get_block_t *get_block;  	int retries = 0; -	/* -	 * This check is racy but catches the common case. We rely on -	 * __block_page_mkwrite() to do a reliable check. -	 */ -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); +	sb_start_pagefault(inode->i_sb);  	/* Delalloc case is easy... */  	if (test_opt(inode->i_sb, DELALLOC) &&  	    !ext4_should_journal_data(inode) && @@ -4851,5 +4855,6 @@ retry_alloc:  out_ret:  	ret = block_page_mkwrite_return(ret);  out: +	sb_end_pagefault(inode->i_sb);  	return ret;  } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f99a1311e84..fe7c63f4717 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)  {  	struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); +	/* +	 * We protect against freezing so that we don't create dirty buffers +	 * on frozen filesystem. 
+	 */ +	sb_start_write(sb);  	ext4_mmp_csum_set(sb, mmp);  	mark_buffer_dirty(bh);  	lock_buffer(bh); @@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)  	get_bh(bh);  	submit_bh(WRITE_SYNC, bh);  	wait_on_buffer(bh); +	sb_end_write(sb);  	if (unlikely(!buffer_uptodate(bh)))  		return 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d51cd9af22..d76ec8277d3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -331,33 +331,17 @@ static void ext4_put_nojournal(handle_t *handle)   * journal_end calls result in the superblock being marked dirty, so   * that sync() will call the filesystem's write_super callback if   * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer.   */  handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)  {  	journal_t *journal; -	handle_t  *handle;  	trace_ext4_journal_start(sb, nblocks, _RET_IP_);  	if (sb->s_flags & MS_RDONLY)  		return ERR_PTR(-EROFS); +	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);  	journal = EXT4_SB(sb)->s_journal; -	handle = ext4_journal_current_handle(); - -	/* -	 * If a handle has been started, it should be allowed to -	 * finish, otherwise deadlock could happen between freeze -	 * and others(e.g. truncate) due to the restart of the -	 * journal handle if the filesystem is forzen and active -	 * handles are not stopped. 
-	 */ -	if (!handle) -		vfs_check_frozen(sb, SB_FREEZE_TRANS); -  	if (!journal)  		return ext4_get_nojournal();  	/* @@ -2747,6 +2731,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)  	sb = elr->lr_super;  	ngroups = EXT4_SB(sb)->s_groups_count; +	sb_start_write(sb);  	for (group = elr->lr_next_group; group < ngroups; group++) {  		gdp = ext4_get_group_desc(sb, group, NULL);  		if (!gdp) { @@ -2773,6 +2758,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)  		elr->lr_next_sched = jiffies + elr->lr_timeout;  		elr->lr_next_group = group + 1;  	} +	sb_end_write(sb);  	return ret;  } @@ -4460,10 +4446,8 @@ int ext4_force_commit(struct super_block *sb)  		return 0;  	journal = EXT4_SB(sb)->s_journal; -	if (journal) { -		vfs_check_frozen(sb, SB_FREEZE_TRANS); +	if (journal)  		ret = ext4_journal_force_commit(journal); -	}  	return ret;  } @@ -4493,9 +4477,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait)   * gives us a chance to flush the journal completely and mark the fs clean.   *   * Note that only this function cannot bring a filesystem to be in a clean - * state independently, because ext4 prevents a new handle from being started - * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from - * the upper layer. + * state independently. It relies on upper layer to stop all data & metadata + * modifications.   
*/  static int ext4_freeze(struct super_block *sb)  { @@ -4522,7 +4505,7 @@ static int ext4_freeze(struct super_block *sb)  	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);  	error = ext4_commit_super(sb, 1);  out: -	/* we rely on s_frozen to stop further updates */ +	/* we rely on upper layer to stop further updates */  	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);  	return error;  } diff --git a/fs/fat/file.c b/fs/fat/file.c index a71fe3715ee..e007b8bd8e5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)  	if (err)  		goto out; -	mutex_lock(&inode->i_mutex);  	err = mnt_want_write_file(file);  	if (err) -		goto out_unlock_inode; +		goto out; +	mutex_lock(&inode->i_mutex);  	/*  	 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)  	/* The root directory has no attributes */  	if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {  		err = -EINVAL; -		goto out_drop_write; +		goto out_unlock_inode;  	}  	if (sbi->options.sys_immutable &&  	    ((attr | oldattr) & ATTR_SYS) &&  	    !capable(CAP_LINUX_IMMUTABLE)) {  		err = -EPERM; -		goto out_drop_write; +		goto out_unlock_inode;  	}  	/* @@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)  	 */  	err = security_inode_setattr(file->f_path.dentry, &ia);  	if (err) -		goto out_drop_write; +		goto out_unlock_inode;  	/* This MUST be done before doing anything irreversible... 
*/  	err = fat_setattr(file->f_path.dentry, &ia);  	if (err) -		goto out_drop_write; +		goto out_unlock_inode;  	fsnotify_change(file->f_path.dentry, ia.ia_valid);  	if (sbi->options.sys_immutable) { @@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)  	fat_save_attrs(inode, attr);  	mark_inode_dirty(inode); -out_drop_write: -	mnt_drop_write_file(file);  out_unlock_inode:  	mutex_unlock(&inode->i_mutex); +	mnt_drop_write_file(file);  out:  	return err;  } diff --git a/fs/file_table.c b/fs/file_table.c index b3fc4d67a26..701985e4ccd 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly;  static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void file_free_rcu(struct rcu_head *head)  {  	struct file *f = container_of(head, struct file, f_u.fu_rcuhead); @@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file)  		return;  	if (file_check_writeable(file) != 0)  		return; -	mnt_drop_write(mnt); +	__mnt_drop_write(mnt);  	file_release_write(file);  } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b321a688cde..93d8d6c9494 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -944,9 +944,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  		return err;  	count = ocount; - +	sb_start_write(inode->i_sb);  	mutex_lock(&inode->i_mutex); -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);  	/* We can write back this queue in page reclaim */  	current->backing_dev_info = mapping->backing_dev_info; @@ -1004,6 +1003,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  out:  	current->backing_dev_info = NULL;  	mutex_unlock(&inode->i_mutex); +	sb_end_write(inode->i_sb);  	return written ? 
written : err;  } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9aa6af13823..d1d791ef38d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	loff_t size;  	int ret; -	/* Wait if fs is frozen. This is racy so we check again later on -	 * and retry if the fs has been frozen after the page lock has -	 * been acquired -	 */ -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); +	sb_start_pagefault(inode->i_sb); + +	/* Update file times before taking page lock */ +	file_update_time(vma->vm_file);  	ret = gfs2_rs_alloc(ip);  	if (ret) @@ -462,14 +461,9 @@ out:  	gfs2_holder_uninit(&gh);  	if (ret == 0) {  		set_page_dirty(page); -		/* This check must be post dropping of transaction lock */ -		if (inode->i_sb->s_frozen == SB_UNFROZEN) { -			wait_on_page_writeback(page); -		} else { -			ret = -EAGAIN; -			unlock_page(page); -		} +		wait_on_page_writeback(page);  	} +	sb_end_pagefault(inode->i_sb);  	return block_page_mkwrite_return(ret);  } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ad3e2fb763d..adbd27875ef 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,  	if (revokes)  		tr->tr_reserved += gfs2_struct2blk(sdp, revokes,  						   sizeof(u64)); +	sb_start_intwrite(sdp->sd_vfs);  	gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);  	error = gfs2_glock_nq(&tr->tr_t_gh); @@ -68,6 +69,7 @@ fail_gunlock:  	gfs2_glock_dq(&tr->tr_t_gh);  fail_holder_uninit: +	sb_end_intwrite(sdp->sd_vfs);  	gfs2_holder_uninit(&tr->tr_t_gh);  	kfree(tr); @@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)  			gfs2_holder_uninit(&tr->tr_t_gh);  			kfree(tr);  		} +		sb_end_intwrite(sdp->sd_vfs);  		return;  	} @@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)  	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)  		gfs2_log_flush(sdp, NULL); +	sb_end_intwrite(sdp->sd_vfs);  }  
/** diff --git a/fs/inode.c b/fs/inode.c index 3cc50432046..ac8d904b3f1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1542,9 +1542,11 @@ void touch_atime(struct path *path)  	if (timespec_equal(&inode->i_atime, &now))  		return; -	if (mnt_want_write(mnt)) +	if (!sb_start_write_trylock(inode->i_sb))  		return; +	if (__mnt_want_write(mnt)) +		goto skip_update;  	/*  	 * File systems can error out when updating inodes if they need to  	 * allocate new space to modify an inode (such is the case for @@ -1555,7 +1557,9 @@ void touch_atime(struct path *path)  	 * of the fs read only, e.g. subvolumes in Btrfs.  	 */  	update_time(inode, &now, S_ATIME); -	mnt_drop_write(mnt); +	__mnt_drop_write(mnt); +skip_update: +	sb_end_write(inode->i_sb);  }  EXPORT_SYMBOL(touch_atime); @@ -1662,11 +1666,11 @@ int file_update_time(struct file *file)  		return 0;  	/* Finally allowed to write? Takes lock. */ -	if (mnt_want_write_file(file)) +	if (__mnt_want_write_file(file))  		return 0;  	ret = update_time(inode, &now, sync_it); -	mnt_drop_write_file(file); +	__mnt_drop_write_file(file);  	return ret;  } diff --git a/fs/internal.h b/fs/internal.h index a6fd56c68b1..371bcc4b169 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,6 +61,10 @@ extern void __init mnt_init(void);  extern struct lglock vfsmount_lock; +extern int __mnt_want_write(struct vfsmount *); +extern int __mnt_want_write_file(struct file *); +extern void __mnt_drop_write(struct vfsmount *); +extern void __mnt_drop_write_file(struct file *);  /*   * fs_struct.c diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 8392cb85bd5..05d29124c6a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)  	struct nlm_rqst		*call;  	int			status; -	nlm_get_host(host);  	call = nlm_alloc_call(host);  	if (call == NULL)  		return -ENOMEM;  	nlmclnt_locks_init_private(fl, host); +	if (!fl->fl_u.nfs_fl.owner) { +		/* lockowner 
allocation has failed */ +		nlmclnt_release_call(call); +		return -ENOMEM; +	}  	/* Set up the argument struct */  	nlmclnt_setlockargs(call, fl); @@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc);  /*   * Allocate an NLM RPC call struct - * - * Note: the caller must hold a reference to host. In case of failure, - * this reference will be released.   */  struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)  { @@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)  			atomic_set(&call->a_count, 1);  			locks_init_lock(&call->a_args.lock.fl);  			locks_init_lock(&call->a_res.lock.fl); -			call->a_host = host; +			call->a_host = nlm_get_host(host);  			return call;  		}  		if (signalled()) @@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)  		printk("nlm_alloc_call: failed, waiting for memory\n");  		schedule_timeout_interruptible(5*HZ);  	} -	nlmclnt_release_host(host);  	return NULL;  } @@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl  	dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"  		"       Attempting to cancel lock.\n"); -	req = nlm_alloc_call(nlm_get_host(host)); +	req = nlm_alloc_call(host);  	if (!req)  		return -ENOMEM;  	req->a_flags = RPC_TASK_ASYNC; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a43d253c04..b147d1ae71f 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -257,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args  		return rpc_system_err;  	call = nlm_alloc_call(host); +	nlmsvc_release_host(host);  	if (call == NULL)  		return rpc_system_err; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index afe4488c33d..fb1a2bedbe9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,  	struct nlm_block	*block;  	struct nlm_rqst		*call = NULL; -	nlm_get_host(host);  	call = 
nlm_alloc_call(host);  	if (call == NULL)  		return NULL; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index de8f2caa223..3009a365e08 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -297,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args  		return rpc_system_err;  	call = nlm_alloc_call(host); +	nlmsvc_release_host(host);  	if (call == NULL)  		return rpc_system_err; diff --git a/fs/namei.c b/fs/namei.c index 2ccc35c4dc2..1b464390dde 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -650,6 +650,121 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki  	path_put(link);  } +int sysctl_protected_symlinks __read_mostly = 1; +int sysctl_protected_hardlinks __read_mostly = 1; + +/** + * may_follow_link - Check symlink following for unsafe situations + * @link: The path of the symlink + * + * In the case of the sysctl_protected_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to be followed only when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static inline int may_follow_link(struct path *link, struct nameidata *nd) +{ +	const struct inode *inode; +	const struct inode *parent; + +	if (!sysctl_protected_symlinks) +		return 0; + +	/* Allowed if owner and follower match. */ +	inode = link->dentry->d_inode; +	if (current_cred()->fsuid == inode->i_uid) +		return 0; + +	/* Allowed if parent directory not sticky and world-writable. 
*/ +	parent = nd->path.dentry->d_inode; +	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) +		return 0; + +	/* Allowed if parent directory and link owner match. */ +	if (parent->i_uid == inode->i_uid) +		return 0; + +	path_put_conditional(link, nd); +	path_put(&nd->path); +	audit_log_link_denied("follow_link", link); +	return -EACCES; +} + +/** + * safe_hardlink_source - Check for safe hardlink conditions + * @inode: the source inode to hardlink from + * + * Return false if at least one of the following conditions: + *    - inode is not a regular file + *    - inode is setuid + *    - inode is setgid and group-exec + *    - access failure for read and write + * + * Otherwise returns true. + */ +static bool safe_hardlink_source(struct inode *inode) +{ +	umode_t mode = inode->i_mode; + +	/* Special files should not get pinned to the filesystem. */ +	if (!S_ISREG(mode)) +		return false; + +	/* Setuid files should not get pinned to the filesystem. */ +	if (mode & S_ISUID) +		return false; + +	/* Executable setgid files should not get pinned to the filesystem. */ +	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) +		return false; + +	/* Hardlinking to unreadable or unwritable sources is dangerous. */ +	if (inode_permission(inode, MAY_READ | MAY_WRITE)) +		return false; + +	return true; +} + +/** + * may_linkat - Check permissions for creating a hardlink + * @link: the source to hardlink from + * + * Block hardlink when all of: + *  - sysctl_protected_hardlinks enabled + *  - fsuid does not match inode + *  - hardlink source is unsafe (see safe_hardlink_source() above) + *  - not CAP_FOWNER + * + * Returns 0 if successful, -ve on error. 
+ */ +static int may_linkat(struct path *link) +{ +	const struct cred *cred; +	struct inode *inode; + +	if (!sysctl_protected_hardlinks) +		return 0; + +	cred = current_cred(); +	inode = link->dentry->d_inode; + +	/* Source inode owner (or CAP_FOWNER) can hardlink all they like, +	 * otherwise, it must be a safe source. +	 */ +	if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || +	    capable(CAP_FOWNER)) +		return 0; + +	audit_log_link_denied("linkat", link); +	return -EPERM; +} +  static __always_inline int  follow_link(struct path *link, struct nameidata *nd, void **p)  { @@ -1818,6 +1933,9 @@ static int path_lookupat(int dfd, const char *name,  		while (err > 0) {  			void *cookie;  			struct path link = path; +			err = may_follow_link(&link, nd); +			if (unlikely(err)) +				break;  			nd->flags |= LOOKUP_PARENT;  			err = follow_link(&link, nd, &cookie);  			if (err) @@ -2277,7 +2395,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)  static int atomic_open(struct nameidata *nd, struct dentry *dentry,  			struct path *path, struct file *file,  			const struct open_flags *op, -			bool *want_write, bool need_lookup, +			bool got_write, bool need_lookup,  			int *opened)  {  	struct inode *dir =  nd->path.dentry->d_inode; @@ -2300,7 +2418,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,  	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))  		mode &= ~current_umask(); -	if (open_flag & O_EXCL) { +	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {  		open_flag &= ~O_TRUNC;  		*opened |= FILE_CREATED;  	} @@ -2314,12 +2432,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,  	 * Another problem is returing the "right" error value (e.g. for an  	 * O_EXCL open we want to return EEXIST not EROFS).  	 
*/ -	if ((open_flag & (O_CREAT | O_TRUNC)) || -	    (open_flag & O_ACCMODE) != O_RDONLY) { -		error = mnt_want_write(nd->path.mnt); -		if (!error) { -			*want_write = true; -		} else if (!(open_flag & O_CREAT)) { +	if (((open_flag & (O_CREAT | O_TRUNC)) || +	    (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { +		if (!(open_flag & O_CREAT)) {  			/*  			 * No O_CREATE -> atomicity not a requirement -> fall  			 * back to lookup + open @@ -2327,11 +2442,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,  			goto no_open;  		} else if (open_flag & (O_EXCL | O_TRUNC)) {  			/* Fall back and fail with the right error */ -			create_error = error; +			create_error = -EROFS;  			goto no_open;  		} else {  			/* No side effects, safe to clear O_CREAT */ -			create_error = error; +			create_error = -EROFS;  			open_flag &= ~O_CREAT;  		}  	} @@ -2438,7 +2553,7 @@ looked_up:  static int lookup_open(struct nameidata *nd, struct path *path,  			struct file *file,  			const struct open_flags *op, -			bool *want_write, int *opened) +			bool got_write, int *opened)  {  	struct dentry *dir = nd->path.dentry;  	struct inode *dir_inode = dir->d_inode; @@ -2456,7 +2571,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,  		goto out_no_open;  	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { -		return atomic_open(nd, dentry, path, file, op, want_write, +		return atomic_open(nd, dentry, path, file, op, got_write,  				   need_lookup, opened);  	} @@ -2480,10 +2595,10 @@ static int lookup_open(struct nameidata *nd, struct path *path,  		 * a permanent write count is taken through  		 * the 'struct file' in finish_open().  		 
*/ -		error = mnt_want_write(nd->path.mnt); -		if (error) +		if (!got_write) { +			error = -EROFS;  			goto out_dput; -		*want_write = true; +		}  		*opened |= FILE_CREATED;  		error = security_path_mknod(&nd->path, dentry, mode, 0);  		if (error) @@ -2513,7 +2628,7 @@ static int do_last(struct nameidata *nd, struct path *path,  	struct dentry *dir = nd->path.dentry;  	int open_flag = op->open_flag;  	bool will_truncate = (open_flag & O_TRUNC) != 0; -	bool want_write = false; +	bool got_write = false;  	int acc_mode = op->acc_mode;  	struct inode *inode;  	bool symlink_ok = false; @@ -2582,8 +2697,18 @@ static int do_last(struct nameidata *nd, struct path *path,  	}  retry_lookup: +	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { +		error = mnt_want_write(nd->path.mnt); +		if (!error) +			got_write = true; +		/* +		 * do _not_ fail yet - we might not need that or fail with +		 * a different error; let lookup_open() decide; we'll be +		 * dropping this one anyway. +		 */ +	}  	mutex_lock(&dir->d_inode->i_mutex); -	error = lookup_open(nd, path, file, op, &want_write, opened); +	error = lookup_open(nd, path, file, op, got_write, opened);  	mutex_unlock(&dir->d_inode->i_mutex);  	if (error <= 0) { @@ -2608,22 +2733,23 @@ retry_lookup:  	}  	/* -	 * It already exists. +	 * create/update audit record if it already exists.  	 */ -	audit_inode(pathname, path->dentry); +	if (path->dentry->d_inode) +		audit_inode(pathname, path->dentry);  	/*  	 * If atomic_open() acquired write access it is dropped now due to  	 * possible mount and symlink following (this might be optimized away if  	 * necessary...)  	 
*/ -	if (want_write) { +	if (got_write) {  		mnt_drop_write(nd->path.mnt); -		want_write = false; +		got_write = false;  	}  	error = -EEXIST; -	if (open_flag & O_EXCL) +	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))  		goto exit_dput;  	error = follow_managed(path, nd->flags); @@ -2684,7 +2810,7 @@ finish_open:  		error = mnt_want_write(nd->path.mnt);  		if (error)  			goto out; -		want_write = true; +		got_write = true;  	}  finish_open_created:  	error = may_open(&nd->path, acc_mode, open_flag); @@ -2711,7 +2837,7 @@ opened:  			goto exit_fput;  	}  out: -	if (want_write) +	if (got_write)  		mnt_drop_write(nd->path.mnt);  	path_put(&save_parent);  	terminate_walk(nd); @@ -2735,9 +2861,9 @@ stale_open:  	nd->inode = dir->d_inode;  	save_parent.mnt = NULL;  	save_parent.dentry = NULL; -	if (want_write) { +	if (got_write) {  		mnt_drop_write(nd->path.mnt); -		want_write = false; +		got_write = false;  	}  	retried = true;  	goto retry_lookup; @@ -2777,6 +2903,9 @@ static struct file *path_openat(int dfd, const char *pathname,  			error = -ELOOP;  			break;  		} +		error = may_follow_link(&link, nd); +		if (unlikely(error)) +			break;  		nd->flags |= LOOKUP_PARENT;  		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);  		error = follow_link(&link, nd, &cookie); @@ -2846,6 +2975,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path  {  	struct dentry *dentry = ERR_PTR(-EEXIST);  	struct nameidata nd; +	int err2;  	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);  	if (error)  		return ERR_PTR(error); @@ -2859,16 +2989,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path  	nd.flags &= ~LOOKUP_PARENT;  	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; +	/* don't fail immediately if it's r/o, at least try to report other errors */ +	err2 = mnt_want_write(nd.path.mnt);  	/*  	 * Do the final lookup.  	 
*/  	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);  	dentry = lookup_hash(&nd);  	if (IS_ERR(dentry)) -		goto fail; +		goto unlock; +	error = -EEXIST;  	if (dentry->d_inode) -		goto eexist; +		goto fail;  	/*  	 * Special case - lookup gave negative, but... we had foo/bar/  	 * From the vfs_mknod() POV we just have a negative dentry - @@ -2876,23 +3009,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path  	 * been asking for (non-existent) directory. -ENOENT for you.  	 */  	if (unlikely(!is_dir && nd.last.name[nd.last.len])) { -		dput(dentry); -		dentry = ERR_PTR(-ENOENT); +		error = -ENOENT; +		goto fail; +	} +	if (unlikely(err2)) { +		error = err2;  		goto fail;  	}  	*path = nd.path;  	return dentry; -eexist: -	dput(dentry); -	dentry = ERR_PTR(-EEXIST);  fail: +	dput(dentry); +	dentry = ERR_PTR(error); +unlock:  	mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +	if (!err2) +		mnt_drop_write(nd.path.mnt);  out:  	path_put(&nd.path);  	return dentry;  }  EXPORT_SYMBOL(kern_path_create); +void done_path_create(struct path *path, struct dentry *dentry) +{ +	dput(dentry); +	mutex_unlock(&path->dentry->d_inode->i_mutex); +	mnt_drop_write(path->mnt); +	path_put(path); +} +EXPORT_SYMBOL(done_path_create); +  struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)  {  	char *tmp = getname(pathname); @@ -2956,8 +3103,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,  	struct path path;  	int error; -	if (S_ISDIR(mode)) -		return -EPERM; +	error = may_mknod(mode); +	if (error) +		return error;  	dentry = user_path_create(dfd, filename, &path, 0);  	if (IS_ERR(dentry)) @@ -2965,15 +3113,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,  	if (!IS_POSIXACL(path.dentry->d_inode))  		mode &= ~current_umask(); -	error = may_mknod(mode); -	if (error) -		goto out_dput; -	error = mnt_want_write(path.mnt); -	
if (error) -		goto out_dput;  	error = security_path_mknod(&path, dentry, mode, dev);  	if (error) -		goto out_drop_write; +		goto out;  	switch (mode & S_IFMT) {  		case 0: case S_IFREG:  			error = vfs_create(path.dentry->d_inode,dentry,mode,true); @@ -2986,13 +3128,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,  			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);  			break;  	} -out_drop_write: -	mnt_drop_write(path.mnt); -out_dput: -	dput(dentry); -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); - +out: +	done_path_create(&path, dentry);  	return error;  } @@ -3038,19 +3175,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)  	if (!IS_POSIXACL(path.dentry->d_inode))  		mode &= ~current_umask(); -	error = mnt_want_write(path.mnt); -	if (error) -		goto out_dput;  	error = security_path_mkdir(&path, dentry, mode); -	if (error) -		goto out_drop_write; -	error = vfs_mkdir(path.dentry->d_inode, dentry, mode); -out_drop_write: -	mnt_drop_write(path.mnt); -out_dput: -	dput(dentry); -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); +	if (!error) +		error = vfs_mkdir(path.dentry->d_inode, dentry, mode); +	done_path_create(&path, dentry);  	return error;  } @@ -3144,6 +3272,9 @@ static long do_rmdir(int dfd, const char __user *pathname)  	}  	nd.flags &= ~LOOKUP_PARENT; +	error = mnt_want_write(nd.path.mnt); +	if (error) +		goto exit1;  	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);  	dentry = lookup_hash(&nd); @@ -3154,19 +3285,15 @@ static long do_rmdir(int dfd, const char __user *pathname)  		error = -ENOENT;  		goto exit3;  	} -	error = mnt_want_write(nd.path.mnt); -	if (error) -		goto exit3;  	error = security_path_rmdir(&nd.path, dentry);  	if (error) -		goto exit4; +		goto exit3;  	error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -exit4: -	mnt_drop_write(nd.path.mnt);  exit3:  	dput(dentry);  exit2:  	
mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +	mnt_drop_write(nd.path.mnt);  exit1:  	path_put(&nd.path);  	putname(name); @@ -3233,6 +3360,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)  		goto exit1;  	nd.flags &= ~LOOKUP_PARENT; +	error = mnt_want_write(nd.path.mnt); +	if (error) +		goto exit1;  	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);  	dentry = lookup_hash(&nd); @@ -3245,21 +3375,17 @@ static long do_unlinkat(int dfd, const char __user *pathname)  		if (!inode)  			goto slashes;  		ihold(inode); -		error = mnt_want_write(nd.path.mnt); -		if (error) -			goto exit2;  		error = security_path_unlink(&nd.path, dentry);  		if (error) -			goto exit3; +			goto exit2;  		error = vfs_unlink(nd.path.dentry->d_inode, dentry); -exit3: -		mnt_drop_write(nd.path.mnt); -	exit2: +exit2:  		dput(dentry);  	}  	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);  	if (inode)  		iput(inode);	/* truncate the inode here */ +	mnt_drop_write(nd.path.mnt);  exit1:  	path_put(&nd.path);  	putname(name); @@ -3324,19 +3450,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,  	if (IS_ERR(dentry))  		goto out_putname; -	error = mnt_want_write(path.mnt); -	if (error) -		goto out_dput;  	error = security_path_symlink(&path, dentry, from); -	if (error) -		goto out_drop_write; -	error = vfs_symlink(path.dentry->d_inode, dentry, from); -out_drop_write: -	mnt_drop_write(path.mnt); -out_dput: -	dput(dentry); -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); +	if (!error) +		error = vfs_symlink(path.dentry->d_inode, dentry, from); +	done_path_create(&path, dentry);  out_putname:  	putname(from);  	return error; @@ -3436,19 +3553,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,  	error = -EXDEV;  	if (old_path.mnt != new_path.mnt)  		goto out_dput; -	error = mnt_want_write(new_path.mnt); -	if (error) +	error = may_linkat(&old_path); +	if (unlikely(error))  		goto out_dput;  	error = 
security_path_link(old_path.dentry, &new_path, new_dentry);  	if (error) -		goto out_drop_write; +		goto out_dput;  	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); -out_drop_write: -	mnt_drop_write(new_path.mnt);  out_dput: -	dput(new_dentry); -	mutex_unlock(&new_path.dentry->d_inode->i_mutex); -	path_put(&new_path); +	done_path_create(&new_path, new_dentry);  out:  	path_put(&old_path); @@ -3644,6 +3757,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,  	if (newnd.last_type != LAST_NORM)  		goto exit2; +	error = mnt_want_write(oldnd.path.mnt); +	if (error) +		goto exit2; +  	oldnd.flags &= ~LOOKUP_PARENT;  	newnd.flags &= ~LOOKUP_PARENT;  	newnd.flags |= LOOKUP_RENAME_TARGET; @@ -3679,23 +3796,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,  	if (new_dentry == trap)  		goto exit5; -	error = mnt_want_write(oldnd.path.mnt); -	if (error) -		goto exit5;  	error = security_path_rename(&oldnd.path, old_dentry,  				     &newnd.path, new_dentry);  	if (error) -		goto exit6; +		goto exit5;  	error = vfs_rename(old_dir->d_inode, old_dentry,  				   new_dir->d_inode, new_dentry); -exit6: -	mnt_drop_write(oldnd.path.mnt);  exit5:  	dput(new_dentry);  exit4:  	dput(old_dentry);  exit3:  	unlock_rename(new_dir, old_dir); +	mnt_drop_write(oldnd.path.mnt);  exit2:  	path_put(&newnd.path);  	putname(to); diff --git a/fs/namespace.c b/fs/namespace.c index c53d3381b0d..4d31f73e256 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt)  }  /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. 
+ * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink().  We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem.   */  /** - * mnt_want_write - get write access to a mount + * __mnt_want_write - get write access to a mount without freeze protection   * @m: the mount on which to take a write   * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success.  When - * the write operation is finished, mnt_drop_write() - * must be called.  This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount.   */ -int mnt_want_write(struct vfsmount *m) +int __mnt_want_write(struct vfsmount *m)  {  	struct mount *mnt = real_mount(m);  	int ret = 0; @@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m)  		ret = -EROFS;  	}  	preempt_enable(); + +	return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success.  When the write operation is + * finished, mnt_drop_write() must be called.  This is effectively a refcount. 
+ */ +int mnt_want_write(struct vfsmount *m) +{ +	int ret; + +	sb_start_write(m->mnt_sb); +	ret = __mnt_want_write(m); +	if (ret) +		sb_end_write(m->mnt_sb);  	return ret;  }  EXPORT_SYMBOL_GPL(mnt_want_write); @@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt)  EXPORT_SYMBOL_GPL(mnt_clone_write);  /** - * mnt_want_write_file - get write access to a file's mount + * __mnt_want_write_file - get write access to a file's mount   * @file: the file who's mount on which to take a write   * - * This is like mnt_want_write, but it takes a file and can + * This is like __mnt_want_write, but it takes a file and can   * do some optimisations if the file is open for write already   */ -int mnt_want_write_file(struct file *file) +int __mnt_want_write_file(struct file *file)  {  	struct inode *inode = file->f_dentry->d_inode; +  	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) -		return mnt_want_write(file->f_path.mnt); +		return __mnt_want_write(file->f_path.mnt);  	else  		return mnt_clone_write(file->f_path.mnt);  } + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ +	int ret; + +	sb_start_write(file->f_path.mnt->mnt_sb); +	ret = __mnt_want_write_file(file); +	if (ret) +		sb_end_write(file->f_path.mnt->mnt_sb); +	return ret; +}  EXPORT_SYMBOL_GPL(mnt_want_write_file);  /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount   * @mnt: the mount on which to give up write access   *   * Tells the low-level filesystem that we are done   * performing writes to it.  Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above.   
*/ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt)  {  	preempt_disable();  	mnt_dec_writers(real_mount(mnt));  	preempt_enable();  } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again.  Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ +	__mnt_drop_write(mnt); +	sb_end_write(mnt->mnt_sb); +}  EXPORT_SYMBOL_GPL(mnt_drop_write); +void __mnt_drop_write_file(struct file *file) +{ +	__mnt_drop_write(file->f_path.mnt); +} +  void mnt_drop_write_file(struct file *file)  {  	mnt_drop_write(file->f_path.mnt); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5ff0b7b9fc0..43295d45cc2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)  	if (status < 0)  		return; +	status = mnt_want_write_file(rec_file); +	if (status) +		return; +  	dir = rec_file->f_path.dentry;  	/* lock the parent */  	mutex_lock(&dir->d_inode->i_mutex); @@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)  		 * as well be forgiving and just succeed silently.  		 
*/  		goto out_put; -	status = mnt_want_write_file(rec_file); -	if (status) -		goto out_put;  	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); -	mnt_drop_write_file(rec_file);  out_put:  	dput(dentry);  out_unlock: @@ -189,6 +189,7 @@ out_unlock:  				" (err %d); please check that %s exists"  				" and is writeable", status,  				user_recovery_dirname); +	mnt_drop_write_file(rec_file);  	nfs4_reset_creds(original_cred);  } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cc793005a87..032af381b3a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp)  		fhp->fh_post_saved = 0;  #endif  	} +	fh_drop_write(fhp);  	if (exp) {  		exp_put(exp);  		fhp->fh_export = NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e15dc45fc5e..aad6d457b9e 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,  	struct dentry	*dchild;  	int		type, mode;  	__be32		nfserr; +	int		hosterr;  	dev_t		rdev = 0, wanted = new_decode_dev(attr->ia_size);  	dprintk("nfsd: CREATE   %s %.*s\n", @@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,  	nfserr = nfserr_exist;  	if (isdotent(argp->name, argp->len))  		goto done; +	hosterr = fh_want_write(dirfhp); +	if (hosterr) { +		nfserr = nfserrno(hosterr); +		goto done; +	} +  	fh_lock_nested(dirfhp, I_MUTEX_PARENT);  	dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);  	if (IS_ERR(dchild)) { @@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,  out_unlock:  	/* We don't really need to unlock, as fh_put does it. 
*/  	fh_unlock(dirfhp); - +	fh_drop_write(dirfhp);  done:  	fh_put(dirfhp);  	return nfsd_return_dirop(nfserr, resp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 702f64e820c..a9269f142cc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1284,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  	 * If it has, the parent directory should already be locked.  	 */  	if (!resfhp->fh_dentry) { +		host_err = fh_want_write(fhp); +		if (host_err) +			goto out_nfserr; +  		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */  		fh_lock_nested(fhp, I_MUTEX_PARENT);  		dchild = lookup_one_len(fname, dentry, flen); @@ -1327,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  		goto out;  	} -	host_err = fh_want_write(fhp); -	if (host_err) -		goto out_nfserr; -  	/*  	 * Get the dir op function pointer.  	 */  	err = 0; +	host_err = 0;  	switch (type) {  	case S_IFREG:  		host_err = vfs_create(dirp, dchild, iap->ia_mode, true); @@ -1351,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);  		break;  	} -	if (host_err < 0) { -		fh_drop_write(fhp); +	if (host_err < 0)  		goto out_nfserr; -	}  	err = nfsd_create_setattr(rqstp, resfhp, iap); @@ -1366,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  	err2 = nfserrno(commit_metadata(fhp));  	if (err2)  		err = err2; -	fh_drop_write(fhp);  	/*  	 * Update the file handle to get the new inode info.  	 
*/ @@ -1425,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  	err = nfserr_notdir;  	if (!dirp->i_op->lookup)  		goto out; + +	host_err = fh_want_write(fhp); +	if (host_err) +		goto out_nfserr; +  	fh_lock_nested(fhp, I_MUTEX_PARENT);  	/* @@ -1457,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  		v_atime = verifier[1]&0x7fffffff;  	} -	host_err = fh_want_write(fhp); -	if (host_err) -		goto out_nfserr;  	if (dchild->d_inode) {  		err = 0; @@ -1530,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  	if (!err)  		err = nfserrno(commit_metadata(fhp)); -	fh_drop_write(fhp);  	/*  	 * Update the filehandle to get the new inode info.  	 */ @@ -1541,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  	fh_unlock(fhp);  	if (dchild && !IS_ERR(dchild))  		dput(dchild); +	fh_drop_write(fhp);   	return err;   out_nfserr: @@ -1621,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,  	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);  	if (err)  		goto out; + +	host_err = fh_want_write(fhp); +	if (host_err) +		goto out_nfserr; +  	fh_lock(fhp);  	dentry = fhp->fh_dentry;  	dnew = lookup_one_len(fname, dentry, flen); @@ -1628,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,  	if (IS_ERR(dnew))  		goto out_nfserr; -	host_err = fh_want_write(fhp); -	if (host_err) -		goto out_nfserr; -  	if (unlikely(path[plen] != 0)) {  		char *path_alloced = kmalloc(plen+1, GFP_KERNEL);  		if (path_alloced == NULL) @@ -1691,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,  	if (isdotent(name, len))  		goto out; +	host_err = fh_want_write(tfhp); +	if (host_err) { +		err = nfserrno(host_err); +		goto out; +	} +  	fh_lock_nested(ffhp, I_MUTEX_PARENT);  	ddir = ffhp->fh_dentry;  	dirp = ddir->d_inode; @@ -1702,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,  	dold = tfhp->fh_dentry; -	host_err = fh_want_write(tfhp); -	if 
(host_err) { -		err = nfserrno(host_err); -		goto out_dput; -	}  	err = nfserr_noent;  	if (!dold->d_inode) -		goto out_drop_write; +		goto out_dput;  	host_err = nfsd_break_lease(dold->d_inode);  	if (host_err) {  		err = nfserrno(host_err); -		goto out_drop_write; +		goto out_dput;  	}  	host_err = vfs_link(dold, dirp, dnew);  	if (!host_err) { @@ -1726,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,  		else  			err = nfserrno(host_err);  	} -out_drop_write: -	fh_drop_write(tfhp);  out_dput:  	dput(dnew);  out_unlock:  	fh_unlock(ffhp); +	fh_drop_write(tfhp);  out:  	return err; @@ -1774,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,  	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))  		goto out; +	host_err = fh_want_write(ffhp); +	if (host_err) { +		err = nfserrno(host_err); +		goto out; +	} +  	/* cannot use fh_lock as we need deadlock protective ordering  	 * so do it by hand */  	trap = lock_rename(tdentry, fdentry); @@ -1804,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,  	host_err = -EXDEV;  	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)  		goto out_dput_new; -	host_err = fh_want_write(ffhp); -	if (host_err) -		goto out_dput_new;  	host_err = nfsd_break_lease(odentry->d_inode);  	if (host_err) -		goto out_drop_write; +		goto out_dput_new;  	if (ndentry->d_inode) {  		host_err = nfsd_break_lease(ndentry->d_inode);  		if (host_err) -			goto out_drop_write; +			goto out_dput_new;  	}  	host_err = vfs_rename(fdir, odentry, tdir, ndentry);  	if (!host_err) { @@ -1822,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,  		if (!host_err)  			host_err = commit_metadata(ffhp);  	} -out_drop_write: -	fh_drop_write(ffhp);   out_dput_new:  	dput(ndentry);   out_dput_old: @@ -1839,6 +1841,7 @@ out_drop_write:  	fill_post_wcc(tfhp);  	unlock_rename(tdentry, fdentry);  	
ffhp->fh_locked = tfhp->fh_locked = 0; +	fh_drop_write(ffhp);  out:  	return err; @@ -1864,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,  	if (err)  		goto out; +	host_err = fh_want_write(fhp); +	if (host_err) +		goto out_nfserr; +  	fh_lock_nested(fhp, I_MUTEX_PARENT);  	dentry = fhp->fh_dentry;  	dirp = dentry->d_inode; @@ -1882,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,  	if (!type)  		type = rdentry->d_inode->i_mode & S_IFMT; -	host_err = fh_want_write(fhp); -	if (host_err) -		goto out_put; -  	host_err = nfsd_break_lease(rdentry->d_inode);  	if (host_err) -		goto out_drop_write; +		goto out_put;  	if (type != S_IFDIR)  		host_err = vfs_unlink(dirp, rdentry);  	else  		host_err = vfs_rmdir(dirp, rdentry);  	if (!host_err)  		host_err = commit_metadata(fhp); -out_drop_write: -	fh_drop_write(fhp);  out_put:  	dput(rdentry); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ec0611b2b73..359594c393d 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);  static inline int fh_want_write(struct svc_fh *fh)  { -	return mnt_want_write(fh->fh_export->ex_path.mnt); +	int ret = mnt_want_write(fh->fh_export->ex_path.mnt); + +	if (!ret) +		fh->fh_want_write = 1; +	return ret;  }  static inline void fh_drop_write(struct svc_fh *fh)  { -	mnt_drop_write(fh->fh_export->ex_path.mnt); +	if (fh->fh_want_write) { +		fh->fh_want_write = 0; +		mnt_drop_write(fh->fh_export->ex_path.mnt); +	}  }  #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 62cebc8e1a1..a4d56ac02e6 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	struct page *page = vmf->page;  	struct inode *inode = vma->vm_file->f_dentry->d_inode;  	struct nilfs_transaction_info ti; -	int ret; +	int ret = 0;  	if 
(unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))  		return VM_FAULT_SIGBUS; /* -ENOSPC */ +	sb_start_pagefault(inode->i_sb);  	lock_page(page);  	if (page->mapping != inode->i_mapping ||  	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {  		unlock_page(page); -		return VM_FAULT_NOPAGE; /* make the VM retry the fault */ +		ret = -EFAULT;	/* make the VM retry the fault */ +		goto out;  	}  	/* @@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);  	/* never returns -ENOMEM, but may return -ENOSPC */  	if (unlikely(ret)) -		return VM_FAULT_SIGBUS; +		goto out; -	ret = block_page_mkwrite(vma, vmf, nilfs_get_block); -	if (ret != VM_FAULT_LOCKED) { +	ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); +	if (ret) {  		nilfs_transaction_abort(inode->i_sb); -		return ret; +		goto out;  	}  	nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));  	nilfs_transaction_commit(inode->i_sb);   mapped:  	wait_on_page_writeback(page); -	return VM_FAULT_LOCKED; + out: +	sb_end_pagefault(inode->i_sb); +	return block_page_mkwrite_return(ret);  }  static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 0b6387c67e6..fdb18076948 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,  		goto out_free;  	} -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); -  	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);  	if (ret < 0)  		printk(KERN_ERR "NILFS: GC failed during preparation: " diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 88e11fb346b..a5752a58993 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb,  	if (ret > 0)  		return 0; -	vfs_check_frozen(sb, SB_FREEZE_WRITE); +	
sb_start_intwrite(sb);  	nilfs = sb->s_fs_info;  	down_read(&nilfs->ns_segctor_sem); @@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb,  	current->journal_info = ti->ti_save;  	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)  		kmem_cache_free(nilfs_transaction_cachep, ti); +	sb_end_intwrite(sb);  	return ret;  } @@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb)  		err = nilfs_construct_segment(sb);  	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)  		kmem_cache_free(nilfs_transaction_cachep, ti); +	sb_end_intwrite(sb);  	return err;  } @@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb)  	current->journal_info = ti->ti_save;  	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)  		kmem_cache_free(nilfs_transaction_cachep, ti); +	sb_end_intwrite(sb);  }  void nilfs_relax_pressure_in_lock(struct super_block *sb) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7389d2d5e51..1ecf46448f8 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,  	if (err)  		return err;  	pos = *ppos; -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);  	/* We can write back this queue in page reclaim. 
*/  	current->backing_dev_info = mapping->backing_dev_info;  	written = 0; @@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  	BUG_ON(iocb->ki_pos != pos); +	sb_start_write(inode->i_sb);  	mutex_lock(&inode->i_mutex);  	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);  	mutex_unlock(&inode->i_mutex); @@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  		if (err < 0)  			ret = err;  	} +	sb_end_write(inode->i_sb);  	return ret;  } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7602783d7f4..46a1f6d7510 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,  {  	struct inode *inode = file->f_path.dentry->d_inode;  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +	int ret;  	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&  	    !ocfs2_writes_unwritten_extents(osb)) @@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,  	if (!(file->f_mode & FMODE_WRITE))  		return -EBADF; -	return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; +	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); +	mnt_drop_write_file(file); +	return ret;  }  static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, @@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,  	if (iocb->ki_left == 0)  		return 0; -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); +	sb_start_write(inode->i_sb);  	appending = file->f_flags & O_APPEND ? 1 : 0;  	direct_io = file->f_flags & O_DIRECT ? 
1 : 0; @@ -2436,6 +2442,7 @@ out_sems:  		ocfs2_iocb_clear_sem_locked(iocb);  	mutex_unlock(&inode->i_mutex); +	sb_end_write(inode->i_sb);  	if (written)  		ret = written; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d96f7f81d8d..f20edcbfe70 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		if (get_user(new_clusters, (int __user *)arg))  			return -EFAULT; -		return ocfs2_group_extend(inode, new_clusters); +		status = mnt_want_write_file(filp); +		if (status) +			return status; +		status = ocfs2_group_extend(inode, new_clusters); +		mnt_drop_write_file(filp); +		return status;  	case OCFS2_IOC_GROUP_ADD:  	case OCFS2_IOC_GROUP_ADD64:  		if (!capable(CAP_SYS_RESOURCE)) @@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		if (copy_from_user(&input, (int __user *) arg, sizeof(input)))  			return -EFAULT; -		return ocfs2_group_add(inode, &input); +		status = mnt_want_write_file(filp); +		if (status) +			return status; +		status = ocfs2_group_add(inode, &input); +		mnt_drop_write_file(filp); +		return status;  	case OCFS2_IOC_REFLINK:  		if (copy_from_user(&args, argp, sizeof(args)))  			return -EFAULT; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 0a42ae96dca..2dd36af79e2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)  	if (journal_current_handle())  		return jbd2_journal_start(journal, max_buffs); +	sb_start_intwrite(osb->sb); +  	down_read(&osb->journal->j_trans_barrier);  	handle = jbd2_journal_start(journal, max_buffs);  	if (IS_ERR(handle)) {  		up_read(&osb->journal->j_trans_barrier); +		sb_end_intwrite(osb->sb);  		mlog_errno(PTR_ERR(handle)); @@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,  	if (ret < 0)  		mlog_errno(ret); -	if (!nested) +	if (!nested) {  		
up_read(&journal->j_trans_barrier); +		sb_end_intwrite(osb->sb); +	}  	return ret;  } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9cd41083e99..d150372fd81 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	sigset_t oldset;  	int ret; +	sb_start_pagefault(inode->i_sb);  	ocfs2_block_signals(&oldset);  	/* @@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  out:  	ocfs2_unblock_signals(&oldset); +	sb_end_pagefault(inode->i_sb);  	return ret;  } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f32d7cbb7a..30a055049e1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,  		goto out_dput;  	} -	error = mnt_want_write(new_path.mnt); -	if (error) { -		mlog_errno(error); -		goto out_dput; -	} -  	error = ocfs2_vfs_reflink(old_path.dentry,  				  new_path.dentry->d_inode,  				  new_dentry, preserve); -	mnt_drop_write(new_path.mnt);  out_dput: -	dput(new_dentry); -	mutex_unlock(&new_path.dentry->d_inode->i_mutex); -	path_put(&new_path); +	done_path_create(&new_path, new_dentry);  out:  	path_put(&old_path); diff --git a/fs/open.c b/fs/open.c index 1e914b397e1..f3d96e7e7b1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)  	if (IS_APPEND(inode))  		goto out_putf; +	sb_start_write(inode->i_sb);  	error = locks_verify_truncate(inode, file, length);  	if (!error)  		error = security_path_truncate(&file->f_path);  	if (!error)  		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); +	sb_end_write(inode->i_sb);  out_putf:  	fput(file);  out: @@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  	if (!file->f_op->fallocate)  		return -EOPNOTSUPP; -	return file->f_op->fallocate(file, 
mode, offset, len); +	sb_start_write(inode->i_sb); +	ret = file->f_op->fallocate(file, mode, offset, len); +	sb_end_write(inode->i_sb); +	return ret;  }  SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) @@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode,  		/*  		 * Balanced in __fput()  		 */ -		error = mnt_want_write(mnt); +		error = __mnt_want_write(mnt);  		if (error)  			put_write_access(inode);  	} @@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f,  	if (unlikely(f->f_flags & O_PATH))  		f->f_mode = FMODE_PATH; +	path_get(&f->f_path);  	inode = f->f_path.dentry->d_inode;  	if (f->f_mode & FMODE_WRITE) {  		error = __get_file_write_access(inode, f->f_path.mnt); @@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry,  	int error;  	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ -	mntget(file->f_path.mnt); -	file->f_path.dentry = dget(dentry); - +	file->f_path.dentry = dentry;  	error = do_dentry_open(file, open, current_cred());  	if (!error)  		*opened |= FILE_OPENED; @@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags,  	f->f_flags = flags;  	f->f_path = *path; -	path_get(&f->f_path);  	error = do_dentry_open(f, NULL, cred);  	if (!error) {  		error = open_check_o_direct(f); diff --git a/fs/pipe.c b/fs/pipe.c index 95cbd6b227e..8d85d7068c1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,18 +1016,16 @@ fail_inode:  	return NULL;  } -struct file *create_write_pipe(int flags) +int create_pipe_files(struct file **res, int flags)  {  	int err; -	struct inode *inode; +	struct inode *inode = get_pipe_inode();  	struct file *f;  	struct path path; -	struct qstr name = { .name = "" }; +	static struct qstr name = { .name = "" }; -	err = -ENFILE; -	inode = get_pipe_inode();  	if (!inode) -		goto err; +		return -ENFILE;  	err = -ENOMEM;  	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); @@ -1041,62 +1039,43 @@ struct file 
*create_write_pipe(int flags)  	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);  	if (!f)  		goto err_dentry; -	f->f_mapping = inode->i_mapping;  	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); -	f->f_version = 0; -	return f; +	res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); +	if (!res[0]) +		goto err_file; + +	path_get(&path); +	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); +	res[1] = f; +	return 0; - err_dentry: +err_file: +	put_filp(f); +err_dentry:  	free_pipe_info(inode);  	path_put(&path); -	return ERR_PTR(err); +	return err; - err_inode: +err_inode:  	free_pipe_info(inode);  	iput(inode); - err: -	return ERR_PTR(err); -} - -void free_write_pipe(struct file *f) -{ -	free_pipe_info(f->f_dentry->d_inode); -	path_put(&f->f_path); -	put_filp(f); -} - -struct file *create_read_pipe(struct file *wrf, int flags) -{ -	/* Grab pipe from the writer */ -	struct file *f = alloc_file(&wrf->f_path, FMODE_READ, -				    &read_pipefifo_fops); -	if (!f) -		return ERR_PTR(-ENFILE); - -	path_get(&wrf->f_path); -	f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - -	return f; +	return err;  }  int do_pipe_flags(int *fd, int flags)  { -	struct file *fw, *fr; +	struct file *files[2];  	int error;  	int fdw, fdr;  	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))  		return -EINVAL; -	fw = create_write_pipe(flags); -	if (IS_ERR(fw)) -		return PTR_ERR(fw); -	fr = create_read_pipe(fw, flags); -	error = PTR_ERR(fr); -	if (IS_ERR(fr)) -		goto err_write_pipe; +	error = create_pipe_files(files, flags); +	if (error) +		return error;  	error = get_unused_fd_flags(flags);  	if (error < 0) @@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags)  	fdw = error;  	audit_fd_pair(fdr, fdw); -	fd_install(fdr, fr); -	fd_install(fdw, fw); +	fd_install(fdr, files[0]); +	fd_install(fdw, files[1]);  	fd[0] = fdr;  	fd[1] = fdw; @@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags)   err_fdr:  	put_unused_fd(fdr);   err_read_pipe: -	path_put(&fr->f_path); -	
put_filp(fr); - err_write_pipe: -	free_write_pipe(fw); +	fput(files[0]); +	fput(files[1]);  	return error;  } diff --git a/fs/splice.c b/fs/splice.c index 7bf08fa22ec..41514dd8946 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,  	};  	ssize_t ret; +	sb_start_write(inode->i_sb); +  	pipe_lock(pipe);  	splice_from_pipe_begin(&sd); @@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,  			*ppos += ret;  		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);  	} +	sb_end_write(inode->i_sb);  	return ret;  } diff --git a/fs/super.c b/fs/super.c index 4bf714459a4..b05cf47463d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,12 +33,19 @@  #include <linux/rculist_bl.h>  #include <linux/cleancache.h>  #include <linux/fsnotify.h> +#include <linux/lockdep.h>  #include "internal.h"  LIST_HEAD(super_blocks);  DEFINE_SPINLOCK(sb_lock); +static char *sb_writers_name[SB_FREEZE_LEVELS] = { +	"sb_writers", +	"sb_pagefaults", +	"sb_internal", +}; +  /*   * One thing we have to be careful of with a per-sb shrinker is that we don't   * drop the last active reference to the superblock from within the shrinker. 
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)  	return total_objects;  } +static int init_sb_writers(struct super_block *s, struct file_system_type *type) +{ +	int err; +	int i; + +	for (i = 0; i < SB_FREEZE_LEVELS; i++) { +		err = percpu_counter_init(&s->s_writers.counter[i], 0); +		if (err < 0) +			goto err_out; +		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], +				 &type->s_writers_key[i], 0); +	} +	init_waitqueue_head(&s->s_writers.wait); +	init_waitqueue_head(&s->s_writers.wait_unfrozen); +	return 0; +err_out: +	while (--i >= 0) +		percpu_counter_destroy(&s->s_writers.counter[i]); +	return err; +} + +static void destroy_sb_writers(struct super_block *s) +{ +	int i; + +	for (i = 0; i < SB_FREEZE_LEVELS; i++) +		percpu_counter_destroy(&s->s_writers.counter[i]); +} +  /**   *	alloc_super	-	create new superblock   *	@type:	filesystem type superblock should belong to @@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)  	if (s) {  		if (security_sb_alloc(s)) { +			/* +			 * We cannot call security_sb_free() without +			 * security_sb_alloc() succeeding. 
So bail out manually +			 */  			kfree(s);  			s = NULL;  			goto out;  		}  #ifdef CONFIG_SMP  		s->s_files = alloc_percpu(struct list_head); -		if (!s->s_files) { -			security_sb_free(s); -			kfree(s); -			s = NULL; -			goto out; -		} else { +		if (!s->s_files) +			goto err_out; +		else {  			int i;  			for_each_possible_cpu(i) @@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)  #else  		INIT_LIST_HEAD(&s->s_files);  #endif +		if (init_sb_writers(s, type)) +			goto err_out;  		s->s_flags = flags;  		s->s_bdi = &default_backing_dev_info;  		INIT_HLIST_NODE(&s->s_instances); @@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)  		mutex_init(&s->s_dquot.dqio_mutex);  		mutex_init(&s->s_dquot.dqonoff_mutex);  		init_rwsem(&s->s_dquot.dqptr_sem); -		init_waitqueue_head(&s->s_wait_unfrozen);  		s->s_maxbytes = MAX_NON_LFS;  		s->s_op = &default_op;  		s->s_time_gran = 1000000000; @@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)  	}  out:  	return s; +err_out: +	security_sb_free(s); +#ifdef CONFIG_SMP +	if (s->s_files) +		free_percpu(s->s_files); +#endif +	destroy_sb_writers(s); +	kfree(s); +	s = NULL; +	goto out;  }  /** @@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)  #ifdef CONFIG_SMP  	free_percpu(s->s_files);  #endif +	destroy_sb_writers(s);  	security_sb_free(s);  	WARN_ON(!list_empty(&s->s_mounts));  	kfree(s->s_subtype); @@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)  {  	while (1) {  		struct super_block *s = get_super(bdev); -		if (!s || s->s_frozen == SB_UNFROZEN) +		if (!s || s->s_writers.frozen == SB_UNFROZEN)  			return s;  		up_read(&s->s_umount); -		vfs_check_frozen(s, SB_FREEZE_WRITE); +		wait_event(s->s_writers.wait_unfrozen, +			   s->s_writers.frozen == SB_UNFROZEN);  		put_super(s);  	}  } @@ -732,7 +782,7 @@ int do_remount_sb(struct 
super_block *sb, int flags, void *data, int force)  	int retval;  	int remount_ro; -	if (sb->s_frozen != SB_UNFROZEN) +	if (sb->s_writers.frozen != SB_UNFROZEN)  		return -EBUSY;  #ifdef CONFIG_BLOCK @@ -1163,6 +1213,120 @@ out:  	return ERR_PTR(error);  } +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ +	percpu_counter_dec(&sb->s_writers.counter[level-1]); +	/* +	 * Make sure s_writers are updated before we wake up waiters in +	 * freeze_super(). +	 */ +	smp_mb(); +	if (waitqueue_active(&sb->s_writers.wait)) +		wake_up(&sb->s_writers.wait); +	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's a bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, +				unsigned long ip) +{ +	int i; + +	if (!trylock) { +		for (i = 0; i < level - 1; i++) +			if (lock_is_held(&sb->s_writers.lock_map[i])) { +				trylock = true; +				break; +			} +	} +	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. 
+ */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: +	if (unlikely(sb->s_writers.frozen >= level)) { +		if (!wait) +			return 0; +		wait_event(sb->s_writers.wait_unfrozen, +			   sb->s_writers.frozen < level); +	} + +#ifdef CONFIG_LOCKDEP +	acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif +	percpu_counter_inc(&sb->s_writers.counter[level-1]); +	/* +	 * Make sure counter is updated before we check for frozen. +	 * freeze_super() first sets frozen and then checks the counter. +	 */ +	smp_mb(); +	if (unlikely(sb->s_writers.frozen >= level)) { +		__sb_end_write(sb, level); +		goto retry; +	} +	return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. 
+ */ +static void sb_wait_write(struct super_block *sb, int level) +{ +	s64 writers; + +	/* +	 * We just cycle-through lockdep here so that it does not complain +	 * about returning with lock to userspace +	 */ +	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); +	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + +	do { +		DEFINE_WAIT(wait); + +		/* +		 * We use a barrier in prepare_to_wait() to separate setting +		 * of frozen and checking of the counter +		 */ +		prepare_to_wait(&sb->s_writers.wait, &wait, +				TASK_UNINTERRUPTIBLE); + +		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); +		if (writers) +			schedule(); + +		finish_wait(&sb->s_writers.wait, &wait); +	} while (writers); +} +  /**   * freeze_super - lock the filesystem and force it into a consistent state   * @sb: the super to lock @@ -1170,6 +1334,31 @@ out:   * Syncs the super to make sure the filesystem is consistent and calls the fs's   * freeze_fs.  Subsequent calls to this without first thawing the fs will return   * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. 
XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount.   */  int freeze_super(struct super_block *sb)  { @@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb)  	atomic_inc(&sb->s_active);  	down_write(&sb->s_umount); -	if (sb->s_frozen) { +	if (sb->s_writers.frozen != SB_UNFROZEN) {  		deactivate_locked_super(sb);  		return -EBUSY;  	} @@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb)  	}  	if (sb->s_flags & MS_RDONLY) { -		sb->s_frozen = SB_FREEZE_TRANS; -		smp_wmb(); +		/* Nothing to do really... */ +		sb->s_writers.frozen = SB_FREEZE_COMPLETE;  		up_write(&sb->s_umount);  		return 0;  	} -	sb->s_frozen = SB_FREEZE_WRITE; +	/* From now on, no new normal writers can start */ +	sb->s_writers.frozen = SB_FREEZE_WRITE; +	smp_wmb(); + +	/* Release s_umount to preserve sb_start_write -> s_umount ordering */ +	up_write(&sb->s_umount); + +	sb_wait_write(sb, SB_FREEZE_WRITE); + +	/* Now we go and block page faults... 
*/ +	down_write(&sb->s_umount); +	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;  	smp_wmb(); +	sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + +	/* All writers are done so after syncing there won't be dirty data */  	sync_filesystem(sb); -	sb->s_frozen = SB_FREEZE_TRANS; +	/* Now wait for internal filesystem counter */ +	sb->s_writers.frozen = SB_FREEZE_FS;  	smp_wmb(); +	sb_wait_write(sb, SB_FREEZE_FS); -	sync_blockdev(sb->s_bdev);  	if (sb->s_op->freeze_fs) {  		ret = sb->s_op->freeze_fs(sb);  		if (ret) {  			printk(KERN_ERR  				"VFS:Filesystem freeze failed\n"); -			sb->s_frozen = SB_UNFROZEN; +			sb->s_writers.frozen = SB_UNFROZEN;  			smp_wmb(); -			wake_up(&sb->s_wait_unfrozen); +			wake_up(&sb->s_writers.wait_unfrozen);  			deactivate_locked_super(sb);  			return ret;  		}  	} +	/* +	 * This is just for debugging purposes so that fs can warn if it +	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE. +	 */ +	sb->s_writers.frozen = SB_FREEZE_COMPLETE;  	up_write(&sb->s_umount);  	return 0;  } @@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb)  	int error;  	down_write(&sb->s_umount); -	if (sb->s_frozen == SB_UNFROZEN) { +	if (sb->s_writers.frozen == SB_UNFROZEN) {  		up_write(&sb->s_umount);  		return -EINVAL;  	} @@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb)  		if (error) {  			printk(KERN_ERR  				"VFS:Filesystem thaw failed\n"); -			sb->s_frozen = SB_FREEZE_TRANS;  			up_write(&sb->s_umount);  			return error;  		}  	}  out: -	sb->s_frozen = SB_UNFROZEN; +	sb->s_writers.frozen = SB_UNFROZEN;  	smp_wmb(); -	wake_up(&sb->s_wait_unfrozen); +	wake_up(&sb->s_writers.wait_unfrozen);  	deactivate_locked_super(sb);  	return 0; diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index a4759833d62..614b2b54488 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	ret = 0;  	if (bb->vm_ops->page_mkwrite)  		ret = bb->vm_ops->page_mkwrite(vma, 
vmf); +	else +		file_update_time(file);  	sysfs_put_active(attr_sd);  	return ret; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 15052ff916e..e562dd43f41 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc(  	ioend->io_append_trans = tp;  	/* +	 * We will pass freeze protection with a transaction.  So tell lockdep +	 * we released it. +	 */ +	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], +		      1, _THIS_IP_); +	/*  	 * We hand off the transaction to the completion thread now, so  	 * clear the flag here.  	 */ @@ -199,6 +205,15 @@ xfs_end_io(  	struct xfs_inode *ip = XFS_I(ioend->io_inode);  	int		error = 0; +	if (ioend->io_append_trans) { +		/* +		 * We've got freeze protection passed with the transaction. +		 * Tell lockdep about it. +		 */ +		rwsem_acquire_read( +			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], +			0, 1, _THIS_IP_); +	}  	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {  		ioend->io_error = -EIO;  		goto done; @@ -1425,6 +1440,9 @@ out_trans_cancel:  	if (ioend->io_append_trans) {  		current_set_flags_nested(&ioend->io_append_trans->t_pflags,  					 PF_FSTRANS); +		rwsem_acquire_read( +			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], +			0, 1, _THIS_IP_);  		xfs_trans_cancel(ioend->io_append_trans, 0);  	}  out_destroy_ioend: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4559c6e6f2..56afcdb2377 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -770,10 +770,12 @@ xfs_file_aio_write(  	if (ocount == 0)  		return 0; -	xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); +	sb_start_write(inode->i_sb); -	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) -		return -EIO; +	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { +		ret = -EIO; +		goto out; +	}  	if (unlikely(file->f_flags & O_DIRECT))  		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -792,6 +794,8 @@ xfs_file_aio_write(  			ret = err;  	} +out: +	sb_end_write(inode->i_sb);  	
return ret;  } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f1535d25a9..0e0232c3b6d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -364,9 +364,15 @@ xfs_fssetdm_by_handle(  	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))  		return -XFS_ERROR(EFAULT); +	error = mnt_want_write_file(parfilp); +	if (error) +		return error; +  	dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); -	if (IS_ERR(dentry)) +	if (IS_ERR(dentry)) { +		mnt_drop_write_file(parfilp);  		return PTR_ERR(dentry); +	}  	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {  		error = -XFS_ERROR(EPERM); @@ -382,6 +388,7 @@ xfs_fssetdm_by_handle(  				 fsd.fsd_dmstate);   out: +	mnt_drop_write_file(parfilp);  	dput(dentry);  	return error;  } @@ -634,7 +641,11 @@ xfs_ioc_space(  	if (ioflags & IO_INVIS)  		attr_flags |= XFS_ATTR_DMI; +	error = mnt_want_write_file(filp); +	if (error) +		return error;  	error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); +	mnt_drop_write_file(filp);  	return -error;  } @@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr(  {  	struct fsxattr		fa;  	unsigned int		mask; +	int error;  	if (copy_from_user(&fa, arg, sizeof(fa)))  		return -EFAULT; @@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr(  	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))  		mask |= FSX_NONBLOCK; -	return -xfs_ioctl_setattr(ip, &fa, mask); +	error = mnt_want_write_file(filp); +	if (error) +		return error; +	error = xfs_ioctl_setattr(ip, &fa, mask); +	mnt_drop_write_file(filp); +	return -error;  }  STATIC int @@ -1196,6 +1213,7 @@ xfs_ioc_setxflags(  	struct fsxattr		fa;  	unsigned int		flags;  	unsigned int		mask; +	int error;  	if (copy_from_user(&flags, arg, sizeof(flags)))  		return -EFAULT; @@ -1210,7 +1228,12 @@ xfs_ioc_setxflags(  		mask |= FSX_NONBLOCK;  	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); -	return -xfs_ioctl_setattr(ip, &fa, mask); +	error = mnt_want_write_file(filp); +	if (error) +		return error; +	error 
= xfs_ioctl_setattr(ip, &fa, mask); +	mnt_drop_write_file(filp); +	return -error;  }  STATIC int @@ -1385,8 +1408,13 @@ xfs_file_ioctl(  		if (copy_from_user(&dmi, arg, sizeof(dmi)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error; +  		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,  				dmi.fsd_dmstate); +		mnt_drop_write_file(filp);  		return -error;  	} @@ -1434,7 +1462,11 @@ xfs_file_ioctl(  		if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_swapext(&sxp); +		mnt_drop_write_file(filp);  		return -error;  	} @@ -1463,9 +1495,14 @@ xfs_file_ioctl(  		if (copy_from_user(&inout, arg, sizeof(inout)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error; +  		/* input parameter is passed in resblks field of structure */  		in = inout.resblks;  		error = xfs_reserve_blocks(mp, &in, &inout); +		mnt_drop_write_file(filp);  		if (error)  			return -error; @@ -1496,7 +1533,11 @@ xfs_file_ioctl(  		if (copy_from_user(&in, arg, sizeof(in)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_growfs_data(mp, &in); +		mnt_drop_write_file(filp);  		return -error;  	} @@ -1506,7 +1547,11 @@ xfs_file_ioctl(  		if (copy_from_user(&in, arg, sizeof(in)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_growfs_log(mp, &in); +		mnt_drop_write_file(filp);  		return -error;  	} @@ -1516,7 +1561,11 @@ xfs_file_ioctl(  		if (copy_from_user(&in, arg, sizeof(in)))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_growfs_rt(mp, &in); +		mnt_drop_write_file(filp);  		return -error;  	} diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf..1244274a567 100644 --- 
a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl(  		if (xfs_compat_growfs_data_copyin(&in, arg))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_growfs_data(mp, &in); +		mnt_drop_write_file(filp);  		return -error;  	}  	case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl(  		if (xfs_compat_growfs_rt_copyin(&in, arg))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_growfs_rt(mp, &in); +		mnt_drop_write_file(filp);  		return -error;  	}  #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl(  				   offsetof(struct xfs_swapext, sx_stat)) ||  		    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))  			return -XFS_ERROR(EFAULT); +		error = mnt_want_write_file(filp); +		if (error) +			return error;  		error = xfs_swapext(&sxp); +		mnt_drop_write_file(filp);  		return -error;  	}  	case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 915edf6639f..973dff6ad93 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -680,9 +680,9 @@ xfs_iomap_write_unwritten(  		 * the same inode that we complete here and might deadlock  		 * on the iolock.  		 
*/ -		xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); +		sb_start_intwrite(mp->m_super);  		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); -		tp->t_flags |= XFS_TRANS_RESERVE; +		tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;  		error = xfs_trans_reserve(tp, resblks,  				XFS_WRITE_LOG_RES(mp), 0,  				XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 711ca51ca3d..29c2f83d414 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1551,7 +1551,7 @@ xfs_unmountfs(  int  xfs_fs_writable(xfs_mount_t *mp)  { -	return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || +	return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||  		(mp->m_flags & XFS_MOUNT_RDONLY));  } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8724336a9a0..05a05a7b611 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -311,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,  #define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */  #define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */ -#define xfs_test_for_freeze(mp)		((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l)	vfs_check_frozen((mp)->m_super, (l)) -  /*   * Flags for xfs_mountfs   */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 97304f10e78..96548176db8 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -403,7 +403,7 @@ xfs_sync_worker(  	if (!(mp->m_super->s_flags & MS_ACTIVE) &&  	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {  		/* dgc: errors ignored here */ -		if (mp->m_super->s_frozen == SB_UNFROZEN && +		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&  		    xfs_log_need_covered(mp))  			error = xfs_fs_log_dummy(mp);  		else diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdf324508c5..06ed520a767 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -576,8 +576,12 @@ xfs_trans_alloc(  	xfs_mount_t	*mp,  	uint		type)  { -	
xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); -	return _xfs_trans_alloc(mp, type, KM_SLEEP); +	xfs_trans_t     *tp; + +	sb_start_intwrite(mp->m_super); +	tp = _xfs_trans_alloc(mp, type, KM_SLEEP); +	tp->t_flags |= XFS_TRANS_FREEZE_PROT; +	return tp;  }  xfs_trans_t * @@ -588,6 +592,7 @@ _xfs_trans_alloc(  {  	xfs_trans_t	*tp; +	WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);  	atomic_inc(&mp->m_active_trans);  	tp = kmem_zone_zalloc(xfs_trans_zone, memflags); @@ -611,6 +616,8 @@ xfs_trans_free(  	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);  	atomic_dec(&tp->t_mountp->m_active_trans); +	if (tp->t_flags & XFS_TRANS_FREEZE_PROT) +		sb_end_intwrite(tp->t_mountp->m_super);  	xfs_trans_free_dqinfo(tp);  	kmem_zone_free(xfs_trans_zone, tp);  } @@ -643,7 +650,11 @@ xfs_trans_dup(  	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);  	ASSERT(tp->t_ticket != NULL); -	ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); +	ntp->t_flags = XFS_TRANS_PERM_LOG_RES | +		       (tp->t_flags & XFS_TRANS_RESERVE) | +		       (tp->t_flags & XFS_TRANS_FREEZE_PROT); +	/* We gave our writer reference to the new transaction */ +	tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;  	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);  	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;  	tp->t_blk_res = tp->t_blk_res_used; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index bc2afd52a0b..db056544cbb 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -179,6 +179,8 @@ struct xfs_log_item_desc {  #define	XFS_TRANS_SYNC		0x08	/* make commit synchronous */  #define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */  #define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer +					   count in superblock */  /*   * Values for call flags parameter. 
diff --git a/include/linux/audit.h b/include/linux/audit.h index 22f292a917a..36abf2aa7e6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -130,6 +130,7 @@  #define AUDIT_LAST_KERN_ANOM_MSG    1799  #define AUDIT_ANOM_PROMISCUOUS      1700 /* Device changed promiscuous mode */  #define AUDIT_ANOM_ABEND            1701 /* Process ended abnormally */ +#define AUDIT_ANOM_LINK		    1702 /* Suspicious use of file links */  #define AUDIT_INTEGRITY_DATA	    1800 /* Data integrity verification */  #define AUDIT_INTEGRITY_METADATA    1801 /* Metadata integrity verification */  #define AUDIT_INTEGRITY_STATUS	    1802 /* Integrity enable status */ @@ -687,6 +688,8 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,  					     const struct path *path);  extern void		    audit_log_key(struct audit_buffer *ab,  					  char *key); +extern void		    audit_log_link_denied(const char *operation, +						  struct path *link);  extern void		    audit_log_lost(const char *message);  #ifdef CONFIG_SECURITY  extern void 		    audit_log_secctx(struct audit_buffer *ab, u32 secid); @@ -716,6 +719,7 @@ extern int audit_enabled;  #define audit_log_untrustedstring(a,s) do { ; } while (0)  #define audit_log_d_path(b, p, d) do { ; } while (0)  #define audit_log_key(b, k) do { ; } while (0) +#define audit_log_link_denied(o, l) do { ; } while (0)  #define audit_log_secctx(b,s) do { ; } while (0)  #define audit_enabled 0  #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 4ba5c871552..38dba16c417 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -414,6 +414,7 @@ struct inodes_stat_t {  #include <linux/shrinker.h>  #include <linux/migrate_mode.h>  #include <linux/uidgid.h> +#include <linux/lockdep.h>  #include <asm/byteorder.h> @@ -440,6 +441,8 @@ extern unsigned long get_max_files(void);  extern int sysctl_nr_open;  extern struct inodes_stat_t inodes_stat;  extern int leases_enable, lease_break_time; +extern int sysctl_protected_symlinks; +extern 
int sysctl_protected_hardlinks;  struct buffer_head;  typedef int (get_block_t)(struct inode *inode, sector_t iblock, @@ -1445,6 +1448,8 @@ extern void f_delown(struct file *filp);  extern pid_t f_getown(struct file *filp);  extern int send_sigurg(struct fown_struct *fown); +struct mm_struct; +  /*   *	Umount options   */ @@ -1458,6 +1463,31 @@ extern int send_sigurg(struct fown_struct *fown);  extern struct list_head super_blocks;  extern spinlock_t sb_lock; +/* Possible states of 'frozen' field */ +enum { +	SB_UNFROZEN = 0,		/* FS is unfrozen */ +	SB_FREEZE_WRITE	= 1,		/* Writes, dir ops, ioctls frozen */ +	SB_FREEZE_PAGEFAULT = 2,	/* Page faults stopped as well */ +	SB_FREEZE_FS = 3,		/* For internal FS use (e.g. to stop +					 * internal threads if needed) */ +	SB_FREEZE_COMPLETE = 4,		/* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { +	/* Counters for counting writers at each level */ +	struct percpu_counter	counter[SB_FREEZE_LEVELS]; +	wait_queue_head_t	wait;		/* queue for waiting for +						   writers / faults to finish */ +	int			frozen;		/* Is sb frozen? */ +	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for +						   sb to be thawed */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	struct lockdep_map	lock_map[SB_FREEZE_LEVELS]; +#endif +}; +  struct super_block {  	struct list_head	s_list;		/* Keep this first */  	dev_t			s_dev;		/* search index; _not_ kdev_t */ @@ -1505,8 +1535,7 @@ struct super_block {  	struct hlist_node	s_instances;  	struct quota_info	s_dquot;	/* Diskquota specific options */ -	int			s_frozen; -	wait_queue_head_t	s_wait_unfrozen; +	struct sb_writers	s_writers;  	char s_id[32];				/* Informational name */  	u8 s_uuid[16];				/* UUID */ @@ -1561,14 +1590,117 @@ extern struct timespec current_fs_time(struct super_block *sb);  /*   * Snapshotting support.   
*/ -enum { -	SB_UNFROZEN = 0, -	SB_FREEZE_WRITE	= 1, -	SB_FREEZE_TRANS = 2, -}; -#define vfs_check_frozen(sb, level) \ -	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) +void __sb_end_write(struct super_block *sb, int level); +int __sb_start_write(struct super_block *sb, int level, bool wait); + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ +	__sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ +	__sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem.  Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ +	__sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. 
Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + *   -> i_mutex			(write path, truncate, directory ops, ...) + *   -> s_umount		(freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ +	__sb_start_write(sb, SB_FREEZE_WRITE, true); +} + +static inline int sb_start_write_trylock(struct super_block *sb) +{ +	return __sb_start_write(sb, SB_FREEZE_WRITE, false); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_sem + *   -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ +	__sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); +} + +/* + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. 
+ * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). + */ +static inline void sb_start_intwrite(struct super_block *sb) +{ +	__sb_start_write(sb, SB_FREEZE_FS, true); +} +  extern bool inode_owner_or_capable(const struct inode *inode); @@ -1892,6 +2024,7 @@ struct file_system_type {  	struct lock_class_key s_lock_key;  	struct lock_class_key s_umount_key;  	struct lock_class_key s_vfs_rename_key; +	struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];  	struct lock_class_key i_lock_key;  	struct lock_class_key i_mutex_key; @@ -2334,9 +2467,6 @@ static inline void i_readcount_inc(struct inode *inode)  }  #endif  extern int do_pipe_flags(int *, int); -extern struct file *create_read_pipe(struct file *f, int flags); -extern struct file *create_write_pipe(int flags); -extern void free_write_pipe(struct file *);  extern int kernel_read(struct file *, loff_t, char *, unsigned long);  extern struct file * open_exec(const char *); diff --git a/include/linux/mm.h b/include/linux/mm.h index bd079a1b0fd..311be906b57 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1441,6 +1441,7 @@ extern void truncate_inode_pages_range(struct address_space *,  /* generic vm_area_ops exported for stackable file systems */  extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);  /* mm/page-writeback.c */  int write_one_page(struct page *page, int wait); diff --git a/include/linux/namei.h b/include/linux/namei.h index d2ef8b34b96..4bf19d8174e 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *);  extern struct dentry *kern_path_create(int, const char *, struct path *, int);  extern struct dentry 
*user_path_create(int, const char __user *, struct path *, int); +extern void done_path_create(struct path *, struct dentry *);  extern struct dentry *kern_path_locked(const char *, struct path *);  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,  			   const char *, unsigned int, struct path *); diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index ce4743a2601..fa63048fecf 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -143,6 +143,7 @@ typedef struct svc_fh {  	int			fh_maxsize;	/* max size for fh_handle */  	unsigned char		fh_locked;	/* inode locked by us */ +	unsigned char		fh_want_write;	/* remount protection taken */  #ifdef CONFIG_NFSD_V3  	unsigned char		fh_post_saved;	/* post-op attrs saved */ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e11d1c0fc60..ad1a427b526 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -160,4 +160,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);  long pipe_fcntl(struct file *, unsigned int, unsigned long arg);  struct pipe_inode_info *get_pipe_info(struct file *file); +int create_pipe_files(struct file **, int); +  #endif diff --git a/kernel/audit.c b/kernel/audit.c index 4a3f28d2ca6..ea3b7b6191c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1456,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)  }  /** + * audit_log_link_denied - report a link restriction denial + * @operation: specific link operation + * @link: the path that triggered the restriction + */ +void audit_log_link_denied(const char *operation, struct path *link) +{ +	struct audit_buffer *ab; + +	ab = audit_log_start(current->audit_context, GFP_KERNEL, +			     AUDIT_ANOM_LINK); +	audit_log_format(ab, "op=%s action=denied", operation); +	audit_log_format(ab, " pid=%d comm=", current->pid); +	audit_log_untrustedstring(ab, current->comm); +	audit_log_d_path(ab, " path=", link); +	
audit_log_format(ab, " dev="); +	audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); +	audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); +	audit_log_end(ab); +} + +/**   * audit_log_end - end one audit record   * @ab: the audit_buffer   * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6502d35a25b..87174ef5916 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1498,6 +1498,24 @@ static struct ctl_table fs_table[] = {  #endif  #endif  	{ +		.procname	= "protected_symlinks", +		.data		= &sysctl_protected_symlinks, +		.maxlen		= sizeof(int), +		.mode		= 0600, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +	{ +		.procname	= "protected_hardlinks", +		.data		= &sysctl_protected_hardlinks, +		.maxlen		= sizeof(int), +		.mode		= 0600, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +	{  		.procname	= "suid_dumpable",  		.data		= &suid_dumpable,  		.maxlen		= sizeof(int), diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f8a3f1a829b..ba6085d9c74 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -12,7 +12,7 @@  #ifdef CONFIG_HOTPLUG_CPU  static LIST_HEAD(percpu_counters); -static DEFINE_MUTEX(percpu_counters_lock); +static DEFINE_SPINLOCK(percpu_counters_lock);  #endif  #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER @@ -123,9 +123,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,  #ifdef CONFIG_HOTPLUG_CPU  	INIT_LIST_HEAD(&fbc->list); -	mutex_lock(&percpu_counters_lock); +	spin_lock(&percpu_counters_lock);  	list_add(&fbc->list, &percpu_counters); -	mutex_unlock(&percpu_counters_lock); +	spin_unlock(&percpu_counters_lock);  #endif  	return 0;  } @@ -139,9 +139,9 @@ void percpu_counter_destroy(struct percpu_counter *fbc)  	debug_percpu_counter_deactivate(fbc);  #ifdef CONFIG_HOTPLUG_CPU -	mutex_lock(&percpu_counters_lock); +	spin_lock(&percpu_counters_lock);  	list_del(&fbc->list); -	
mutex_unlock(&percpu_counters_lock); +	spin_unlock(&percpu_counters_lock);  #endif  	free_percpu(fbc->counters);  	fbc->counters = NULL; @@ -170,7 +170,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,  		return NOTIFY_OK;  	cpu = (unsigned long)hcpu; -	mutex_lock(&percpu_counters_lock); +	spin_lock(&percpu_counters_lock);  	list_for_each_entry(fbc, &percpu_counters, list) {  		s32 *pcount;  		unsigned long flags; @@ -181,7 +181,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,  		*pcount = 0;  		raw_spin_unlock_irqrestore(&fbc->lock, flags);  	} -	mutex_unlock(&percpu_counters_lock); +	spin_unlock(&percpu_counters_lock);  #endif  	return NOTIFY_OK;  } diff --git a/mm/filemap.c b/mm/filemap.c index a4a5260b027..fa5ca304148 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1712,8 +1712,35 @@ page_not_uptodate:  }  EXPORT_SYMBOL(filemap_fault); +int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct page *page = vmf->page; +	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	int ret = VM_FAULT_LOCKED; + +	sb_start_pagefault(inode->i_sb); +	file_update_time(vma->vm_file); +	lock_page(page); +	if (page->mapping != inode->i_mapping) { +		unlock_page(page); +		ret = VM_FAULT_NOPAGE; +		goto out; +	} +	/* +	 * We mark the page dirty already here so that when freeze is in +	 * progress, we are guaranteed that writeback during freezing will +	 * see the dirty page and writeprotect it again. 
+	 */ +	set_page_dirty(page); +out: +	sb_end_pagefault(inode->i_sb); +	return ret; +} +EXPORT_SYMBOL(filemap_page_mkwrite); +  const struct vm_operations_struct generic_file_vm_ops = {  	.fault		= filemap_fault, +	.page_mkwrite	= filemap_page_mkwrite,  };  /* This is used for a general mmap of a disk file */ @@ -2407,8 +2434,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  	count = ocount;  	pos = *ppos; -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); -  	/* We can write back this queue in page reclaim */  	current->backing_dev_info = mapping->backing_dev_info;  	written = 0; @@ -2507,6 +2532,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  	BUG_ON(iocb->ki_pos != pos); +	sb_start_write(inode->i_sb);  	mutex_lock(&inode->i_mutex);  	blk_start_plug(&plug);  	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); @@ -2520,6 +2546,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  			ret = err;  	}  	blk_finish_plug(&plug); +	sb_end_write(inode->i_sb);  	return ret;  }  EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 213ca1f5340..13e013b1270 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -304,6 +304,7 @@ out:  static const struct vm_operations_struct xip_file_vm_ops = {  	.fault	= xip_file_fault, +	.page_mkwrite	= filemap_page_mkwrite,  };  int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,  	loff_t pos;  	ssize_t ret; +	sb_start_write(inode->i_sb); +  	mutex_lock(&inode->i_mutex);  	if (!access_ok(VERIFY_READ, buf, len)) { @@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,  	pos = *ppos;  	count = len; -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); -  	/* We can write back this queue in page reclaim */  	current->backing_dev_info = 
mapping->backing_dev_info; @@ -436,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,  	current->backing_dev_info = NULL;   out_up:  	mutex_unlock(&inode->i_mutex); +	sb_end_write(inode->i_sb);  	return ret;  }  EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/memory.c b/mm/memory.c index 482f089765f..57361708d1a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2650,6 +2650,9 @@ reuse:  		if (!page_mkwrite) {  			wait_on_page_locked(dirty_page);  			set_page_dirty_balance(dirty_page, page_mkwrite); +			/* file_update_time outside page_lock */ +			if (vma->vm_file) +				file_update_time(vma->vm_file);  		}  		put_page(dirty_page);  		if (page_mkwrite) { @@ -2667,10 +2670,6 @@ reuse:  			}  		} -		/* file_update_time outside page_lock */ -		if (vma->vm_file) -			file_update_time(vma->vm_file); -  		return ret;  	} @@ -3339,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	if (dirty_page) {  		struct address_space *mapping = page->mapping; +		int dirtied = 0;  		if (set_page_dirty(dirty_page)) -			page_mkwrite = 1; +			dirtied = 1;  		unlock_page(dirty_page);  		put_page(dirty_page); -		if (page_mkwrite && mapping) { +		if ((dirtied || page_mkwrite) && mapping) {  			/*  			 * Some device drivers do not set page.mapping but still  			 * dirty their pages @@ -3353,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,  		}  		/* file_update_time outside page_lock */ -		if (vma->vm_file) +		if (vma->vm_file && !page_mkwrite)  			file_update_time(vma->vm_file);  	} else {  		unlock_page(vmf.page); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 79981d97bc9..e4768c180da 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -823,6 +823,34 @@ fail:  	return NULL;  } +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +{ +	struct dentry *dentry; +	struct path path; +	int err = 0; +	/* +	 * Get the parent directory, calculate the hash 
for last +	 * component. +	 */ +	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); +	err = PTR_ERR(dentry); +	if (IS_ERR(dentry)) +		return err; + +	/* +	 * All right, let's create it. +	 */ +	err = security_path_mknod(&path, dentry, mode, 0); +	if (!err) { +		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); +		if (!err) { +			res->mnt = mntget(path.mnt); +			res->dentry = dget(dentry); +		} +	} +	done_path_create(&path, dentry); +	return err; +}  static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  { @@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	struct unix_sock *u = unix_sk(sk);  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;  	char *sun_path = sunaddr->sun_path; -	struct dentry *dentry = NULL; -	struct path path;  	int err;  	unsigned int hash;  	struct unix_address *addr; @@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	atomic_set(&addr->refcnt, 1);  	if (sun_path[0]) { -		umode_t mode; -		err = 0; -		/* -		 * Get the parent directory, calculate the hash for last -		 * component. -		 */ -		dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); -		err = PTR_ERR(dentry); -		if (IS_ERR(dentry)) -			goto out_mknod_parent; - -		/* -		 * All right, let's create it. 
-		 */ -		mode = S_IFSOCK | +		struct path path; +		umode_t mode = S_IFSOCK |  		       (SOCK_INODE(sock)->i_mode & ~current_umask()); -		err = mnt_want_write(path.mnt); -		if (err) -			goto out_mknod_dput; -		err = security_path_mknod(&path, dentry, mode, 0); -		if (err) -			goto out_mknod_drop_write; -		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); -out_mknod_drop_write: -		mnt_drop_write(path.mnt); -		if (err) -			goto out_mknod_dput; -		mutex_unlock(&path.dentry->d_inode->i_mutex); -		dput(path.dentry); -		path.dentry = dentry; - +		err = unix_mknod(sun_path, mode, &path); +		if (err) { +			if (err == -EEXIST) +				err = -EADDRINUSE; +			unix_release_addr(addr); +			goto out_up; +		}  		addr->hash = UNIX_HASH_SIZE; -	} - -	spin_lock(&unix_table_lock); - -	if (!sun_path[0]) { +		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); +		spin_lock(&unix_table_lock); +		u->path = path; +		list = &unix_socket_table[hash]; +	} else { +		spin_lock(&unix_table_lock);  		err = -EADDRINUSE;  		if (__unix_find_socket_byname(net, sunaddr, addr_len,  					      sk->sk_type, hash)) { @@ -914,9 +920,6 @@ out_mknod_drop_write:  		}  		list = &unix_socket_table[addr->hash]; -	} else { -		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; -		u->path = path;  	}  	err = 0; @@ -930,16 +933,6 @@ out_up:  	mutex_unlock(&u->readlock);  out:  	return err; - -out_mknod_dput: -	dput(dentry); -	mutex_unlock(&path.dentry->d_inode->i_mutex); -	path_put(&path); -out_mknod_parent: -	if (err == -EEXIST) -		err = -EADDRINUSE; -	unix_release_addr(addr); -	goto out_up;  }  static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) diff --git a/sound/sound_firmware.c b/sound/sound_firmware.c index 7e96249536b..37711a5d0d6 100644 --- a/sound/sound_firmware.c +++ b/sound/sound_firmware.c @@ -23,14 +23,14 @@ static int do_mod_firmware_load(const char *fn, char **fp)  	if (l <= 0 || l > 131072)  	{  		printk(KERN_INFO "Invalid firmware '%s'\n", fn); -		
filp_close(filp, current->files); +		filp_close(filp, NULL);  		return 0;  	}  	dp = vmalloc(l);  	if (dp == NULL)  	{  		printk(KERN_INFO "Out of memory loading '%s'.\n", fn); -		filp_close(filp, current->files); +		filp_close(filp, NULL);  		return 0;  	}  	pos = 0; @@ -38,10 +38,10 @@ static int do_mod_firmware_load(const char *fn, char **fp)  	{  		printk(KERN_INFO "Failed to read '%s'.\n", fn);  		vfree(dp); -		filp_close(filp, current->files); +		filp_close(filp, NULL);  		return 0;  	} -	filp_close(filp, current->files); +	filp_close(filp, NULL);  	*fp = dp;  	return (int) l;  }  |