Diffstat (limited to 'fs')
622 files changed, 31454 insertions, 9177 deletions
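Two idioms recur throughout the hunks below and are worth reading with a key in hand: open-coded filp->f_path.dentry->d_inode chains are replaced by the file_inode() accessor, and raw uid_t/gid_t values become the namespace-aware kuid_t/kgid_t types, converted at the userspace boundary with make_kuid()/from_kuid(). A minimal sketch of both idioms follows; example_owner_uid is a hypothetical helper written for illustration, not part of the diff:

#include <linux/fs.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

static uid_t example_owner_uid(struct file *filp)
{
	/* file_inode() replaces filp->f_path.dentry->d_inode */
	struct inode *inode = file_inode(filp);
	kuid_t owner = inode->i_uid;	/* kernel-internal, namespace-safe */

	/* convert back to a userspace-visible uid only at the boundary */
	return from_kuid(&init_user_ns, owner);
}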
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 0a93dc1cb4a..55abfd62654 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -11,8 +11,7 @@ config 9P_FS  if 9P_FS  config 9P_FSCACHE -	bool "Enable 9P client caching support (EXPERIMENTAL)" -	depends on EXPERIMENTAL +	bool "Enable 9P client caching support"  	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y  	help  	  Choose Y here to enable persistent, read-only local diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 15b67916620..7af425f53be 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -23,6 +23,7 @@  #include "acl.h"  #include "v9fs.h"  #include "v9fs_vfs.h" +#include "fid.h"  static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)  { @@ -113,16 +114,12 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)  } -static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) +static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)  {  	int retval;  	char *name;  	size_t size;  	void *buffer; -	struct inode *inode = dentry->d_inode; - -	set_cached_acl(inode, type, acl); -  	if (!acl)  		return 0; @@ -144,17 +141,16 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)  	default:  		BUG();  	} -	retval = v9fs_xattr_set(dentry, name, buffer, size, 0); +	retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);  err_free_out:  	kfree(buffer);  	return retval;  } -int v9fs_acl_chmod(struct dentry *dentry) +int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)  {  	int retval = 0;  	struct posix_acl *acl; -	struct inode *inode = dentry->d_inode;  	if (S_ISLNK(inode->i_mode))  		return -EOPNOTSUPP; @@ -163,25 +159,30 @@ int v9fs_acl_chmod(struct dentry *dentry)  		retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);  		if (retval)  			return retval; -		retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl); +		set_cached_acl(inode, ACL_TYPE_ACCESS, acl); +		retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);  		posix_acl_release(acl);  	}  	return retval;  } -int v9fs_set_create_acl(struct dentry *dentry, -			struct posix_acl **dpacl, struct posix_acl **pacl) +int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, +			struct posix_acl *dacl, struct posix_acl *acl)  { -	if (dentry) { -		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); -		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); -	} -	posix_acl_release(*dpacl); -	posix_acl_release(*pacl); -	*dpacl = *pacl = NULL; +	set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); +	set_cached_acl(inode, ACL_TYPE_ACCESS, acl); +	v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl); +	v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);  	return 0;  } +void v9fs_put_acl(struct posix_acl *dacl, +		  struct posix_acl *acl) +{ +	posix_acl_release(dacl); +	posix_acl_release(acl); +} +  int v9fs_acl_mode(struct inode *dir, umode_t *modep,  		  struct posix_acl **dpacl, struct posix_acl **pacl)  { diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 55955641196..e4f7e882272 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -17,27 +17,33 @@  #ifdef CONFIG_9P_FS_POSIX_ACL  extern int v9fs_get_acl(struct inode *, struct p9_fid *);  extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); -extern int v9fs_acl_chmod(struct dentry *); -extern int v9fs_set_create_acl(struct dentry *, -			       struct posix_acl **, struct posix_acl **); +extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); +extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, +			       struct posix_acl *, struct posix_acl *);  extern int 
v9fs_acl_mode(struct inode *dir, umode_t *modep,  			 struct posix_acl **dpacl, struct posix_acl **pacl); +extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);  #else  #define v9fs_iop_get_acl NULL  static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)  {  	return 0;  } -static inline int v9fs_acl_chmod(struct dentry *dentry) +static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)  {  	return 0;  } -static inline int v9fs_set_create_acl(struct dentry *dentry, -				      struct posix_acl **dpacl, -				      struct posix_acl **pacl) +static inline int v9fs_set_create_acl(struct inode *inode, +				      struct p9_fid *fid, +				      struct posix_acl *dacl, +				      struct posix_acl *acl)  {  	return 0;  } +static inline void v9fs_put_acl(struct posix_acl *dacl, +				struct posix_acl *acl) +{ +}  static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,  				struct posix_acl **dpacl,  				struct posix_acl **pacl) diff --git a/fs/9p/fid.c b/fs/9p/fid.c index da8eefbe830..d51ec9fafcc 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -41,29 +41,16 @@   *   */ -int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)  { -	struct v9fs_dentry *dent; - -	p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", -		 fid->fid, dentry->d_name.name); - -	dent = dentry->d_fsdata; -	if (!dent) { -		dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL); -		if (!dent) -			return -ENOMEM; - -		spin_lock_init(&dent->lock); -		INIT_LIST_HEAD(&dent->fidlist); -		dentry->d_fsdata = dent; -	} - -	spin_lock(&dent->lock); -	list_add(&fid->dlist, &dent->fidlist); -	spin_unlock(&dent->lock); +	hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} -	return 0; +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +{ +	spin_lock(&dentry->d_lock); +	__add_fid(dentry, fid); +	spin_unlock(&dentry->d_lock);  }  /** @@ -74,24 +61,25 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)   *   */ -static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)  { -	struct v9fs_dentry *dent;  	struct p9_fid *fid, *ret;  	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", -		 dentry->d_name.name, dentry, uid, any); -	dent = (struct v9fs_dentry *) dentry->d_fsdata; +		 dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid), +		 any);  	ret = NULL; -	if (dent) { -		spin_lock(&dent->lock); -		list_for_each_entry(fid, &dent->fidlist, dlist) { -			if (any || fid->uid == uid) { +	/* we'll recheck under lock if there's anything to look in */ +	if (dentry->d_fsdata) { +		struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; +		spin_lock(&dentry->d_lock); +		hlist_for_each_entry(fid, h, dlist) { +			if (any || uid_eq(fid->uid, uid)) {  				ret = fid;  				break;  			}  		} -		spin_unlock(&dent->lock); +		spin_unlock(&dentry->d_lock);  	}  	return ret; @@ -126,7 +114,7 @@ err_out:  }  static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, -					       uid_t uid, int any) +					       kuid_t uid, int any)  {  	struct dentry *ds;  	char **wnames, *uname; @@ -214,8 +202,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,  	}  	kfree(wnames);  fid_out: -	if (!IS_ERR(fid)) -		v9fs_fid_add(dentry, fid); +	if (!IS_ERR(fid)) { +		spin_lock(&dentry->d_lock); +		if (d_unhashed(dentry)) { +			spin_unlock(&dentry->d_lock); +			
p9_client_clunk(fid); +			fid = ERR_PTR(-ENOENT); +		} else { +			__add_fid(dentry, fid); +			spin_unlock(&dentry->d_lock); +		} +	}  err_out:  	up_read(&v9ses->rename_sem);  	return fid; @@ -233,7 +230,7 @@ err_out:  struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)  { -	uid_t uid; +	kuid_t uid;  	int  any, access;  	struct v9fs_session_info *v9ses; @@ -253,7 +250,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)  		break;  	default: -		uid = ~0; +		uid = INVALID_UID;  		any = 0;  		break;  	} @@ -272,7 +269,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)  	return ret;  } -static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)  {  	struct p9_fid *fid, *ret; @@ -289,7 +286,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)  	int err;  	struct p9_fid *fid; -	fid = v9fs_fid_clone_with_uid(dentry, 0); +	fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);  	if (IS_ERR(fid))  		goto error_out;  	/* diff --git a/fs/9p/fid.h b/fs/9p/fid.h index bb0b6e7f58f..2b6787fcb62 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -23,28 +23,8 @@  #define FS_9P_FID_H  #include <linux/list.h> -/** - * struct v9fs_dentry - 9p private data stored in dentry d_fsdata - * @lock: protects the fidlist - * @fidlist: list of FIDs currently associated with this dentry - * - * This structure defines the 9p private data associated with - * a particular dentry.  In particular, this private data is used - * to lookup which 9P FID handle should be used for a particular VFS - * operation.  FID handles are associated with dentries instead of - * inodes in order to more closely map functionality to the Plan 9 - * expected behavior for FID reclaimation and tracking. 
- * - * See Also: Mapping FIDs to Linux VFS model in - * Design and Implementation of the Linux 9P File System documentation - */ -struct v9fs_dentry { -	spinlock_t lock; /* protect fidlist */ -	struct list_head fidlist; -}; -  struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);  struct p9_fid *v9fs_fid_clone(struct dentry *dentry); -int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);  struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);  #endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d934f04e773..58e6cbce415 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)  				ret = r;  				continue;  			} -			v9ses->dfltuid = option; +			v9ses->dfltuid = make_kuid(current_user_ns(), option); +			if (!uid_valid(v9ses->dfltuid)) { +				p9_debug(P9_DEBUG_ERROR, +					 "uid field, but not a uid?\n"); +				ret = -EINVAL; +				continue; +			}  			break;  		case Opt_dfltgid:  			r = match_int(&args[0], &option); @@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)  				ret = r;  				continue;  			} -			v9ses->dfltgid = option; +			v9ses->dfltgid = make_kgid(current_user_ns(), option); +			if (!gid_valid(v9ses->dfltgid)) { +				p9_debug(P9_DEBUG_ERROR, +					 "gid field, but not a gid?\n"); +				ret = -EINVAL; +				continue; +			}  			break;  		case Opt_afid:  			r = match_int(&args[0], &option); @@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)  			else if (strcmp(s, "client") == 0) {  				v9ses->flags |= V9FS_ACCESS_CLIENT;  			} else { +				uid_t uid;  				v9ses->flags |= V9FS_ACCESS_SINGLE; -				v9ses->uid = simple_strtoul(s, &e, 10); +				uid = simple_strtoul(s, &e, 10);  				if (*e != '\0') {  					ret = -EINVAL;  					pr_info("Unknown access argument %s\n", @@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)  					kfree(s);  					goto free_and_return;  				} +				v9ses->uid = make_kuid(current_user_ns(), uid); +				if (!uid_valid(v9ses->uid)) { +					ret = -EINVAL; +					pr_info("Uknown uid %s\n", s); +					kfree(s); +					goto free_and_return; +				}  			}  			kfree(s); @@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,  	list_add(&v9ses->slist, &v9fs_sessionlist);  	spin_unlock(&v9fs_sessionlist_lock); -	v9ses->uid = ~0; +	v9ses->uid = INVALID_UID;  	v9ses->dfltuid = V9FS_DEFUID;  	v9ses->dfltgid = V9FS_DEFGID; @@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,  		v9ses->flags &= ~V9FS_ACCESS_MASK;  		v9ses->flags |= V9FS_ACCESS_ANY; -		v9ses->uid = ~0; +		v9ses->uid = INVALID_UID;  	}  	if (!v9fs_proto_dotl(v9ses) ||  		!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { @@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,  		v9ses->flags &= ~V9FS_ACL_MASK;  	} -	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, +	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,  							v9ses->aname);  	if (IS_ERR(fid)) {  		retval = PTR_ERR(fid); @@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,  	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)  		fid->uid = v9ses->uid;  	else -		fid->uid = ~0; +		fid->uid = INVALID_UID;  #ifdef CONFIG_9P_FSCACHE  	/* register the session for caching */ diff --git a/fs/9p/v9fs.h 
b/fs/9p/v9fs.h index 34c59f14a1c..a8e127c8962 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,9 +109,9 @@ struct v9fs_session_info {  	char *uname;		/* user name to mount as */  	char *aname;		/* name of remote hierarchy being mounted */  	unsigned int maxdata;	/* max data for client interface */ -	unsigned int dfltuid;	/* default uid/muid for legacy support */ -	unsigned int dfltgid;	/* default gid for legacy support */ -	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */ +	kuid_t dfltuid;		/* default uid/muid for legacy support */ +	kgid_t dfltgid;		/* default gid for legacy support */ +	kuid_t uid;		/* if ACCESS_SINGLE, the uid that has access */  	struct p9_client *clnt;	/* 9p client */  	struct list_head slist; /* list of sessions registered with v9fs */  	struct backing_dev_info bdi; @@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,  #define V9FS_PORT	564  #define V9FS_DEFUSER	"nobody"  #define V9FS_DEFANAME	"" -#define V9FS_DEFUID	(-2) -#define V9FS_DEFGID	(-2) +#define V9FS_DEFUID	KUIDT_INIT(-2) +#define V9FS_DEFGID	KGIDT_INIT(-2)  static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)  { diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 64600b5d052..f039b104a98 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -83,21 +83,12 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)  static void v9fs_dentry_release(struct dentry *dentry)  { -	struct v9fs_dentry *dent; -	struct p9_fid *temp, *current_fid; - +	struct hlist_node *p, *n;  	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",  		 dentry->d_name.name, dentry); -	dent = dentry->d_fsdata; -	if (dent) { -		list_for_each_entry_safe(current_fid, temp, &dent->fidlist, -									dlist) { -			p9_client_clunk(current_fid); -		} - -		kfree(dent); -		dentry->d_fsdata = NULL; -	} +	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) +		p9_client_clunk(hlist_entry(p, struct p9_fid, dlist)); +	dentry->d_fsdata = NULL;  }  static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) @@ -137,6 +128,7 @@ out_valid:  const struct dentry_operations v9fs_cached_dentry_operations = {  	.d_revalidate = v9fs_lookup_revalidate, +	.d_weak_revalidate = v9fs_lookup_revalidate,  	.d_delete = v9fs_cached_dentry_delete,  	.d_release = v9fs_dentry_release,  }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ff911e77965..be1e34adc3c 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -52,10 +52,9 @@   */  struct p9_rdir { -	struct mutex mutex;  	int head;  	int tail; -	uint8_t *buf; +	uint8_t buf[];  };  /** @@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf)   *   */ -static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)  { -	struct p9_rdir *rdir; -	struct p9_fid *fid; -	int err = 0; - -	fid = filp->private_data; -	if (!fid->rdir) { -		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); - -		if (rdir == NULL) { -			err = -ENOMEM; -			goto exit; -		} -		spin_lock(&filp->f_dentry->d_lock); -		if (!fid->rdir) { -			rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); -			mutex_init(&rdir->mutex); -			rdir->head = rdir->tail = 0; -			fid->rdir = (void *) rdir; -			rdir = NULL; -		} -		spin_unlock(&filp->f_dentry->d_lock); -		kfree(rdir); -	} -exit: -	return err; +	struct p9_fid *fid = filp->private_data; +	if (!fid->rdir) +		fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); +	return fid->rdir; 
 }  /** @@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)  	buflen = fid->clnt->msize - P9_IOHDRSZ; -	err = v9fs_alloc_rdir_buf(filp, buflen); -	if (err) -		goto exit; -	rdir = (struct p9_rdir *) fid->rdir; +	rdir = v9fs_alloc_rdir_buf(filp, buflen); +	if (!rdir) +		return -ENOMEM; -	err = mutex_lock_interruptible(&rdir->mutex); -	if (err) -		return err; -	while (err == 0) { +	while (1) {  		if (rdir->tail == rdir->head) {  			err = v9fs_file_readn(filp, rdir->buf, NULL,  							buflen, filp->f_pos);  			if (err <= 0) -				goto unlock_and_exit; +				return err;  			rdir->head = 0;  			rdir->tail = err; @@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)  					  rdir->tail - rdir->head, &st);  			if (err) {  				p9_debug(P9_DEBUG_VFS, "returned %d\n", err); -				err = -EIO;  				p9stat_free(&st); -				goto unlock_and_exit; +				return -EIO;  			}  			reclen = st.size+2; @@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)  			p9stat_free(&st); -			if (over) { -				err = 0; -				goto unlock_and_exit; -			} +			if (over) +				return 0; +  			rdir->head += reclen;  			filp->f_pos += reclen;  		}  	} - -unlock_and_exit: -	mutex_unlock(&rdir->mutex); -exit: -	return err;  }  /** @@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,  	buflen = fid->clnt->msize - P9_READDIRHDRSZ; -	err = v9fs_alloc_rdir_buf(filp, buflen); -	if (err) -		goto exit; -	rdir = (struct p9_rdir *) fid->rdir; +	rdir = v9fs_alloc_rdir_buf(filp, buflen); +	if (!rdir) +		return -ENOMEM; -	err = mutex_lock_interruptible(&rdir->mutex); -	if (err) -		return err; - -	while (err == 0) { +	while (1) {  		if (rdir->tail == rdir->head) {  			err = p9_client_readdir(fid, rdir->buf, buflen,  						filp->f_pos);  			if (err <= 0) -				goto unlock_and_exit; +				return err;  			rdir->head = 0;  			rdir->tail = err; @@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,  					    &curdirent);  			if (err < 0) {  				p9_debug(P9_DEBUG_VFS, "returned %d\n", err); -				err = -EIO; -				goto unlock_and_exit; +				return -EIO;  			}  			/* d_off in dirent structure tracks the offset into @@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,  					curdirent.d_type);  			oldoffset = curdirent.d_off; -			if (over) { -				err = 0; -				goto unlock_and_exit; -			} +			if (over) +				return 0;  			filp->f_pos = curdirent.d_off;  			rdir->head += err;  		}  	} - -unlock_and_exit: -	mutex_unlock(&rdir->mutex); -exit: -	return err;  } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c2483e97bee..d384a8b77ee 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)  			p9_client_clunk(fid);  			return err;  		} -		if (file->f_flags & O_TRUNC) { -			i_size_write(inode, 0); -			inode->i_blocks = 0; -		}  		if ((file->f_flags & O_APPEND) &&  			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))  			generic_file_llseek(file, 0, SEEK_END); @@ -133,7 +129,7 @@ out_error:  static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)  {  	int res = 0; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -302,7 +298,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)  static int 
v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int ret = -ENOLCK;  	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", @@ -338,7 +334,7 @@ out_err:  static int v9fs_file_flock_dotl(struct file *filp, int cmd,  	struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int ret = -ENOLCK;  	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", @@ -529,7 +525,7 @@ v9fs_file_write(struct file *filp, const char __user * data,  	if (!count)  		goto out; -	retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode, +	retval = v9fs_file_write_internal(file_inode(filp),  					filp->private_data,  					data, count, &origin, 1);  	/* update offset on successful write */ @@ -604,7 +600,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	struct v9fs_inode *v9inode;  	struct page *page = vmf->page;  	struct file *filp = vma->vm_file; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", @@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	lock_page(page);  	if (page->mapping != inode->i_mapping)  		goto out_unlock; +	wait_for_stable_page(page);  	return VM_FAULT_LOCKED;  out_unlock: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 890bed538f9..d86edc8d3fd 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)  		break;  	} -	if (uflags & O_TRUNC) -		ret |= P9_OTRUNC; -  	if (extended) {  		if (uflags & O_EXCL)  			ret |= P9_OEXCL; @@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat)  	wstat->uid = NULL;  	wstat->gid = NULL;  	wstat->muid = NULL; -	wstat->n_uid = ~0; -	wstat->n_gid = ~0; -	wstat->n_muid = ~0; +	wstat->n_uid = INVALID_UID; +	wstat->n_gid = INVALID_GID; +	wstat->n_muid = INVALID_UID;  	wstat->extension = NULL;  } @@ -695,9 +692,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,  				   "inode creation failed %d\n", err);  			goto error;  		} -		err = v9fs_fid_add(dentry, fid); -		if (err < 0) -			goto error; +		v9fs_fid_add(dentry, fid);  		d_instantiate(dentry, inode);  	}  	return ofid; @@ -793,7 +788,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,  	struct p9_fid *dfid, *fid;  	struct inode *inode;  	char *name; -	int result = 0;  	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",  		 dir, dentry->d_name.name, dentry, flags); @@ -811,13 +805,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,  	name = (char *) dentry->d_name.name;  	fid = p9_client_walk(dfid, 1, &name, 1);  	if (IS_ERR(fid)) { -		result = PTR_ERR(fid); -		if (result == -ENOENT) { -			inode = NULL; -			goto inst_out; +		if (fid == ERR_PTR(-ENOENT)) { +			d_add(dentry, NULL); +			return NULL;  		} - -		return ERR_PTR(result); +		return ERR_CAST(fid);  	}  	/*  	 * Make sure we don't use a wrong inode due to parallel @@ -829,14 +821,9 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,  	else  		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);  	if (IS_ERR(inode)) { -		result = PTR_ERR(inode); -		inode = NULL; -		goto error; +		p9_client_clunk(fid); +		return ERR_CAST(inode);  	} -	result = v9fs_fid_add(dentry, fid); -	if (result < 0) -		goto error_iput; 
-inst_out:  	/*  	 * If we had a rename on the server and a parallel lookup  	 * for the new name, then make sure we instantiate with @@ -845,15 +832,13 @@ inst_out:  	 * k/b.  	 */  	res = d_materialise_unique(dentry, inode); -	if (!IS_ERR(res)) -		return res; -	result = PTR_ERR(res); -error_iput: -	iput(inode); -error: -	p9_client_clunk(fid); - -	return ERR_PTR(result); +	if (!res) +		v9fs_fid_add(dentry, fid); +	else if (!IS_ERR(res)) +		v9fs_fid_add(res, fid); +	else +		p9_client_clunk(fid); +	return res;  }  static int diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 40895546e10..53687bbf229 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,   * group of the new file system object.   */ -static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)  {  	BUG_ON(dir_inode == NULL); @@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags)  		{ O_CREAT,	P9_DOTL_CREATE },  		{ O_EXCL,	P9_DOTL_EXCL },  		{ O_NOCTTY,	P9_DOTL_NOCTTY }, -		{ O_TRUNC,	P9_DOTL_TRUNC },  		{ O_APPEND,	P9_DOTL_APPEND },  		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },  		{ O_DSYNC,	P9_DOTL_DSYNC }, @@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  			  int *opened)  {  	int err = 0; -	gid_t gid; +	kgid_t gid;  	umode_t mode;  	char *name = NULL;  	struct p9_qid qid; @@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  	}  	/* Only creates */ -	if (!(flags & O_CREAT) || dentry->d_inode) -		return finish_no_open(file, res); +	if (!(flags & O_CREAT)) +		return	finish_no_open(file, res); +	else if (dentry->d_inode) { +		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) +			return -EEXIST; +		else +			return finish_no_open(file, res); +	}  	v9ses = v9fs_inode2v9ses(dir); @@ -325,13 +330,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);  		goto error;  	} -	err = v9fs_fid_add(dentry, fid); -	if (err < 0) -		goto error; -	d_instantiate(dentry, inode); -  	/* Now set the ACL based on the default value */ -	v9fs_set_create_acl(dentry, &dacl, &pacl); +	v9fs_set_create_acl(inode, fid, dacl, pacl); + +	v9fs_fid_add(dentry, fid); +	d_instantiate(dentry, inode);  	v9inode = V9FS_I(inode);  	mutex_lock(&v9inode->v_mutex); @@ -364,6 +367,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  #endif  	*opened |= FILE_CREATED;  out: +	v9fs_put_acl(dacl, pacl);  	dput(res);  	return err; @@ -373,7 +377,6 @@ error:  err_clunk_old_fid:  	if (ofid)  		p9_client_clunk(ofid); -	v9fs_set_create_acl(NULL, &dacl, &pacl);  	goto out;  } @@ -391,7 +394,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,  	int err;  	struct v9fs_session_info *v9ses;  	struct p9_fid *fid = NULL, *dfid = NULL; -	gid_t gid; +	kgid_t gid;  	char *name;  	umode_t mode;  	struct inode *inode; @@ -430,17 +433,17 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,  	if (err < 0)  		goto error; +	fid = p9_client_walk(dfid, 1, &name, 1); +	if (IS_ERR(fid)) { +		err = PTR_ERR(fid); +		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", +			 err); +		fid = NULL; +		goto error; +	} +  	/* instantiate inode and assign the unopened fid to the dentry */  	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { -		fid = p9_client_walk(dfid, 1, &name, 1); -		if (IS_ERR(fid)) { -			err = 
PTR_ERR(fid); -			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", -				 err); -			fid = NULL; -			goto error; -		} -  		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);  		if (IS_ERR(inode)) {  			err = PTR_ERR(inode); @@ -448,11 +451,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,  				 err);  			goto error;  		} -		err = v9fs_fid_add(dentry, fid); -		if (err < 0) -			goto error; +		v9fs_fid_add(dentry, fid); +		v9fs_set_create_acl(inode, fid, dacl, pacl);  		d_instantiate(dentry, inode);  		fid = NULL; +		err = 0;  	} else {  		/*  		 * Not in cached mode. No need to populate @@ -464,16 +467,15 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,  			err = PTR_ERR(inode);  			goto error;  		} +		v9fs_set_create_acl(inode, fid, dacl, pacl);  		d_instantiate(dentry, inode);  	} -	/* Now set the ACL based on the default value */ -	v9fs_set_create_acl(dentry, &dacl, &pacl);  	inc_nlink(dir);  	v9fs_invalidate_inode_attr(dir);  error:  	if (fid)  		p9_client_clunk(fid); -	v9fs_set_create_acl(NULL, &dacl, &pacl); +	v9fs_put_acl(dacl, pacl);  	return err;  } @@ -567,10 +569,11 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)  	struct v9fs_session_info *v9ses;  	struct p9_fid *fid;  	struct p9_iattr_dotl p9attr; +	struct inode *inode = dentry->d_inode;  	p9_debug(P9_DEBUG_VFS, "\n"); -	retval = inode_change_ok(dentry->d_inode, iattr); +	retval = inode_change_ok(inode, iattr);  	if (retval)  		return retval; @@ -591,23 +594,23 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)  		return PTR_ERR(fid);  	/* Write all dirty data */ -	if (S_ISREG(dentry->d_inode->i_mode)) -		filemap_write_and_wait(dentry->d_inode->i_mapping); +	if (S_ISREG(inode->i_mode)) +		filemap_write_and_wait(inode->i_mapping);  	retval = p9_client_setattr(fid, &p9attr);  	if (retval < 0)  		return retval;  	if ((iattr->ia_valid & ATTR_SIZE) && -	    iattr->ia_size != i_size_read(dentry->d_inode)) -		truncate_setsize(dentry->d_inode, iattr->ia_size); +	    iattr->ia_size != i_size_read(inode)) +		truncate_setsize(inode, iattr->ia_size); -	v9fs_invalidate_inode_attr(dentry->d_inode); -	setattr_copy(dentry->d_inode, iattr); -	mark_inode_dirty(dentry->d_inode); +	v9fs_invalidate_inode_attr(inode); +	setattr_copy(inode, iattr); +	mark_inode_dirty(inode);  	if (iattr->ia_valid & ATTR_MODE) {  		/* We also want to update ACL when we update mode bits */ -		retval = v9fs_acl_chmod(dentry); +		retval = v9fs_acl_chmod(inode, fid);  		if (retval < 0)  			return retval;  	} @@ -692,7 +695,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,  		const char *symname)  {  	int err; -	gid_t gid; +	kgid_t gid;  	char *name;  	struct p9_qid qid;  	struct inode *inode; @@ -741,11 +744,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,  				 err);  			goto error;  		} -		err = v9fs_fid_add(dentry, fid); -		if (err < 0) -			goto error; +		v9fs_fid_add(dentry, fid);  		d_instantiate(dentry, inode);  		fid = NULL; +		err = 0;  	} else {  		/* Not in cached mode. 
No need to populate inode with stat */  		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); @@ -832,7 +834,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,  		dev_t rdev)  {  	int err; -	gid_t gid; +	kgid_t gid;  	char *name;  	umode_t mode;  	struct v9fs_session_info *v9ses; @@ -875,17 +877,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,  		goto error;  	v9fs_invalidate_inode_attr(dir); +	fid = p9_client_walk(dfid, 1, &name, 1); +	if (IS_ERR(fid)) { +		err = PTR_ERR(fid); +		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", +			 err); +		fid = NULL; +		goto error; +	} +  	/* instantiate inode and assign the unopened fid to the dentry */  	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { -		fid = p9_client_walk(dfid, 1, &name, 1); -		if (IS_ERR(fid)) { -			err = PTR_ERR(fid); -			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", -				 err); -			fid = NULL; -			goto error; -		} -  		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);  		if (IS_ERR(inode)) {  			err = PTR_ERR(inode); @@ -893,11 +895,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,  				 err);  			goto error;  		} -		err = v9fs_fid_add(dentry, fid); -		if (err < 0) -			goto error; +		v9fs_set_create_acl(inode, fid, dacl, pacl); +		v9fs_fid_add(dentry, fid);  		d_instantiate(dentry, inode);  		fid = NULL; +		err = 0;  	} else {  		/*  		 * Not in cached mode. No need to populate inode with stat. @@ -908,14 +910,13 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,  			err = PTR_ERR(inode);  			goto error;  		} +		v9fs_set_create_acl(inode, fid, dacl, pacl);  		d_instantiate(dentry, inode);  	} -	/* Now set the ACL based on the default value */ -	v9fs_set_create_acl(dentry, &dacl, &pacl);  error:  	if (fid)  		p9_client_clunk(fid); -	v9fs_set_create_acl(NULL, &dacl, &pacl); +	v9fs_put_acl(dacl, pacl);  	return err;  } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 137d5039689..91dad63e5a2 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -363,5 +363,5 @@ struct file_system_type v9fs_fs_type = {  	.mount = v9fs_mount,  	.kill_sb = v9fs_kill_super,  	.owner = THIS_MODULE, -	.fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, +	.fs_flags = FS_RENAME_DOES_D_MOVE,  }; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 29653b70a9c..c45e016b190 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -111,19 +111,26 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,  int v9fs_xattr_set(struct dentry *dentry, const char *name,  		   const void *value, size_t value_len, int flags)  { +	struct p9_fid *fid = v9fs_fid_lookup(dentry); +	if (IS_ERR(fid)) +		return PTR_ERR(fid); +	return v9fs_fid_xattr_set(fid, name, value, value_len, flags); +} + +int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, +		   const void *value, size_t value_len, int flags) +{  	u64 offset = 0;  	int retval, msize, write_count; -	struct p9_fid *fid = NULL;  	p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",  		 name, value_len, flags); -	fid = v9fs_fid_clone(dentry); -	if (IS_ERR(fid)) { -		retval = PTR_ERR(fid); -		fid = NULL; -		goto error; -	} +	/* Clone it */ +	fid = p9_client_walk(fid, 0, NULL, 1); +	if (IS_ERR(fid)) +		return PTR_ERR(fid); +  	/*  	 * On success fid points to xattr  	 */ @@ -131,7 +138,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,  	if (retval < 0) {  		p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",  			 
retval); -		goto error; +		p9_client_clunk(fid); +		return retval;  	}  	msize = fid->clnt->msize;  	while (value_len) { @@ -144,17 +152,12 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,  		if (write_count < 0) {  			/* error in xattr write */  			retval = write_count; -			goto error; +			break;  		}  		offset += write_count;  		value_len -= write_count;  	} -	/* Total read xattr bytes */ -	retval = offset; -error: -	if (fid) -		retval = p9_client_clunk(fid); -	return retval; +	return p9_client_clunk(fid);  }  ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index eaa837c53bd..eec348a3df7 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -27,6 +27,8 @@ extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,  				  void *, size_t);  extern ssize_t v9fs_xattr_get(struct dentry *, const char *,  			      void *, size_t); +extern int v9fs_fid_xattr_set(struct p9_fid *, const char *, +			  const void *, size_t, int);  extern int v9fs_xattr_set(struct dentry *, const char *,  			  const void *, size_t, int);  extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); diff --git a/fs/Kconfig b/fs/Kconfig index eaff24a1950..780725a463b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,16 +68,6 @@ source "fs/quota/Kconfig"  source "fs/autofs4/Kconfig"  source "fs/fuse/Kconfig" -config CUSE -	tristate "Character device in Userspace support" -	depends on FUSE_FS -	help -	  This FUSE extension allows character devices to be -	  implemented in userspace. - -	  If you want to develop or use userspace character device -	  based on CUSE, answer Y or M. -  config GENERIC_ACL  	bool  	select FS_POSIX_ACL @@ -220,6 +210,7 @@ source "fs/pstore/Kconfig"  source "fs/sysv/Kconfig"  source "fs/ufs/Kconfig"  source "fs/exofs/Kconfig" +source "fs/f2fs/Kconfig"  endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 1d7af79288a..9d53192236f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/  obj-$(CONFIG_OCFS2_FS)		+= ocfs2/  obj-$(CONFIG_BTRFS_FS)		+= btrfs/  obj-$(CONFIG_GFS2_FS)           += gfs2/ +obj-$(CONFIG_F2FS_FS)		+= f2fs/  obj-y				+= exofs/ # Multiple modules  obj-$(CONFIG_CEPH_FS)		+= ceph/  obj-$(CONFIG_PSTORE)		+= pstore/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index e55182a7460..c5a7787dd5e 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -1,6 +1,6 @@  config ADFS_FS -	tristate "ADFS file system support (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "ADFS file system support" +	depends on BLOCK  	help  	  The Acorn Disc Filing System is the standard file system of the  	  RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index b3be2e7c564..9cf874ce833 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -19,7 +19,7 @@ static DEFINE_RWLOCK(adfs_dir_lock);  static int  adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;  	struct object_info obj; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e9bad5093a3..5f95d1ed9c6 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)  	return block_read_full_page(page, adfs_get_block);  } +static void adfs_write_failed(struct address_space 
*mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) +		truncate_pagecache(inode, to, inode->i_size); +} +  static int adfs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,  	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,  				adfs_get_block,  				&ADFS_I(mapping->host)->mmu_private); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		adfs_write_failed(mapping, pos + len);  	return ret;  } diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index cfad9afb476..a04d9e848d0 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -1,6 +1,6 @@  config AFFS_FS -	tristate "Amiga FFS file system support (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "Amiga FFS file system support" +	depends on BLOCK  	help  	  The Fast File System (FFS) is the common file system used on hard  	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index eb82ee53ee0..d9a43674cb9 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -125,9 +125,8 @@ static void  affs_fix_dcache(struct inode *inode, u32 entry_ino)  {  	struct dentry *dentry; -	struct hlist_node *p;  	spin_lock(&inode->i_lock); -	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {  		if (entry_ino == (u32)(long)dentry->d_fsdata) {  			dentry->d_fsdata = (void *)inode->i_ino;  			break; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 8ca8f3a5559..fd11a6d608e 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -42,7 +42,7 @@ const struct inode_operations affs_dir_inode_operations = {  static int  affs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode		*inode = filp->f_path.dentry->d_inode; +	struct inode		*inode = file_inode(filp);  	struct super_block	*sb = inode->i_sb;  	struct buffer_head	*dir_bh;  	struct buffer_head	*fh_bh; diff --git a/fs/affs/file.c b/fs/affs/file.c index 2f4c935cb32..af3261b7810 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {  };  const struct inode_operations affs_file_inode_operations = { -	.truncate	= affs_truncate,  	.setattr	= affs_notify_change,  }; @@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)  	return block_read_full_page(page, affs_get_block);  } +static void affs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		affs_truncate(inode); +	} +} +  static int affs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,  	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,  				affs_get_block,  				&AFFS_I(mapping->host)->mmu_private); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		affs_write_failed(mapping, pos + len);  	return ret;  } diff --git 
a/fs/affs/inode.c b/fs/affs/inode.c index 15c48426822..0e092d08680 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			return error; + +		truncate_setsize(inode, attr->ia_size); +		affs_truncate(inode);  	}  	setattr_copy(inode, attr); diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 8f975f25b48..ebba3b18e5d 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -1,6 +1,6 @@  config AFS_FS -	tristate "Andrew File System support (AFS) (EXPERIMENTAL)" -	depends on INET && EXPERIMENTAL +	tristate "Andrew File System support (AFS)" +	depends on INET  	select AF_RXRPC  	select DNS_RESOLVER  	help @@ -22,8 +22,7 @@ config AFS_DEBUG  	  If unsure, say N.  config AFS_FSCACHE -	bool "Provide AFS client caching support (EXPERIMENTAL)" -	depends on EXPERIMENTAL +	bool "Provide AFS client caching support"  	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y  	help  	  Say Y here if you want AFS data to be cached locally on disk through diff --git a/fs/afs/afs.h b/fs/afs/afs.h index c548aa346f0..3c462ff6db6 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -119,8 +119,8 @@ struct afs_file_status {  	u64			size;		/* file size */  	afs_dataversion_t	data_version;	/* current data version */  	u32			author;		/* author ID */ -	u32			owner;		/* owner ID */ -	u32			group;		/* group ID */ +	kuid_t			owner;		/* owner ID */ +	kgid_t			group;		/* group ID */  	afs_access_t		caller_access;	/* access rights for authenticated caller */  	afs_access_t		anon_access;	/* access rights for unauthenticated caller */  	umode_t			mode;		/* UNIX mode */ @@ -133,13 +133,6 @@ struct afs_file_status {  /*   * AFS file status change request   */ -struct afs_store_status { -	u32			mask;		/* which bits of the struct are set */ -	u32			mtime_client;	/* last time client changed data */ -	u32			owner;		/* owner ID */ -	u32			group;		/* group ID */ -	umode_t			mode;		/* UNIX mode */ -};  #define AFS_SET_MTIME		0x01		/* set the mtime */  #define AFS_SET_OWNER		0x02		/* set the owner ID */ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index db477906ba4..7a465ed0444 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -393,12 +393,12 @@ static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)  	int ret;  	_enter("{%Ld,{%lu}}", -	       file->f_pos, file->f_path.dentry->d_inode->i_ino); +	       file->f_pos, file_inode(file)->i_ino);  	ASSERT(file->private_data != NULL);  	fpos = file->f_pos; -	ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, +	ret = afs_dir_iterate(file_inode(file), &fpos,  			      cookie, filldir, file->private_data);  	file->f_pos = fpos; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 757d664575d..2497bf306c7 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -514,7 +514,7 @@ error:   */  int afs_lock(struct file *file, int cmd, struct file_lock *fl)  { -	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));  	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",  	       vnode->fid.vid, vnode->fid.vnode, cmd, @@ -537,7 +537,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)   */  int afs_flock(struct file *file, int cmd, struct file_lock *fl)  { -	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); +	struct afs_vnode *vnode = 
AFS_FS_I(file_inode(file));  	_enter("{%x:%u},%d,{t=%x,fl=%x}",  	       vnode->fid.vid, vnode->fid.vnode, cmd, diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b960ff05ea0..c2e930ec288 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,  	umode_t mode;  	u64 data_version, size;  	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ +	kuid_t owner; +	kgid_t group;  #define EXTRACT(DST)				\  	do {					\ @@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,  	size = ntohl(*bp++);  	data_version = ntohl(*bp++);  	EXTRACT(status->author); -	EXTRACT(status->owner); +	owner = make_kuid(&init_user_ns, ntohl(*bp++)); +	changed |= !uid_eq(owner, status->owner); +	status->owner = owner;  	EXTRACT(status->caller_access); /* call ticket dependent */  	EXTRACT(status->anon_access);  	EXTRACT(status->mode); @@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,  	bp++; /* seg size */  	status->mtime_client = ntohl(*bp++);  	status->mtime_server = ntohl(*bp++); -	EXTRACT(status->group); +	group = make_kgid(&init_user_ns, ntohl(*bp++)); +	changed |= !gid_eq(group, status->group); +	status->group = group;  	bp++; /* sync counter */  	data_version |= (u64) ntohl(*bp++) << 32;  	EXTRACT(status->lock_count); @@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)  	if (attr->ia_valid & ATTR_UID) {  		mask |= AFS_SET_OWNER; -		owner = attr->ia_uid; +		owner = from_kuid(&init_user_ns, attr->ia_uid);  	}  	if (attr->ia_valid & ATTR_GID) {  		mask |= AFS_SET_GROUP; -		group = attr->ia_gid; +		group = from_kgid(&init_user_ns, attr->ia_gid);  	}  	if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 95cffd38239..789bc253b5f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)  	set_nlink(inode, vnode->status.nlink);  	inode->i_uid		= vnode->status.owner; -	inode->i_gid		= 0; +	inode->i_gid		= GLOBAL_ROOT_GID;  	inode->i_size		= vnode->status.size;  	inode->i_ctime.tv_sec	= vnode->status.mtime_server;  	inode->i_ctime.tv_nsec	= 0; @@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,  	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;  	inode->i_op		= &afs_autocell_inode_operations;  	set_nlink(inode, 2); -	inode->i_uid		= 0; -	inode->i_gid		= 0; +	inode->i_uid		= GLOBAL_ROOT_UID; +	inode->i_gid		= GLOBAL_ROOT_GID;  	inode->i_ctime.tv_sec	= get_seconds();  	inode->i_ctime.tv_nsec	= 0;  	inode->i_atime		= inode->i_mtime = inode->i_ctime; diff --git a/fs/afs/super.c b/fs/afs/super.c index 43165009428..7c31ec39957 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -24,6 +24,8 @@  #include <linux/parser.h>  #include <linux/statfs.h>  #include <linux/sched.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h>  #include "internal.h"  #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ @@ -363,6 +365,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,  	memset(¶ms, 0, sizeof(params)); +	ret = -EINVAL; +	if (current->nsproxy->net_ns != &init_net) +		goto error; +  	/* parse the options and device name */  	if (options) {  		ret = afs_parse_options(¶ms, options, &dev_name); diff --git a/fs/afs/write.c b/fs/afs/write.c index 9aa52d93c73..7e03eadb40c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -120,7 +120,7 @@ int afs_write_begin(struct file *file, struct 
address_space *mapping,  		    struct page **pagep, void **fsdata)  {  	struct afs_writeback *candidate, *wb; -	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));  	struct page *page;  	struct key *key = file->private_data;  	unsigned from = pos & (PAGE_CACHE_SIZE - 1); @@ -245,7 +245,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,  		  loff_t pos, unsigned len, unsigned copied,  		  struct page *page, void *fsdata)  { -	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));  	loff_t i_size, maybe_i_size;  	_enter("{%x:%u},{%lx}", @@ -627,8 +627,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)  ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,  		       unsigned long nr_segs, loff_t pos)  { -	struct dentry *dentry = iocb->ki_filp->f_path.dentry; -	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); +	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));  	ssize_t result;  	size_t count = iov_length(iov, nr_segs); @@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)  	struct aio_ring *ring;  	struct aio_ring_info *info = &ctx->ring_info;  	unsigned nr_events = ctx->max_reqs; -	unsigned long size; +	unsigned long size, populate;  	int nr_pages;  	/* Compensate for the ring buffer's head/tail overlap entry */ @@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)  	down_write(&ctx->mm->mmap_sem);  	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,   					PROT_READ|PROT_WRITE, -					MAP_ANONYMOUS|MAP_PRIVATE, 0); +					MAP_ANONYMOUS|MAP_PRIVATE, 0, +					&populate);  	if (IS_ERR((void *)info->mmap_base)) {  		up_write(&ctx->mm->mmap_sem);  		info->mmap_size = 0; @@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)  		aio_free_ring(ctx);  		return -EAGAIN;  	} +	if (populate) +		mm_populate(info->mmap_base, populate);  	ctx->user_id = info->mmap_base; @@ -588,11 +591,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)  {  	struct mm_struct *mm = current->mm;  	struct kioctx *ctx, *ret = NULL; -	struct hlist_node *n;  	rcu_read_lock(); -	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { +	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {  		/*  		 * RCU protects us against accessing freed memory but  		 * we have to be careful not to get a reference when the diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 28d39fb84ae..47a65df8c87 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -131,7 +131,6 @@ struct file *anon_inode_getfile(const char *name,  	struct qstr this;  	struct path path;  	struct file *file; -	int error;  	if (IS_ERR(anon_inode_inode))  		return ERR_PTR(-ENODEV); @@ -143,7 +142,7 @@ struct file *anon_inode_getfile(const char *name,  	 * Link the inode to a directory entry by creating a unique name  	 * using the inode sequence number.  	 
*/ -	error = -ENOMEM; +	file = ERR_PTR(-ENOMEM);  	this.name = name;  	this.len = strlen(name);  	this.hash = 0; @@ -160,15 +159,12 @@ struct file *anon_inode_getfile(const char *name,  	d_instantiate(path.dentry, anon_inode_inode); -	error = -ENFILE;  	file = alloc_file(&path, OPEN_FMODE(flags), fops); -	if (!file) +	if (IS_ERR(file))  		goto err_dput;  	file->f_mapping = anon_inode_inode->i_mapping; -	file->f_pos = 0;  	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); -	file->f_version = 0;  	file->private_data = priv;  	return file; @@ -177,7 +173,7 @@ err_dput:  	path_put(&path);  err_module:  	module_put(fops->owner); -	return ERR_PTR(error); +	return file;  }  EXPORT_SYMBOL_GPL(anon_inode_getfile); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b785e770795..3f1128b37e4 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -273,7 +273,7 @@ static inline int autofs_prepare_pipe(struct file *pipe)  {  	if (!pipe->f_op || !pipe->f_op->write)  		return -EINVAL; -	if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) +	if (!S_ISFIFO(file_inode(pipe)->i_mode))  		return -EINVAL;  	/* We want a packet pipe */  	pipe->f_flags |= O_DIRECT; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9f68a37bb2b..743c7c2c949 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -159,7 +159,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)  	struct inode *inode;  	if (f) { -		inode = f->f_path.dentry->d_inode; +		inode = file_inode(f);  		sbi = autofs4_sbi(inode->i_sb);  	}  	return sbi; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c93447604da..9bd16255dd9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path)  				goto done;  			}  		} else { -			if (!simple_empty(dentry)) +			if (!simple_empty(dentry)) { +				spin_unlock(&sbi->fs_lock);  				goto done; +			}  		}  		ino->flags |= AUTOFS_INF_PENDING;  		spin_unlock(&sbi->fs_lock); @@ -587,7 +589,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)  	/* This allows root to remove symlinks */  	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	if (atomic_dec_and_test(&ino->count)) {  		p_ino = autofs4_dentry_ino(dentry->d_parent); @@ -874,7 +876,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,  static long autofs4_root_ioctl(struct file *filp,  			       unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);  } @@ -882,7 +884,7 @@ static long autofs4_root_ioctl(struct file *filp,  static long autofs4_root_compat_ioctl(struct file *filp,  			     unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int ret;  	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 03bc1d347d8..3db70dae40d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)  	while (wq) {  		nwq = wq->next;  		wq->status = -ENOENT; /* Magic is gone - report failure */ -		if (wq->name.name) { -			kfree(wq->name.name); -			wq->name.name = NULL; -		} +		kfree(wq->name.name); +		wq->name.name = NULL;  		wq->wait_ctr--;  		wake_up_interruptible(&wq->queue);  		wq = nwq; 
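The fs/autofs4/waitq.c hunk just above relies on kfree(NULL) being defined as a no-op, so the if (wq->name.name) guard it removes was redundant. A small sketch of the simplification, with release_name as a hypothetical helper:

#include <linux/slab.h>

static void release_name(const char **namep)
{
	kfree(*namep);	/* safe even when *namep is NULL */
	*namep = NULL;	/* avoid a dangling pointer or double free later */
}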
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 7835d30f211..edc5cc2aefa 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -1,6 +1,6 @@  config BEFS_FS -	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "BeOS file system (BeFS) support (read only)" +	depends on BLOCK  	select NLS  	help  	  The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 2b3bda8d5e6..c8f4e25eb9e 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -213,7 +213,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)  static int  befs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;  	befs_off_t value; diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index c2336c62024..3728a6479c6 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -1,6 +1,6 @@  config BFS_FS -	tristate "BFS file system support (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "BFS file system support" +	depends on BLOCK  	help  	  Boot File System (BFS) is a file system used under SCO UnixWare to  	  allow the bootloader access to the kernel image and other important diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 2785ef91191..3f422f6bb5c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -28,7 +28,7 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,  static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)  { -	struct inode *dir = f->f_path.dentry->d_inode; +	struct inode *dir = file_inode(f);  	struct buffer_head *bh;  	struct bfs_dirent *de;  	struct bfs_sb_info *info = BFS_SB(dir->i_sb); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index f20e8a71062..ad3ea1497cc 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)  	return block_read_full_page(page, bfs_get_block);  } +static void bfs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) +		truncate_pagecache(inode, to, inode->i_size); +} +  static int bfs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,  	ret = block_write_begin(mapping, pos, len, flags, pagep,  				bfs_get_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		bfs_write_failed(mapping, pos + len);  	return ret;  } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 6043567b95c..bbc8f8827ea 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -214,7 +214,7 @@ static int load_aout_binary(struct linux_binprm * bprm)  	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&  	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||  	    N_TRSIZE(ex) || N_DRSIZE(ex) || -	    i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { +	    i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {  		return -ENOEXEC;  	} @@ -367,7 +367,7 @@ static int load_aout_library(struct file *file)  	int retval;  	struct exec 
ex; -	inode = file->f_path.dentry->d_inode; +	inode = file_inode(file);  	retval = -ENOEXEC;  	error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdbabec..3939829f6c5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@  #include <linux/elf.h>  #include <linux/utsname.h>  #include <linux/coredump.h> +#include <linux/sched.h>  #include <asm/uaccess.h>  #include <asm/param.h>  #include <asm/page.h> @@ -321,6 +322,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,  	return 0;  } +#ifndef elf_map +  static unsigned long elf_map(struct file *filep, unsigned long addr,  		struct elf_phdr *eppnt, int prot, int type,  		unsigned long total_size) @@ -355,6 +358,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,  	return(map_addr);  } +#endif /* !elf_map */ +  static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)  {  	int i, first_idx = -1, last_idx = -1; @@ -1140,7 +1145,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,  	/* By default, dump shared memory if mapped from an anonymous file. */  	if (vma->vm_flags & VM_SHARED) { -		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? +		if (file_inode(vma->vm_file)->i_nlink == 0 ?  		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))  			goto whole;  		return 0; @@ -1248,7 +1253,7 @@ static int writenote(struct memelfnote *men, struct file *file,  #undef DUMP_WRITE  static void fill_elf_header(struct elfhdr *elf, int segs, -			    u16 machine, u32 flags, u8 osabi) +			    u16 machine, u32 flags)  {  	memset(elf, 0, sizeof(*elf)); @@ -1320,8 +1325,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,  		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);  		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);  	} else { -		cputime_to_timeval(p->utime, &prstatus->pr_utime); -		cputime_to_timeval(p->stime, &prstatus->pr_stime); +		cputime_t utime, stime; + +		task_cputime(p, &utime, &stime); +		cputime_to_timeval(utime, &prstatus->pr_utime); +		cputime_to_timeval(stime, &prstatus->pr_stime);  	}  	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);  	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); @@ -1630,7 +1638,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,  	 * Initialize the ELF file header.  	 */  	fill_elf_header(elf, phdrs, -			view->e_machine, view->e_flags, view->ei_osabi); +			view->e_machine, view->e_flags);  	/*  	 * Allocate a structure for each thread. @@ -1870,7 +1878,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,  	elf_core_copy_regs(&info->prstatus->pr_reg, regs);  	/* Set up header */ -	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); +	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);  	/*  	 * Set up the notes in similar form to SVR4 core dumps made diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index dc84732e554..9c13e023e2b 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -909,7 +909,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,  dynamic_error:  	printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", -	       what, file->f_path.dentry->d_inode->i_ino); +	       what, file_inode(file)->i_ino);  	return -ELIBBAD;  } @@ -1219,7 +1219,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)  	/* By default, dump shared memory if mapped from an anonymous file. 
*/  	if (vma->vm_flags & VM_SHARED) { -		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { +		if (file_inode(vma->vm_file)->i_nlink == 0) {  			dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);  			kdcore("%08lx: %08lx: %s (share)", vma->vm_start,  			       vma->vm_flags, dump_ok ? "yes" : "no"); @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,  		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);  		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);  	} else { -		cputime_to_timeval(p->utime, &prstatus->pr_utime); -		cputime_to_timeval(p->stime, &prstatus->pr_stime); +		cputime_t utime, stime; + +		task_cputime(p, &utime, &stime); +		cputime_to_timeval(utime, &prstatus->pr_utime); +		cputime_to_timeval(stime, &prstatus->pr_stime);  	}  	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);  	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b56371981d1..2036d21baae 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -438,7 +438,7 @@ static int load_flat_file(struct linux_binprm * bprm,  	int ret;  	hdr = ((struct flat_hdr *) bprm->buf);		/* exec-header */ -	inode = bprm->file->f_path.dentry->d_inode; +	inode = file_inode(bprm->file);  	text_len  = ntohl(hdr->data_start);  	data_len  = ntohl(hdr->data_end) - ntohl(hdr->data_start); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9be335fb8a7..fecbbf3f8ff 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -172,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm)  		goto _error;  	bprm->argc ++; -	bprm->interp = iname;	/* for binfmt_script */ +	/* Update interp in case binfmt_script needs it. */ +	retval = bprm_change_interp(iname, bprm); +	if (retval < 0) +		goto _error;  	interp_file = open_exec (iname);  	retval = PTR_ERR (interp_file); @@ -528,7 +531,7 @@ static void kill_node(Node *e)  static ssize_t  bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)  { -	Node *e = file->f_path.dentry->d_inode->i_private; +	Node *e = file_inode(file)->i_private;  	ssize_t res;  	char *page; @@ -547,7 +550,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,  				size_t count, loff_t *ppos)  {  	struct dentry *root; -	Node *e = file->f_path.dentry->d_inode->i_private; +	Node *e = file_inode(file)->i_private;  	int res = parse_command(buffer, count);  	switch (res) { diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 1610a91637e..5027a3e1492 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -80,7 +80,9 @@ static int load_script(struct linux_binprm *bprm)  	retval = copy_strings_kernel(1, &i_name, bprm);  	if (retval) return retval;   	bprm->argc++; -	bprm->interp = interp; +	retval = bprm_change_interp(interp, bprm); +	if (retval < 0) +		return retval;  	/*  	 * OK, now restart the process with the interpreter's dentry. 
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error)  	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))  		error = -EIO; +	trace_block_bio_complete(bio, error); +  	if (bio->bi_end_io)  		bio->bi_end_io(bio, error);  } diff --git a/fs/block_dev.c b/fs/block_dev.c index 172f8491a2b..aea605c98ba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -318,7 +318,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,  /*   * private llseek: - * for a block special file file->f_path.dentry->d_inode->i_size is zero + * for a block special file file_inode(file)->i_size is zero   * so we compute the size by hand (just as in block_read/write above)   */  static loff_t block_llseek(struct file *file, loff_t offset, int whence) @@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk)  	mutex_lock(&bdev->bd_mutex);  	check_disk_size_change(disk, bdev); +	bdev->bd_invalidated = 0;  	mutex_unlock(&bdev->bd_mutex);  	bdput(bdev);  	return ret; @@ -1032,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)  {  	unsigned bsize = bdev_logical_block_size(bdev); -	bdev->bd_inode->i_size = size; +	mutex_lock(&bdev->bd_inode->i_mutex); +	i_size_write(bdev->bd_inode, size); +	mutex_unlock(&bdev->bd_inode->i_mutex);  	while (bsize < PAGE_CACHE_SIZE) {  		if (size & bsize)  			break; @@ -1117,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  				}  			} -			if (!ret && !bdev->bd_openers) { +			if (!ret) {  				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);  				bdi = blk_get_backing_dev_info(bdev);  				if (bdi == NULL) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index d33f01c08b6..9a8622a5b86 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,11 +1,13 @@  config BTRFS_FS -	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" -	depends on EXPERIMENTAL +	tristate "Btrfs filesystem Unstable disk format"  	select LIBCRC32C  	select ZLIB_INFLATE  	select ZLIB_DEFLATE  	select LZO_COMPRESS  	select LZO_DECOMPRESS +	select RAID6_PQ +	select XOR_BLOCKS +  	help  	  Btrfs is a new filesystem with extents, writable snapshotting,  	  support for multiple devices and many more features. 
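The fs/block_dev.c hunk above stops assigning bd_inode->i_size directly and instead updates it under the inode mutex. A minimal sketch of the pattern, assuming a 3.9-era kernel where the per-inode lock is i_mutex; the helper name is illustrative, not from the patch:

/*
 * i_size_write() itself does no locking; on 32-bit SMP kernels it
 * wraps the update in a seqcount, so unserialized writers can leave
 * the seqcount odd and make a concurrent i_size_read() spin forever.
 */
static void sketch_set_bdev_size(struct block_device *bdev, loff_t size)
{
	mutex_lock(&bdev->bd_inode->i_mutex);	/* serialize i_size writers */
	i_size_write(bdev->bd_inode, size);
	mutex_unlock(&bdev->bd_inode->i_mutex);
}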
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7df3e0f0ee5..3932224f99e 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \  	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \  	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ -	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o +	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04edf69be87..bd605c87adf 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		err = __resolve_indirect_ref(fs_info, search_commit_root,  					     time_seq, ref, parents,  					     extent_item_pos); -		if (err) { -			if (ret == 0) -				ret = err; +		if (err)  			continue; -		}  		/* we put the first parent into the ref at hand */  		ULIST_ITER_INIT(&uiter); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d61feca7945..310a7f6d09b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -19,7 +19,7 @@  #ifndef __BTRFS_BACKREF__  #define __BTRFS_BACKREF__ -#include "ioctl.h" +#include <linux/btrfs.h>  #include "ulist.h"  #include "extent_io.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242bc4f..d9b97d4960e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,8 @@  #define BTRFS_INODE_HAS_ASYNC_EXTENT		6  #define BTRFS_INODE_NEEDS_FULL_SYNC		7  #define BTRFS_INODE_COPY_EVERYTHING		8 +#define BTRFS_INODE_IN_DELALLOC_LIST		9 +#define BTRFS_INODE_READDIO_NEED_LOCK		10  /* in memory btrfs inode */  struct btrfs_inode { @@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)  	return 0;  } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. 
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ +	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); +	smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ +	smp_mb__before_clear_bit(); +	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, +		  &BTRFS_I(inode)->runtime_flags); +} +  #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d47bfb62b..18af6f48781 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(  	    (bh->b_data + (dev_bytenr & 4095));  	if (btrfs_super_bytenr(super_tmp) != dev_bytenr || -	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, -		    sizeof(super_tmp->magic)) || +	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||  	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||  	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||  	    btrfs_super_leafsize(super_tmp) != state->metablock_size || diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 94ab2f80e7e..15b94089abc 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,  		page = compressed_pages[pg_index];  		page->mapping = inode->i_mapping;  		if (bio->bi_size) -			ret = io_tree->ops->merge_bio_hook(page, 0, +			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,  							   PAGE_CACHE_SIZE,  							   bio, 0);  		else @@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  		page->index = em_start >> PAGE_CACHE_SHIFT;  		if (comp_bio->bi_size) -			ret = tree->ops->merge_bio_hook(page, 0, +			ret = tree->ops->merge_bio_hook(READ, page, 0,  							PAGE_CACHE_SIZE,  							comp_bio, 0);  		else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c7b67cf24bb..ecd25a1b4e5 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1138,13 +1138,14 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,  		switch (tm->op) {  		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:  			BUG_ON(tm->slot < n); -		case MOD_LOG_KEY_REMOVE: -			n++; +			/* Fallthrough */  		case MOD_LOG_KEY_REMOVE_WHILE_MOVING: +		case MOD_LOG_KEY_REMOVE:  			btrfs_set_node_key(eb, &tm->key, tm->slot);  			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);  			btrfs_set_node_ptr_generation(eb, tm->slot,  						      tm->generation); +			n++;  			break;  		case MOD_LOG_KEY_REPLACE:  			BUG_ON(tm->slot >= n); @@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,  	__tree_mod_log_rewind(eb_rewin, time_seq, tm);  	WARN_ON(btrfs_header_nritems(eb_rewin) > -		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); +		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));  	return eb_rewin;  } @@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)   */  int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct extent_buffer *parent, -		       int start_slot, int cache_only, u64 *last_ret, +		       int start_slot, u64 *last_ret,  		       struct btrfs_key *progress)  {  	struct extent_buffer *cur; @@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  	struct btrfs_disk_key disk_key;  	parent_level = btrfs_header_level(parent); -	if (cache_only && parent_level != 1) -		return 0;  	WARN_ON(trans->transaction != root->fs_info->running_transaction);  	
WARN_ON(trans->transid != root->fs_info->generation); @@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		else  			uptodate = 0;  		if (!cur || !uptodate) { -			if (cache_only) { -				free_extent_buffer(cur); -				continue; -			}  			if (!cur) {  				cur = read_tree_block(root, blocknr,  							 blocksize, gen); @@ -4611,12 +4606,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	u32 nritems;  	int ret; -	if (level) { -		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, -					      MOD_LOG_KEY_REMOVE); -		BUG_ON(ret < 0); -	} -  	nritems = btrfs_header_nritems(parent);  	if (slot != nritems - 1) {  		if (level)  			tree_mod_log_eb_move(root->fs_info, parent, slot,  				parent, slot + 1, nritems - slot - 1);  		memmove_extent_buffer(parent,  			      btrfs_node_key_ptr_offset(slot),  			      btrfs_node_key_ptr_offset(slot + 1),  			      sizeof(struct btrfs_key_ptr) *  			      (nritems - slot - 1)); +	} else if (level) { +		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0);  	}  	nritems--; @@ -4827,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  /*   * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id.  This is used by the btree defrag code, and tree logging + * for nodes or leaves that have a minimum transaction id. + * This is used by the btree defrag code, and tree logging   *   * This does not cow, but it does stuff the starting key it finds back   * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -4849,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)   */  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,  			 struct btrfs_key *max_key, -			 struct btrfs_path *path, int cache_only, +			 struct btrfs_path *path,  			 u64 min_trans)  {  	struct extent_buffer *cur; @@ -4889,15 +4882,12 @@ again:  		if (sret && slot > 0)  			slot--;  		/* -		 * check this node pointer against the cache_only and -		 * min_trans parameters.  If it isn't in cache or is too -		 * old, skip to the next one. +		 * check this node pointer against the min_trans parameter. +		 * If it is too old, skip to the next one.  		 */  		while (slot < nritems) {  			u64 blockptr;  			u64 gen; -			struct extent_buffer *tmp; -			struct btrfs_disk_key disk_key;  			blockptr = btrfs_node_blockptr(cur, slot);  			gen = btrfs_node_ptr_generation(cur, slot); @@ -4905,27 +4895,7 @@ again:  				slot++;  				continue;  			} -			if (!cache_only) -				break; - -			if (max_key) { -				btrfs_node_key(cur, &disk_key, slot); -				if (comp_keys(&disk_key, max_key) >= 0) { -					ret = 1; -					goto out; -				} -			} - -			tmp = btrfs_find_tree_block(root, blockptr, -					    btrfs_level_size(root, level - 1)); - -			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { -				free_extent_buffer(tmp); -				break; -			} -			if (tmp) -				free_extent_buffer(tmp); -			slot++; +			break;  		}  find_next_key:  		/* @@ -4936,7 +4906,7 @@ find_next_key:  			path->slots[level] = slot;  			btrfs_set_path_blocking(path);  			sret = btrfs_find_next_key(root, path, min_key, level, -						  cache_only, min_trans); +						  min_trans);  			if (sret == 0) {  				btrfs_release_path(path);  				goto again;  			} @@ -5401,8 +5371,7 @@ out:  /*   * this is similar to btrfs_next_leaf, but does not try to preserve   * and fixup the path.  
It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. + * tree based on the current path and the min_trans parameters.   *   * 0 is returned if another key is found, < 0 if there are any errors   * and 1 is returned if there are no higher keys in the tree @@ -5411,8 +5380,7 @@ out:   * calling this function.   */  int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, -			struct btrfs_key *key, int level, -			int cache_only, u64 min_trans) +			struct btrfs_key *key, int level, u64 min_trans)  {  	int slot;  	struct extent_buffer *c; @@ -5463,22 +5431,8 @@ next:  		if (level == 0)  			btrfs_item_key_to_cpu(c, key, slot);  		else { -			u64 blockptr = btrfs_node_blockptr(c, slot);  			u64 gen = btrfs_node_ptr_generation(c, slot); -			if (cache_only) { -				struct extent_buffer *cur; -				cur = btrfs_find_tree_block(root, blockptr, -					    btrfs_level_size(root, level - 1)); -				if (!cur || -				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) { -					slot++; -					if (cur) -						free_extent_buffer(cur); -					goto next; -				} -				free_extent_buffer(cur); -			}  			if (gen < min_trans) {  				slot++;  				goto next; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b05727..0d82922179d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -31,10 +31,10 @@  #include <trace/events/btrfs.h>  #include <asm/kmap_types.h>  #include <linux/pagemap.h> +#include <linux/btrfs.h>  #include "extent_io.h"  #include "extent_map.h"  #include "async-thread.h" -#include "ioctl.h"  struct btrfs_trans_handle;  struct btrfs_transaction; @@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;  extern struct kmem_cache *btrfs_free_space_cachep;  struct btrfs_ordered_sum; -#define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */  #define BTRFS_MAX_MIRRORS 3 @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };  /* ioprio of readahead is set to idle */  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024) +  /*   * The key defines the order in the tree, and so it also defines (optimal)   * block layout. 
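The BTRFS_MAGIC hunk above replaces the 8-byte string "_BHRfS_M" (compared with strncmp()) by a u64 constant compared directly against the on-disk value, as the check-integrity.c hunk earlier and the disk-io.c hunks later in this diff do via cpu_to_le64(). The constant is just those eight ASCII bytes read as a little-endian 64-bit integer; a small user-space C sketch (assuming a little-endian host) reproduces it:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	const char magic[8] = "_BHRfS_M";	/* 8 bytes, trailing NUL not stored */
	uint64_t v;

	memcpy(&v, magic, sizeof(v));
	/* prints 0x4D5F53665248425F, matching the new #define */
	printf("0x%016llX\n", (unsigned long long)v);
	return 0;
}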
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)  /*   * File system states   */ +#define BTRFS_FS_STATE_ERROR		0 +#define BTRFS_FS_STATE_REMOUNTING	1 +/* Super block flags */  /* Errors detected */  #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2) @@ -502,6 +507,7 @@ struct btrfs_super_block {  #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)  #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)  #define BTRFS_FEATURE_COMPAT_SUPP		0ULL  #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL @@ -511,6 +517,7 @@ struct btrfs_super_block {  	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\  	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\  	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\ +	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\  	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)  /* @@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {  #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)  #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)  #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5		(1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6		(1ULL << 8)  #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES		5 + +enum btrfs_raid_types { +	BTRFS_RAID_RAID10, +	BTRFS_RAID_RAID1, +	BTRFS_RAID_DUP, +	BTRFS_RAID_RAID0, +	BTRFS_RAID_SINGLE, +	BTRFS_RAID_RAID5, +	BTRFS_RAID_RAID6, +	BTRFS_NR_RAID_TYPES +};  #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \  					 BTRFS_BLOCK_GROUP_SYSTEM |  \ @@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {  #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \  					 BTRFS_BLOCK_GROUP_RAID1 |   \ +					 BTRFS_BLOCK_GROUP_RAID5 |   \ +					 BTRFS_BLOCK_GROUP_RAID6 |   \  					 BTRFS_BLOCK_GROUP_DUP |     \  					 BTRFS_BLOCK_GROUP_RAID10)  /* @@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {  	u64 flags;  	u64 sectorsize;  	u64 cache_generation; + +	/* for raid56, this is a full stripe, without parity */ +	unsigned long full_stripe_len; +  	unsigned int ro:1;  	unsigned int dirty:1;  	unsigned int iref:1; @@ -1225,6 +1250,28 @@ struct seq_list {  	u64 seq;  }; +enum btrfs_orphan_cleanup_state { +	ORPHAN_CLEANUP_STARTED	= 1, +	ORPHAN_CLEANUP_DONE	= 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { +	struct list_head hash_list; +	wait_queue_head_t wait; +	spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { +	struct list_head stripe_cache; +	spinlock_t cache_lock; +	int cache_size; +	struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 +  /* fs_info */  struct reloc_control;  struct btrfs_device; @@ -1250,6 +1297,7 @@ struct btrfs_fs_info {  	/* block group cache stuff */  	spinlock_t block_group_cache_lock; +	u64 first_logical_byte;  	struct rb_root block_group_cache_tree;  	/* keep track of unallocated space */ @@ -1288,7 +1336,23 @@ struct btrfs_fs_info {  	u64 last_trans_log_full_commit;  	unsigned long mount_opt;  	unsigned long compress_type:4; +	/* +	 * It is a suggestive number, the read side is safe even if it gets a +	 * wrong number because we will write out the data into a regular +	 * extent. The write side (mount/remount) is under ->s_umount lock, +	 * so it is also safe. +	 */  	u64 max_inline; +	/* +	 * Protected by ->chunk_mutex and sb->s_umount. 
+	 * +	 * The reason that we use two locks to protect it is because only +	 * remount and mount operations can change it and these two operations +	 * are under sb->s_umount, but the read side (chunk allocation) can not +	 * acquire sb->s_umount or the deadlock would happen. So we use two +	 * locks to protect it. On the write side, we must acquire two locks, +	 * and on the read side, we just need to acquire one of them. +	 */  	u64 alloc_start;  	struct btrfs_transaction *running_transaction;  	wait_queue_head_t transaction_throttle; @@ -1307,6 +1371,13 @@ struct btrfs_fs_info {  	struct mutex cleaner_mutex;  	struct mutex chunk_mutex;  	struct mutex volume_mutex; + +	/* this is used during read/modify/write to make sure +	 * no two ios are trying to mod the same stripe at the same +	 * time +	 */ +	struct btrfs_stripe_hash_table *stripe_hash_table; +  	/*  	 * this protects the ordered operations list only while we are  	 * processing all of the entries on it.  This way we make @@ -1365,6 +1436,7 @@ struct btrfs_fs_info {  	 */  	struct list_head ordered_extents; +	spinlock_t delalloc_lock;  	/*  	 * all of the inodes that have delalloc bytes.  It is possible for  	 * this list to be empty even when there is still dirty data=ordered @@ -1373,13 +1445,6 @@ struct btrfs_fs_info {  	struct list_head delalloc_inodes;  	/* -	 * special rename and truncate targets that must be on disk before -	 * we're allowed to commit.  This is basically the ext3 style -	 * data=ordered list. -	 */ -	struct list_head ordered_operations; - -	/*  	 * there is a pool of worker threads for checksumming during writes  	 * and a pool for checksumming after reads.  This is because readers  	 * can run with FS locks held, and the writers may be waiting for @@ -1395,6 +1460,8 @@ struct btrfs_fs_info {  	struct btrfs_workers flush_workers;  	struct btrfs_workers endio_workers;  	struct btrfs_workers endio_meta_workers; +	struct btrfs_workers endio_raid56_workers; +	struct btrfs_workers rmw_workers;  	struct btrfs_workers endio_meta_write_workers;  	struct btrfs_workers endio_write_workers;  	struct btrfs_workers endio_freespace_worker; @@ -1423,10 +1490,12 @@ struct btrfs_fs_info {  	u64 total_pinned; -	/* protected by the delalloc lock, used to keep from writing -	 * metadata until there is a nice batch -	 */ -	u64 dirty_metadata_bytes; +	/* used to keep from writing metadata until there is a nice batch */ +	struct percpu_counter dirty_metadata_bytes; +	struct percpu_counter delalloc_bytes; +	s32 dirty_metadata_batch; +	s32 delalloc_batch; +  	struct list_head dirty_cowonly_roots;  	struct btrfs_fs_devices *fs_devices; @@ -1442,9 +1511,6 @@ struct btrfs_fs_info {  	struct reloc_control *reloc_ctl; -	spinlock_t delalloc_lock; -	u64 delalloc_bytes; -  	/* data_alloc_cluster is only used in ssd mode */  	struct btrfs_free_cluster data_alloc_cluster; @@ -1456,6 +1522,8 @@ struct btrfs_fs_info {  	struct rb_root defrag_inodes;  	atomic_t defrag_running; +	/* Used to protect avail_{data, metadata, system}_alloc_bits */ +	seqlock_t profiles_lock;  	/*  	 * these three are in extended format (availability of single  	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1520,7 +1588,7 @@ struct btrfs_fs_info {  	u64 qgroup_seq;  	/* filesystem state */ -	u64 fs_state; +	unsigned long fs_state;  	struct btrfs_delayed_root *delayed_root; @@ -1623,6 +1691,9 @@ struct btrfs_root {  	struct list_head root_list; +	spinlock_t log_extents_lock[2]; +	struct list_head logged_list[2]; +  	spinlock_t orphan_lock;  	atomic_t 
orphan_inodes;  	struct btrfs_block_rsv *orphan_block_rsv; @@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {  #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)  #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)  #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \  					 BTRFS_MOUNT_##opt)  /* @@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  			     u64 num_bytes, u64 *refs, u64 *flags);  int btrfs_pin_extent(struct btrfs_root *root,  		     u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, -				    struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,  				    u64 bytenr, u64 num_bytes);  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root, @@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,  int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,  				  struct inode *inode);  void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +				     struct btrfs_block_rsv *rsv, +				     int nitems, +				     u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, +				      struct btrfs_block_rsv *rsv, +				      u64 qgroup_reserved);  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); @@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);  struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);  int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,  			struct btrfs_key *key, int lowest_level, -			int cache_only, u64 min_trans); +			u64 min_trans);  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,  			 struct btrfs_key *max_key, -			 struct btrfs_path *path, int cache_only, +			 struct btrfs_path *path,  			 u64 min_trans);  enum btrfs_compare_tree_result {  	BTRFS_COMPARE_TREE_NEW, @@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,  			       int find_higher, int return_any);  int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct extent_buffer *parent, -		       int start_slot, int cache_only, u64 *last_ret, +		       int start_slot, u64 *last_ret,  		       struct btrfs_key *progress);  void btrfs_release_path(struct btrfs_path *p);  struct btrfs_path *btrfs_alloc_path(void); @@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,  		     struct writeback_control *wbc);  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  			     struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, -			 size_t size, struct bio *bio, unsigned long bio_flags); - +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +			 size_t size, struct bio *bio, +			 unsigned long bio_flags);  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);  int btrfs_readpage(struct file *file, struct page *page);  void btrfs_evict_inode(struct inode *inode); 
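Several hunks in this header and in fs/btrfs/disk-io.c further down replace the delalloc_lock-protected u64 counters (dirty_metadata_bytes, delalloc_bytes) with per-cpu counters plus a per-fs batch size. A minimal sketch of the batched pattern against the 3.9-era <linux/percpu_counter.h> API; the names and sizing mirror what open_ctree() computes but are illustrative, and the patch itself uses PAGE_CACHE_SIZE rather than PAGE_SIZE:

#include <linux/cpumask.h>
#include <linux/log2.h>
#include <linux/percpu_counter.h>

static struct percpu_counter dirty_bytes;
static s32 dirty_batch;

static int sketch_counter_init(void)
{
	/* cpu-local deltas are folded into the global sum every +/- batch */
	dirty_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids));
	return percpu_counter_init(&dirty_bytes, 0);	/* destroy on teardown */
}

/* hot path: no shared cacheline is touched until the batch overflows */
static void sketch_account(s64 delta)
{
	__percpu_counter_add(&dirty_bytes, delta, dirty_batch);
}

/* flush path: cheap approximate compare, as btree_writepages() now does */
static int sketch_over_thresh(s64 thresh)
{
	return percpu_counter_compare(&dirty_bytes, thresh) >= 0;
}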
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,  /* tree-defrag.c */  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, int cache_only); +			struct btrfs_root *root);  /* sysfs.c */  int btrfs_init_sysfs(void); @@ -3620,11 +3696,14 @@ __printf(5, 6)  void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,  		   unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic().  Otherwise we BUG() here. + */  #define btrfs_panic(fs_info, errno, fmt, args...)			\  do {									\ -	struct btrfs_fs_info *_i = (fs_info);				\ -	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\ -	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\ +	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\ +	BUG();								\  } while (0)  /* acl.c */ @@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)  		return 1;  	return 0;  } + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ +	return signal_pending(current); +} + +  #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 34836036f01..0b278b117cb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,  				     struct btrfs_delayed_item *delayed_item)  {  	struct extent_buffer *leaf; -	struct btrfs_item *item;  	char *ptr;  	int ret; @@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,  	leaf = path->nodes[0]; -	item = btrfs_item_nr(leaf, path->slots[0]);  	ptr = btrfs_item_ptr(leaf, path->slots[0], char);  	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, @@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)  	}  } -static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, -				      struct btrfs_root *root, -				      struct btrfs_path *path, -				      struct btrfs_delayed_node *node) +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, +					struct btrfs_root *root, +					struct btrfs_path *path, +					struct btrfs_delayed_node *node)  {  	struct btrfs_key key;  	struct btrfs_inode_item *inode_item;  	struct extent_buffer *leaf;  	int ret; -	mutex_lock(&node->mutex); -	if (!node->inode_dirty) { -		mutex_unlock(&node->mutex); -		return 0; -	} -  	key.objectid = node->inode_id;  	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);  	key.offset = 0; +  	ret = btrfs_lookup_inode(trans, root, path, &key, 1);  	if (ret > 0) {  		btrfs_release_path(path); -		mutex_unlock(&node->mutex);  		return -ENOENT;  	} else if (ret < 0) { -		mutex_unlock(&node->mutex);  		return ret;  	} @@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,  	btrfs_delayed_inode_release_metadata(root, node);  	btrfs_release_delayed_inode(node); -	mutex_unlock(&node->mutex);  	return 0;  } +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, +					     struct btrfs_root *root, +					     struct btrfs_path *path, +					     struct btrfs_delayed_node *node) +{ +	int ret; + +	mutex_lock(&node->mutex); +	if (!node->inode_dirty) { +		mutex_unlock(&node->mutex); +		return 0; +	} + +	ret = __btrfs_update_delayed_inode(trans, root, path, node); +	mutex_unlock(&node->mutex); +	return ret; +} + +static inline int 
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, +				   struct btrfs_path *path, +				   struct btrfs_delayed_node *node) +{ +	int ret; + +	ret = btrfs_insert_delayed_items(trans, path, node->root, node); +	if (ret) +		return ret; + +	ret = btrfs_delete_delayed_items(trans, path, node->root, node); +	if (ret) +		return ret; + +	ret = btrfs_update_delayed_inode(trans, node->root, path, node); +	return ret; +} +  /*   * Called when committing the transaction.   * Returns 0 on success. @@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,  static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root, int nr)  { -	struct btrfs_root *curr_root = root;  	struct btrfs_delayed_root *delayed_root;  	struct btrfs_delayed_node *curr_node, *prev_node;  	struct btrfs_path *path; @@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,  	curr_node = btrfs_first_delayed_node(delayed_root);  	while (curr_node && (!count || (count && nr--))) { -		curr_root = curr_node->root; -		ret = btrfs_insert_delayed_items(trans, path, curr_root, -						 curr_node); -		if (!ret) -			ret = btrfs_delete_delayed_items(trans, path, -						curr_root, curr_node); -		if (!ret) -			ret = btrfs_update_delayed_inode(trans, curr_root, -						path, curr_node); +		ret = __btrfs_commit_inode_delayed_items(trans, path, +							 curr_node);  		if (ret) {  			btrfs_release_delayed_node(curr_node);  			curr_node = NULL; @@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,  	return __btrfs_run_delayed_items(trans, root, nr);  } -static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, -					      struct btrfs_delayed_node *node) +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, +				     struct inode *inode)  { +	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);  	struct btrfs_path *path;  	struct btrfs_block_rsv *block_rsv;  	int ret; +	if (!delayed_node) +		return 0; + +	mutex_lock(&delayed_node->mutex); +	if (!delayed_node->count) { +		mutex_unlock(&delayed_node->mutex); +		btrfs_release_delayed_node(delayed_node); +		return 0; +	} +	mutex_unlock(&delayed_node->mutex); +  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM;  	path->leave_spinning = 1;  	block_rsv = trans->block_rsv; -	trans->block_rsv = &node->root->fs_info->delayed_block_rsv; +	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; -	ret = btrfs_insert_delayed_items(trans, path, node->root, node); -	if (!ret) -		ret = btrfs_delete_delayed_items(trans, path, node->root, node); -	if (!ret) -		ret = btrfs_update_delayed_inode(trans, node->root, path, node); -	btrfs_free_path(path); +	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); +	btrfs_release_delayed_node(delayed_node); +	btrfs_free_path(path);  	trans->block_rsv = block_rsv; +  	return ret;  } -int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, -				     struct inode *inode) +int btrfs_commit_inode_delayed_inode(struct inode *inode)  { +	struct btrfs_trans_handle *trans;  	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); +	struct btrfs_path *path; +	struct btrfs_block_rsv *block_rsv;  	int ret;  	if (!delayed_node)  		return 0;  	mutex_lock(&delayed_node->mutex); -	if (!delayed_node->count) { +	if (!delayed_node->inode_dirty) {  		mutex_unlock(&delayed_node->mutex);  		
btrfs_release_delayed_node(delayed_node);  		return 0;  	}  	mutex_unlock(&delayed_node->mutex); -	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); +	trans = btrfs_join_transaction(delayed_node->root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out; +	} + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto trans_out; +	} +	path->leave_spinning = 1; + +	block_rsv = trans->block_rsv; +	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + +	mutex_lock(&delayed_node->mutex); +	if (delayed_node->inode_dirty) +		ret = __btrfs_update_delayed_inode(trans, delayed_node->root, +						   path, delayed_node); +	else +		ret = 0; +	mutex_unlock(&delayed_node->mutex); + +	btrfs_free_path(path); +	trans->block_rsv = block_rsv; +trans_out: +	btrfs_end_transaction(trans, delayed_node->root); +	btrfs_btree_balance_dirty(delayed_node->root); +out:  	btrfs_release_delayed_node(delayed_node); +  	return ret;  } @@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  	struct btrfs_root *root;  	struct btrfs_block_rsv *block_rsv;  	int need_requeue = 0; -	int ret;  	async_node = container_of(work, struct btrfs_async_delayed_node, work); @@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  	block_rsv = trans->block_rsv;  	trans->block_rsv = &root->fs_info->delayed_block_rsv; -	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); -	if (!ret) -		ret = btrfs_delete_delayed_items(trans, path, root, -						 delayed_node); - -	if (!ret) -		btrfs_update_delayed_inode(trans, root, path, delayed_node); - +	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);  	/*  	 * Maybe new delayed items have been inserted, so we need requeue  	 * the work. Besides that, we must dequeue the empty delayed nodes diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f808e1baee..78b6ad0fc66 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,  /* Used for evicting the inode. */  void btrfs_remove_delayed_node(struct inode *inode);  void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode);  int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae941177339..b7a0641ead7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@  #include "delayed-ref.h"  #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep;  /*   * delayed back reference update tracking.  For subvolume trees   * we queue up extent allocations and backref maintenance for @@ -422,6 +426,14 @@ again:  	return 1;  } +void btrfs_release_ref_cluster(struct list_head *cluster) +{ +	struct list_head *pos, *q; + +	list_for_each_safe(pos, q, cluster) +		list_del_init(pos); +} +  /*   * helper function to update an extent delayed ref in the   * rbtree.  
existing and update must both have the same @@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,  					ref->extent_op->flags_to_set;  				existing_ref->extent_op->update_flags = 1;  			} -			kfree(ref->extent_op); +			btrfs_free_delayed_extent_op(ref->extent_op);  		}  	}  	/* @@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,  		 * we've updated the existing ref, free the newly  		 * allocated ref  		 */ -		kfree(head_ref); +		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);  	} else {  		delayed_refs->num_heads++;  		delayed_refs->num_heads_ready++; @@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  		 * we've updated the existing ref, free the newly  		 * allocated ref  		 */ -		kfree(full_ref); +		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);  	} else {  		delayed_refs->num_entries++;  		trans->delayed_ref_updates++; @@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,  		 * we've updated the existing ref, free the newly  		 * allocated ref  		 */ -		kfree(full_ref); +		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);  	} else {  		delayed_refs->num_entries++;  		trans->delayed_ref_updates++; @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	struct btrfs_delayed_ref_root *delayed_refs;  	BUG_ON(extent_op && extent_op->is_data); -	ref = kmalloc(sizeof(*ref), GFP_NOFS); +	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);  	if (!ref)  		return -ENOMEM; -	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); +	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);  	if (!head_ref) { -		kfree(ref); +		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);  		return -ENOMEM;  	} @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	struct btrfs_delayed_ref_root *delayed_refs;  	BUG_ON(extent_op && !extent_op->is_data); -	ref = kmalloc(sizeof(*ref), GFP_NOFS); +	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);  	if (!ref)  		return -ENOMEM; -	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); +	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);  	if (!head_ref) { -		kfree(ref); +		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);  		return -ENOMEM;  	} @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,  	struct btrfs_delayed_ref_head *head_ref;  	struct btrfs_delayed_ref_root *delayed_refs; -	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); +	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);  	if (!head_ref)  		return -ENOMEM; @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)  		return btrfs_delayed_node_to_head(ref);  	return NULL;  } + +void btrfs_delayed_ref_exit(void) +{ +	if (btrfs_delayed_ref_head_cachep) +		kmem_cache_destroy(btrfs_delayed_ref_head_cachep); +	if (btrfs_delayed_tree_ref_cachep) +		kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); +	if (btrfs_delayed_data_ref_cachep) +		kmem_cache_destroy(btrfs_delayed_data_ref_cachep); +	if (btrfs_delayed_extent_op_cachep) +		kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ +	btrfs_delayed_ref_head_cachep = kmem_cache_create( +				"btrfs_delayed_ref_head", +				sizeof(struct btrfs_delayed_ref_head), 0, +				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); +	if 
(!btrfs_delayed_ref_head_cachep) +		goto fail; + +	btrfs_delayed_tree_ref_cachep = kmem_cache_create( +				"btrfs_delayed_tree_ref", +				sizeof(struct btrfs_delayed_tree_ref), 0, +				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); +	if (!btrfs_delayed_tree_ref_cachep) +		goto fail; + +	btrfs_delayed_data_ref_cachep = kmem_cache_create( +				"btrfs_delayed_data_ref", +				sizeof(struct btrfs_delayed_data_ref), 0, +				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); +	if (!btrfs_delayed_data_ref_cachep) +		goto fail; + +	btrfs_delayed_extent_op_cachep = kmem_cache_create( +				"btrfs_delayed_extent_op", +				sizeof(struct btrfs_delayed_extent_op), 0, +				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); +	if (!btrfs_delayed_extent_op_cachep) +		goto fail; + +	return 0; +fail: +	btrfs_delayed_ref_exit(); +	return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df..f75fcaf79ae 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {  	unsigned long num_heads_ready;  	/* +	 * bumped when someone is making progress on the delayed +	 * refs, so that other procs know they are just adding to +	 * contention instead of helping +	 */ +	atomic_t procs_running_refs; +	atomic_t ref_seq; +	wait_queue_head_t wait; + +	/*  	 * set when the tree is flushing before a transaction commit,  	 * used by the throttling code to decide if new updates need  	 * to be run right away @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {  	u64 run_delayed_start;  }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ +	return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ +	if (op) +		kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} +  static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)  {  	WARN_ON(atomic_read(&ref->refs) == 0);  	if (atomic_dec_and_test(&ref->refs)) {  		WARN_ON(ref->in_tree); -		kfree(ref); +		switch (ref->type) { +		case BTRFS_TREE_BLOCK_REF_KEY: +		case BTRFS_SHARED_BLOCK_REF_KEY: +			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); +			break; +		case BTRFS_EXTENT_DATA_REF_KEY: +		case BTRFS_SHARED_DATA_REF_KEY: +			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); +			break; +		case 0: +			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); +			break; +		default: +			BUG(); +		}  	}  } @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *  btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);  int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,  			   struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ +	mutex_unlock(&head->mutex); +} +  int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,  			   struct list_head *cluster, u64 search_start); +void btrfs_release_ref_cluster(struct list_head *cluster);  int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,  			    struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf..7ba7b3900cb 100644 --- 
a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,  	 * flush all outstanding I/O and inode extent mappings before the  	 * copy operation is declared as being finished  	 */ -	btrfs_start_delalloc_inodes(root, 0); +	ret = btrfs_start_delalloc_inodes(root, 0); +	if (ret) { +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return ret; +	}  	btrfs_wait_ordered_extents(root, 0);  	trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940..02369a3c162 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@  #include "check-integrity.h"  #include "rcu-string.h"  #include "dev-replace.h" +#include "raid56.h"  #ifdef CONFIG_X86  #include <asm/cpufeature.h> @@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);  static void free_fs_root(struct btrfs_root *root);  static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,  				    int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, +					     struct btrfs_root *root);  static void btrfs_destroy_ordered_extents(struct btrfs_root *root);  static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  				      struct btrfs_root *root); @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,  static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)  {  	struct extent_io_tree *tree; -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 found_start;  	struct extent_buffer *eb; @@ -639,8 +641,15 @@ err:  		btree_readahead_hook(root, eb, eb->start, ret);  	} -	if (ret) +	if (ret) { +		/* +		 * our io error hook is going to dec the io pages +		 * again, we have to make sure it has something +		 * to decrement +		 */ +		atomic_inc(&eb->io_pages);  		clear_extent_buffer_uptodate(eb); +	}  	free_extent_buffer(eb);  out:  	return ret; @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)  	eb = (struct extent_buffer *)page->private;  	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);  	eb->read_mirror = failed_mirror; +	atomic_dec(&eb->io_pages);  	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))  		btree_readahead_hook(root, eb, eb->start, -EIO);  	return -EIO;	/* we fixed nothing */ @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)  	end_io_wq->work.flags = 0;  	if (bio->bi_rw & REQ_WRITE) { -		if (end_io_wq->metadata == 1) +		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)  			btrfs_queue_worker(&fs_info->endio_meta_write_workers,  					   &end_io_wq->work); -		else if (end_io_wq->metadata == 2) +		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)  			btrfs_queue_worker(&fs_info->endio_freespace_worker,  					   &end_io_wq->work); +		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) +			btrfs_queue_worker(&fs_info->endio_raid56_workers, +					   &end_io_wq->work);  		else  			btrfs_queue_worker(&fs_info->endio_write_workers,  					   &end_io_wq->work);  	} else { -		if (end_io_wq->metadata) +		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) +			btrfs_queue_worker(&fs_info->endio_raid56_workers, +					   &end_io_wq->work); +		else if (end_io_wq->metadata)  			btrfs_queue_worker(&fs_info->endio_meta_workers,  					   &end_io_wq->work);  		else @@ -695,6 +711,7 @@ 
static void end_workqueue_bio(struct bio *bio, int err)   * 0 - if data   * 1 - if normal metadata   * 2 - if writing to the free space cache area + * 3 - raid parity work   */  int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,  			int metadata) @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,  			    struct writeback_control *wbc)  {  	struct extent_io_tree *tree; +	struct btrfs_fs_info *fs_info; +	int ret; +  	tree = &BTRFS_I(mapping->host)->io_tree;  	if (wbc->sync_mode == WB_SYNC_NONE) { -		struct btrfs_root *root = BTRFS_I(mapping->host)->root; -		u64 num_dirty; -		unsigned long thresh = 32 * 1024 * 1024;  		if (wbc->for_kupdate)  			return 0; +		fs_info = BTRFS_I(mapping->host)->root->fs_info;  		/* this is a bit racy, but that's ok */ -		num_dirty = root->fs_info->dirty_metadata_bytes; -		if (num_dirty < thresh) +		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, +					     BTRFS_DIRTY_METADATA_THRESH); +		if (ret < 0)  			return 0;  	}  	return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,  void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		      struct extent_buffer *buf)  { +	struct btrfs_fs_info *fs_info = root->fs_info; +  	if (btrfs_header_generation(buf) == -	    root->fs_info->running_transaction->transid) { +	    fs_info->running_transaction->transid) {  		btrfs_assert_tree_locked(buf);  		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { -			spin_lock(&root->fs_info->delalloc_lock); -			if (root->fs_info->dirty_metadata_bytes >= buf->len) -				root->fs_info->dirty_metadata_bytes -= buf->len; -			else { -				spin_unlock(&root->fs_info->delalloc_lock); -				btrfs_panic(root->fs_info, -EOVERFLOW, -					  "Can't clear %lu bytes from " -					  " dirty_mdatadata_bytes (%llu)", -					  buf->len, -					  root->fs_info->dirty_metadata_bytes); -			} -			spin_unlock(&root->fs_info->delalloc_lock); - +			__percpu_counter_add(&fs_info->dirty_metadata_bytes, +					     -buf->len, +					     fs_info->dirty_metadata_batch);  			/* ugh, clear_extent_buffer_dirty needs to lock the page */  			btrfs_set_lock_blocking(buf);  			clear_extent_buffer_dirty(buf); @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	INIT_LIST_HEAD(&root->dirty_list);  	INIT_LIST_HEAD(&root->root_list); +	INIT_LIST_HEAD(&root->logged_list[0]); +	INIT_LIST_HEAD(&root->logged_list[1]);  	spin_lock_init(&root->orphan_lock);  	spin_lock_init(&root->inode_lock);  	spin_lock_init(&root->accounting_lock); +	spin_lock_init(&root->log_extents_lock[0]); +	spin_lock_init(&root->log_extents_lock[1]);  	mutex_init(&root->objectid_mutex);  	mutex_init(&root->log_mutex);  	init_waitqueue_head(&root->log_writer_wait); @@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,  		goto fail_srcu;  	} +	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); +	if (ret) { +		err = ret; +		goto fail_bdi; +	} +	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * +					(1 + ilog2(nr_cpu_ids)); + +	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); +	if (ret) { +		err = ret; +		goto fail_dirty_metadata_bytes; +	} +  	fs_info->btree_inode = new_inode(sb);  	if (!fs_info->btree_inode) {  		err = -ENOMEM; -		goto fail_bdi; +		goto fail_delalloc_bytes;  	}  	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,  	
INIT_LIST_HEAD(&fs_info->dead_roots);  	INIT_LIST_HEAD(&fs_info->delayed_iputs);  	INIT_LIST_HEAD(&fs_info->delalloc_inodes); -	INIT_LIST_HEAD(&fs_info->ordered_operations);  	INIT_LIST_HEAD(&fs_info->caching_block_groups);  	spin_lock_init(&fs_info->delalloc_lock);  	spin_lock_init(&fs_info->trans_lock); @@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->tree_mod_seq_lock);  	rwlock_init(&fs_info->tree_mod_log_lock);  	mutex_init(&fs_info->reloc_mutex); +	seqlock_init(&fs_info->profiles_lock);  	init_completion(&fs_info->kobj_unregister);  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->block_group_cache_lock);  	fs_info->block_group_cache_tree = RB_ROOT; +	fs_info->first_logical_byte = (u64)-1;  	extent_io_tree_init(&fs_info->freed_extents[0],  			     fs_info->btree_inode->i_mapping); @@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,  	init_waitqueue_head(&fs_info->transaction_blocked_wait);  	init_waitqueue_head(&fs_info->async_submit_wait); +	ret = btrfs_alloc_stripe_hash_table(fs_info); +	if (ret) { +		err = ret; +		goto fail_alloc; +	} +  	__setup_root(4096, 4096, 4096, 4096, tree_root,  		     fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,  		goto fail_alloc;  	/* check FS state, whether FS is broken. */ -	fs_info->fs_state |= btrfs_super_flags(disk_super); +	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) +		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);  	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);  	if (ret) { @@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,  	leafsize = btrfs_super_leafsize(disk_super);  	sectorsize = btrfs_super_sectorsize(disk_super);  	stripesize = btrfs_super_stripesize(disk_super); +	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); +	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));  	/*  	 * mixed block groups end up with duplicate but slightly offset @@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,  	btrfs_init_workers(&fs_info->endio_meta_write_workers,  			   "endio-meta-write", fs_info->thread_pool_size,  			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->endio_raid56_workers, +			   "endio-raid56", fs_info->thread_pool_size, +			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->rmw_workers, +			   "rmw", fs_info->thread_pool_size, +			   &fs_info->generic_worker);  	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",  			   fs_info->thread_pool_size,  			   &fs_info->generic_worker); @@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,  	 */  	fs_info->endio_workers.idle_thresh = 4;  	fs_info->endio_meta_workers.idle_thresh = 4; +	fs_info->endio_raid56_workers.idle_thresh = 4; +	fs_info->rmw_workers.idle_thresh = 2;  	fs_info->endio_write_workers.idle_thresh = 2;  	fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,  	ret |= btrfs_start_workers(&fs_info->fixup_workers);  	ret |= btrfs_start_workers(&fs_info->endio_workers);  	ret |= btrfs_start_workers(&fs_info->endio_meta_workers); +	ret |= btrfs_start_workers(&fs_info->rmw_workers); +	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);  	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);  	ret |= btrfs_start_workers(&fs_info->endio_write_workers);  	ret |= 
btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,  	sb->s_blocksize = sectorsize;  	sb->s_blocksize_bits = blksize_bits(sectorsize); -	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, -		    sizeof(disk_super->magic))) { +	if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {  		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);  		goto fail_sb_buffer;  	} @@ -2694,13 +2742,13 @@ fail_cleaner:  	 * kthreads  	 */  	filemap_write_and_wait(fs_info->btree_inode->i_mapping); -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  fail_block_groups:  	btrfs_free_block_groups(fs_info);  fail_tree_roots:  	free_root_pointers(fs_info, 1); +	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  fail_sb_buffer:  	btrfs_stop_workers(&fs_info->generic_worker); @@ -2710,6 +2758,8 @@ fail_sb_buffer:  	btrfs_stop_workers(&fs_info->workers);  	btrfs_stop_workers(&fs_info->endio_workers);  	btrfs_stop_workers(&fs_info->endio_meta_workers); +	btrfs_stop_workers(&fs_info->endio_raid56_workers); +	btrfs_stop_workers(&fs_info->rmw_workers);  	btrfs_stop_workers(&fs_info->endio_meta_write_workers);  	btrfs_stop_workers(&fs_info->endio_write_workers);  	btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2721,13 +2771,17 @@ fail_alloc:  fail_iput:  	btrfs_mapping_tree_free(&fs_info->mapping_tree); -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  	iput(fs_info->btree_inode); +fail_delalloc_bytes: +	percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: +	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);  fail_bdi:  	bdi_destroy(&fs_info->bdi);  fail_srcu:  	cleanup_srcu_struct(&fs_info->subvol_srcu);  fail: +	btrfs_free_stripe_hash_table(fs_info);  	btrfs_close_devices(fs_info->fs_devices);  	return err; @@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)  		super = (struct btrfs_super_block *)bh->b_data;  		if (btrfs_super_bytenr(super) != bytenr || -		    strncmp((char *)(&super->magic), BTRFS_MAGIC, -			    sizeof(super->magic))) { +		    super->magic != cpu_to_le64(BTRFS_MAGIC)) {  			brelse(bh);  			continue;  		} @@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(  				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)  				      == 0)))  					num_tolerated_disk_barrier_failures = 0; -				else if (num_tolerated_disk_barrier_failures > 1 -					 && -					 (flags & (BTRFS_BLOCK_GROUP_RAID1 | -						   BTRFS_BLOCK_GROUP_RAID10))) -					num_tolerated_disk_barrier_failures = 1; +				else if (num_tolerated_disk_barrier_failures > 1) { +					if (flags & (BTRFS_BLOCK_GROUP_RAID1 | +					    BTRFS_BLOCK_GROUP_RAID5 | +					    BTRFS_BLOCK_GROUP_RAID10)) { +						num_tolerated_disk_barrier_failures = 1; +					} else if (flags & +						   BTRFS_BLOCK_GROUP_RAID6) { +						num_tolerated_disk_barrier_failures = 2; +					} +				}  			}  		}  		up_read(&sinfo->groups_sem); @@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)  	if (btrfs_root_refs(&root->root_item) == 0)  		synchronize_srcu(&fs_info->subvol_srcu); +	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +		btrfs_free_log(NULL, root); +		btrfs_free_log_root_tree(NULL, fs_info); +	} +  	__btrfs_remove_free_space_cache(root->free_ino_pinned);  	__btrfs_remove_free_space_cache(root->free_ino_ctl);  	free_fs_root(root); @@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)  			printk(KERN_ERR "btrfs: commit super 
ret %d\n", ret);  	} -	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))  		btrfs_error_commit_super(root);  	btrfs_put_block_group_cache(fs_info); @@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)  	btrfs_free_qgroup_config(root->fs_info); -	if (fs_info->delalloc_bytes) { -		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", -		       (unsigned long long)fs_info->delalloc_bytes); +	if (percpu_counter_sum(&fs_info->delalloc_bytes)) { +		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", +		       percpu_counter_sum(&fs_info->delalloc_bytes));  	}  	free_extent_buffer(fs_info->extent_root->node); @@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)  	btrfs_stop_workers(&fs_info->workers);  	btrfs_stop_workers(&fs_info->endio_workers);  	btrfs_stop_workers(&fs_info->endio_meta_workers); +	btrfs_stop_workers(&fs_info->endio_raid56_workers); +	btrfs_stop_workers(&fs_info->rmw_workers);  	btrfs_stop_workers(&fs_info->endio_meta_write_workers);  	btrfs_stop_workers(&fs_info->endio_write_workers);  	btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)  	btrfs_close_devices(fs_info->fs_devices);  	btrfs_mapping_tree_free(&fs_info->mapping_tree); +	percpu_counter_destroy(&fs_info->dirty_metadata_bytes); +	percpu_counter_destroy(&fs_info->delalloc_bytes);  	bdi_destroy(&fs_info->bdi);  	cleanup_srcu_struct(&fs_info->subvol_srcu); +	btrfs_free_stripe_hash_table(fs_info); +  	return 0;  } @@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  			(unsigned long long)transid,  			(unsigned long long)root->fs_info->generation);  	was_dirty = set_extent_buffer_dirty(buf); -	if (!was_dirty) { -		spin_lock(&root->fs_info->delalloc_lock); -		root->fs_info->dirty_metadata_bytes += buf->len; -		spin_unlock(&root->fs_info->delalloc_lock); -	} +	if (!was_dirty) +		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes, +				     buf->len, +				     root->fs_info->dirty_metadata_batch);  }  static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,  	 * looks as though older kernels can get into trouble with  	 * this code, they end up stuck in balance_dirty_pages forever  	 */ -	u64 num_dirty; -	unsigned long thresh = 32 * 1024 * 1024; +	int ret;  	if (current->flags & PF_MEMALLOC)  		return; @@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,  	if (flush_delayed)  		btrfs_balance_delayed_items(root); -	num_dirty = root->fs_info->dirty_metadata_bytes; - -	if (num_dirty > thresh) { +	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, +				     BTRFS_DIRTY_METADATA_THRESH); +	if (ret > 0) {  		balance_dirty_pages_ratelimited(  				   root->fs_info->btree_inode->i_mapping);  	} @@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)  	btrfs_cleanup_transaction(root);  } -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, +					     struct btrfs_root *root)  {  	struct btrfs_inode *btrfs_inode;  	struct list_head splice; @@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)  	mutex_lock(&root->fs_info->ordered_operations_mutex);  	spin_lock(&root->fs_info->ordered_extent_lock); -	list_splice_init(&root->fs_info->ordered_operations, 
&splice); +	list_splice_init(&t->ordered_operations, &splice);  	while (!list_empty(&splice)) {  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,  					 ordered_operations); @@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)  static void btrfs_destroy_ordered_extents(struct btrfs_root *root)  { -	struct list_head splice;  	struct btrfs_ordered_extent *ordered; -	struct inode *inode; - -	INIT_LIST_HEAD(&splice);  	spin_lock(&root->fs_info->ordered_extent_lock); - -	list_splice_init(&root->fs_info->ordered_extents, &splice); -	while (!list_empty(&splice)) { -		ordered = list_entry(splice.next, struct btrfs_ordered_extent, -				     root_extent_list); - -		list_del_init(&ordered->root_extent_list); -		atomic_inc(&ordered->refs); - -		/* the inode may be getting freed (in sys_unlink path). */ -		inode = igrab(ordered->inode); - -		spin_unlock(&root->fs_info->ordered_extent_lock); -		if (inode) -			iput(inode); - -		atomic_set(&ordered->refs, 1); -		btrfs_put_ordered_extent(ordered); - -		spin_lock(&root->fs_info->ordered_extent_lock); -	} - +	/* +	 * This will just short circuit the ordered completion stuff which will +	 * make sure the ordered extent gets properly cleaned up. +	 */ +	list_for_each_entry(ordered, &root->fs_info->ordered_extents, +			    root_extent_list) +		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);  	spin_unlock(&root->fs_info->ordered_extent_lock);  } @@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  	}  	while ((node = rb_first(&delayed_refs->root)) != NULL) { -		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); +		struct btrfs_delayed_ref_head *head = NULL; +		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);  		atomic_set(&ref->refs, 1);  		if (btrfs_delayed_ref_is_head(ref)) { -			struct btrfs_delayed_ref_head *head;  			head = btrfs_delayed_node_to_head(ref);  			if (!mutex_trylock(&head->mutex)) { @@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  				continue;  			} -			kfree(head->extent_op); +			btrfs_free_delayed_extent_op(head->extent_op);  			delayed_refs->num_heads--;  			if (list_empty(&head->cluster))  				delayed_refs->num_heads_ready--;  			list_del_init(&head->cluster);  		} +  		ref->in_tree = 0;  		rb_erase(&ref->rb_node, &delayed_refs->root);  		delayed_refs->num_entries--; - +		if (head) +			mutex_unlock(&head->mutex);  		spin_unlock(&delayed_refs->lock);  		btrfs_put_delayed_ref(ref); @@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)  				    delalloc_inodes);  		list_del_init(&btrfs_inode->delalloc_inodes); +		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, +			  &btrfs_inode->runtime_flags);  		btrfs_invalidate_inodes(btrfs_inode->root);  	} @@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  	while (!list_empty(&list)) {  		t = list_entry(list.next, struct btrfs_transaction, list); -		if (!t) -			break; -		btrfs_destroy_ordered_operations(root); +		btrfs_destroy_ordered_operations(t, root);  		btrfs_destroy_ordered_extents(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e..034d7dc552b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@  #define BTRFS_SUPER_MIRROR_MAX	 3  #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { +	BTRFS_WQ_ENDIO_DATA = 0, +	BTRFS_WQ_ENDIO_METADATA = 1, +	BTRFS_WQ_ENDIO_FREE_SPACE = 2, +	BTRFS_WQ_ENDIO_RAID56 = 3, +}; +  static inline u64 btrfs_sb_offset(int mirror)  
{  	u64 start = 16 * 1024; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 614f34a899c..81ee29eeb7c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -22,10 +22,10 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,  	if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {  		*max_len = BTRFS_FID_SIZE_CONNECTABLE; -		return 255; +		return FILEID_INVALID;  	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {  		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; -		return 255; +		return FILEID_INVALID;  	}  	len  = BTRFS_FID_SIZE_NON_CONNECTABLE; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 521e9d4424f..3e074dab2d5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@  #include "print-tree.h"  #include "transaction.h"  #include "volumes.h" +#include "raid56.h"  #include "locking.h"  #include "free-space-cache.h"  #include "math.h" @@ -72,8 +73,7 @@ enum {  	RESERVE_ALLOC_NO_ACCOUNT = 2,  }; -static int update_block_group(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root,  			      u64 bytenr, u64 num_bytes, int alloc);  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  			    int dump_block_groups);  static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,  				       u64 num_bytes, int reserve); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, +			       u64 num_bytes);  static noinline int  block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,  	rb_link_node(&block_group->cache_node, parent, p);  	rb_insert_color(&block_group->cache_node,  			&info->block_group_cache_tree); + +	if (info->first_logical_byte > block_group->key.objectid) +		info->first_logical_byte = block_group->key.objectid; +  	spin_unlock(&info->block_group_cache_lock);  	return 0; @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,  			break;  		}  	} -	if (ret) +	if (ret) {  		btrfs_get_block_group(ret); +		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) +			info->first_logical_byte = ret->key.objectid; +	}  	spin_unlock(&info->block_group_cache_lock);  	return ret; @@ -468,8 +477,6 @@ out:  }  static int cache_block_group(struct btrfs_block_group_cache *cache, -			     struct btrfs_trans_handle *trans, -			     struct btrfs_root *root,  			     int load_cache_only)  {  	DEFINE_WAIT(wait); @@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  	cache->cached = BTRFS_CACHE_FAST;  	spin_unlock(&cache->lock); -	/* -	 * We can't do the read from on-disk cache during a commit since we need -	 * to have the normal tree locking.  Also if we are currently trying to -	 * allocate blocks for the tree root we can't do the fast caching since -	 * we likely hold important locks. 
-	 */  	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {  		ret = load_free_space_cache(fs_info, cache); @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  		*actual_bytes = discarded_bytes; +	if (ret == -EOPNOTSUPP) +		ret = 0;  	return ret;  } @@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  						      node->num_bytes);  			}  		} -		mutex_unlock(&head->mutex);  		return ret;  	} @@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  			 * process of being added. Don't run this ref yet.  			 */  			list_del_init(&locked_ref->cluster); -			mutex_unlock(&locked_ref->mutex); +			btrfs_delayed_ref_unlock(locked_ref);  			locked_ref = NULL;  			delayed_refs->num_heads_ready++;  			spin_unlock(&delayed_refs->lock); @@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  			ref = &locked_ref->node;  			if (extent_op && must_insert_reserved) { -				kfree(extent_op); +				btrfs_free_delayed_extent_op(extent_op);  				extent_op = NULL;  			} @@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  				ret = run_delayed_extent_op(trans, root,  							    ref, extent_op); -				kfree(extent_op); +				btrfs_free_delayed_extent_op(extent_op);  				if (ret) { -					list_del_init(&locked_ref->cluster); -					mutex_unlock(&locked_ref->mutex); - -					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); +					printk(KERN_DEBUG +					       "btrfs: run_delayed_extent_op " +					       "returned %d\n", ret);  					spin_lock(&delayed_refs->lock); +					btrfs_delayed_ref_unlock(locked_ref);  					return ret;  				}  				goto next;  			} - -			list_del_init(&locked_ref->cluster); -			locked_ref = NULL;  		}  		ref->in_tree = 0;  		rb_erase(&ref->rb_node, &delayed_refs->root);  		delayed_refs->num_entries--; -		if (locked_ref) { +		if (!btrfs_delayed_ref_is_head(ref)) {  			/*  			 * when we play the delayed ref, also correct the  			 * ref_mod on head @@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		ret = run_one_delayed_ref(trans, root, ref, extent_op,  					  must_insert_reserved); -		btrfs_put_delayed_ref(ref); -		kfree(extent_op); -		count++; - +		btrfs_free_delayed_extent_op(extent_op);  		if (ret) { -			if (locked_ref) { -				list_del_init(&locked_ref->cluster); -				mutex_unlock(&locked_ref->mutex); -			} -			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); +			btrfs_delayed_ref_unlock(locked_ref); +			btrfs_put_delayed_ref(ref); +			printk(KERN_DEBUG +			       "btrfs: run_one_delayed_ref returned %d\n", ret);  			spin_lock(&delayed_refs->lock);  			return ret;  		} +		/* +		 * If this node is a head, that means all the refs in this head +		 * have been dealt with, and we will pick the next head to deal +		 * with, so we must unlock the head and drop it from the cluster +		 * list before we release it. 
+		 */ +		if (btrfs_delayed_ref_is_head(ref)) { +			list_del_init(&locked_ref->cluster); +			btrfs_delayed_ref_unlock(locked_ref); +			locked_ref = NULL; +		} +		btrfs_put_delayed_ref(ref); +		count++;  next:  		cond_resched();  		spin_lock(&delayed_refs->lock); @@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,  	return ret;  } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, +		      int count) +{ +	int val = atomic_read(&delayed_refs->ref_seq); + +	if (val < seq || val >= seq + count) +		return 1; +	return 0; +} +  /*   * this starts processing the delayed reference count updates and   * extent insertions we have queued up so far.  count can be @@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	delayed_refs = &trans->transaction->delayed_refs;  	INIT_LIST_HEAD(&cluster); +	if (count == 0) { +		count = delayed_refs->num_entries * 2; +		run_most = 1; +	} + +	if (!run_all && !run_most) { +		int old; +		int seq = atomic_read(&delayed_refs->ref_seq); + +progress: +		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); +		if (old) { +			DEFINE_WAIT(__wait); +			if (delayed_refs->num_entries < 16348) +				return 0; + +			prepare_to_wait(&delayed_refs->wait, &__wait, +					TASK_UNINTERRUPTIBLE); + +			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); +			if (old) { +				schedule(); +				finish_wait(&delayed_refs->wait, &__wait); + +				if (!refs_newer(delayed_refs, seq, 256)) +					goto progress; +				else +					return 0; +			} else { +				finish_wait(&delayed_refs->wait, &__wait); +				goto again; +			} +		} + +	} else { +		atomic_inc(&delayed_refs->procs_running_refs); +	} +  again:  	loops = 0;  	spin_lock(&delayed_refs->lock); @@ -2477,10 +2533,6 @@ again:  	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);  #endif -	if (count == 0) { -		count = delayed_refs->num_entries * 2; -		run_most = 1; -	}  	while (1) {  		if (!(run_all || run_most) &&  		    delayed_refs->num_heads_ready < 64) @@ -2500,11 +2552,15 @@ again:  		ret = run_clustered_refs(trans, root, &cluster);  		if (ret < 0) { +			btrfs_release_ref_cluster(&cluster);  			spin_unlock(&delayed_refs->lock);  			btrfs_abort_transaction(trans, root, ret); +			atomic_dec(&delayed_refs->procs_running_refs);  			return ret;  		} +		atomic_add(ret, &delayed_refs->ref_seq); +  		count -= min_t(unsigned long, ret, count);  		if (count == 0) @@ -2573,6 +2629,11 @@ again:  		goto again;  	}  out: +	atomic_dec(&delayed_refs->procs_running_refs); +	smp_mb(); +	if (waitqueue_active(&delayed_refs->wait)) +		wake_up(&delayed_refs->wait); +  	spin_unlock(&delayed_refs->lock);  	assert_qgroups_uptodate(trans);  	return 0; @@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  	struct btrfs_delayed_extent_op *extent_op;  	int ret; -	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +	extent_op = btrfs_alloc_delayed_extent_op();  	if (!extent_op)  		return -ENOMEM; @@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,  					  num_bytes, extent_op);  	if (ret) -		kfree(extent_op); +		btrfs_free_delayed_extent_op(extent_op);  	return ret;  } @@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)  	u64 extra_flags = chunk_to_extended(flags) &  				BTRFS_EXTENDED_PROFILE_MASK; +	write_seqlock(&fs_info->profiles_lock);  	if (flags & 
BTRFS_BLOCK_GROUP_DATA)  		fs_info->avail_data_alloc_bits |= extra_flags;  	if (flags & BTRFS_BLOCK_GROUP_METADATA)  		fs_info->avail_metadata_alloc_bits |= extra_flags;  	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)  		fs_info->avail_system_alloc_bits |= extra_flags; +	write_sequnlock(&fs_info->profiles_lock);  }  /* @@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  	u64 num_devices = root->fs_info->fs_devices->rw_devices +  		root->fs_info->fs_devices->missing_devices;  	u64 target; +	u64 tmp;  	/*  	 * see if restripe for this chunk_type is in progress, if so @@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  	}  	spin_unlock(&root->fs_info->balance_lock); +	/* First, mask out the RAID levels which aren't possible */  	if (num_devices == 1) -		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); +		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | +			   BTRFS_BLOCK_GROUP_RAID5); +	if (num_devices < 3) +		flags &= ~BTRFS_BLOCK_GROUP_RAID6;  	if (num_devices < 4)  		flags &= ~BTRFS_BLOCK_GROUP_RAID10; -	if ((flags & BTRFS_BLOCK_GROUP_DUP) && -	    (flags & (BTRFS_BLOCK_GROUP_RAID1 | -		      BTRFS_BLOCK_GROUP_RAID10))) { -		flags &= ~BTRFS_BLOCK_GROUP_DUP; -	} - -	if ((flags & BTRFS_BLOCK_GROUP_RAID1) && -	    (flags & BTRFS_BLOCK_GROUP_RAID10)) { -		flags &= ~BTRFS_BLOCK_GROUP_RAID1; -	} +	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | +		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); +	flags &= ~tmp; -	if ((flags & BTRFS_BLOCK_GROUP_RAID0) && -	    ((flags & BTRFS_BLOCK_GROUP_RAID1) | -	     (flags & BTRFS_BLOCK_GROUP_RAID10) | -	     (flags & BTRFS_BLOCK_GROUP_DUP))) { -		flags &= ~BTRFS_BLOCK_GROUP_RAID0; -	} +	if (tmp & BTRFS_BLOCK_GROUP_RAID6) +		tmp = BTRFS_BLOCK_GROUP_RAID6; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID5) +		tmp = BTRFS_BLOCK_GROUP_RAID5; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID10) +		tmp = BTRFS_BLOCK_GROUP_RAID10; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID1) +		tmp = BTRFS_BLOCK_GROUP_RAID1; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID0) +		tmp = BTRFS_BLOCK_GROUP_RAID0; -	return extended_to_chunk(flags); +	return extended_to_chunk(flags | tmp);  }  static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)  { -	if (flags & BTRFS_BLOCK_GROUP_DATA) -		flags |= root->fs_info->avail_data_alloc_bits; -	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) -		flags |= root->fs_info->avail_system_alloc_bits; -	else if (flags & BTRFS_BLOCK_GROUP_METADATA) -		flags |= root->fs_info->avail_metadata_alloc_bits; +	unsigned seq; + +	do { +		seq = read_seqbegin(&root->fs_info->profiles_lock); + +		if (flags & BTRFS_BLOCK_GROUP_DATA) +			flags |= root->fs_info->avail_data_alloc_bits; +		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) +			flags |= root->fs_info->avail_system_alloc_bits; +		else if (flags & BTRFS_BLOCK_GROUP_METADATA) +			flags |= root->fs_info->avail_metadata_alloc_bits; +	} while (read_seqretry(&root->fs_info->profiles_lock, seq));  	return btrfs_reduce_alloc_profile(root, flags);  } @@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  {  	u64 flags; +	u64 ret;  	if (data)  		flags = BTRFS_BLOCK_GROUP_DATA; @@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  	else  		flags = BTRFS_BLOCK_GROUP_METADATA; -	return get_alloc_profile(root, flags); +	ret = 
get_alloc_profile(root, flags); +	return ret;  }  /* @@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)  	int ret = 0, committed = 0, alloc_chunk = 1;  	/* make sure bytes are sectorsize aligned */ -	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); +	bytes = ALIGN(bytes, root->sectorsize);  	if (root == root->fs_info->tree_root ||  	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { @@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  	struct btrfs_space_info *data_sinfo;  	/* make sure bytes are sectorsize aligned */ -	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); +	bytes = ALIGN(bytes, root->sectorsize);  	data_sinfo = root->fs_info->data_sinfo;  	spin_lock(&data_sinfo->lock); @@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)  {  	u64 num_dev; -	if (type & BTRFS_BLOCK_GROUP_RAID10 || -	    type & BTRFS_BLOCK_GROUP_RAID0) +	if (type & (BTRFS_BLOCK_GROUP_RAID10 | +		    BTRFS_BLOCK_GROUP_RAID0 | +		    BTRFS_BLOCK_GROUP_RAID5 | +		    BTRFS_BLOCK_GROUP_RAID6))  		num_dev = root->fs_info->fs_devices->rw_devices;  	else if (type & BTRFS_BLOCK_GROUP_RAID1)  		num_dev = 2; @@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,  	int wait_for_alloc = 0;  	int ret = 0; +	/* Don't re-enter if we're already allocating a chunk */ +	if (trans->allocating_chunk) +		return -ENOSPC; +  	space_info = __find_space_info(extent_root->fs_info, flags);  	if (!space_info) {  		ret = update_space_info(extent_root->fs_info, flags, @@ -3606,6 +3686,8 @@ again:  		goto again;  	} +	trans->allocating_chunk = true; +  	/*  	 * If we have mixed data/metadata chunks we want to make sure we keep  	 * allocating mixed chunks instead of individual chunks. @@ -3632,19 +3714,20 @@ again:  	check_system_chunk(trans, extent_root, flags);  	ret = btrfs_alloc_chunk(trans, extent_root, flags); -	if (ret < 0 && ret != -ENOSPC) -		goto out; +	trans->allocating_chunk = false;  	spin_lock(&space_info->lock); +	if (ret < 0 && ret != -ENOSPC) +		goto out;  	if (ret)  		space_info->full = 1;  	else  		ret = 1;  	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out:  	space_info->chunk_alloc = 0;  	spin_unlock(&space_info->lock); -out:  	mutex_unlock(&fs_info->chunk_mutex);  	return ret;  } @@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,  			  struct btrfs_space_info *space_info, u64 bytes,  			  enum btrfs_reserve_flush_enum flush)  { +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;  	u64 profile = btrfs_get_alloc_profile(root, 0); +	u64 rsv_size = 0;  	u64 avail;  	u64 used; +	u64 to_add;  	used = space_info->bytes_used + space_info->bytes_reserved + -		space_info->bytes_pinned + space_info->bytes_readonly + -		space_info->bytes_may_use; +		space_info->bytes_pinned + space_info->bytes_readonly; + +	spin_lock(&global_rsv->lock); +	rsv_size = global_rsv->size; +	spin_unlock(&global_rsv->lock); + +	/* +	 * We only want to allow over committing if we have lots of actual space +	 * free, but if we don't have enough space to handle the global reserve +	 * space then we could end up having a real enospc problem when trying +	 * to allocate a chunk or some other such important allocation. 
+	 */ +	rsv_size <<= 1; +	if (used + rsv_size >= space_info->total_bytes) +		return 0; + +	used += space_info->bytes_may_use;  	spin_lock(&root->fs_info->free_chunk_lock);  	avail = root->fs_info->free_chunk_space; @@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root,  	/*  	 * If we have dup, raid1 or raid10 then only half of the free -	 * space is actually useable. +	 * space is actually usable.  For raid56, the space info used +	 * doesn't include the parity drive, so we don't have to +	 * change the math.  	 */  	if (profile & (BTRFS_BLOCK_GROUP_DUP |  		       BTRFS_BLOCK_GROUP_RAID1 |  		       BTRFS_BLOCK_GROUP_RAID10))  		avail >>= 1; +	to_add = space_info->total_bytes; +  	/*  	 * If we aren't flushing all things, let us overcommit up to  	 * 1/2 of the space. If we can flush, don't let us overcommit  	 * too much, let it overcommit up to 1/8 of the space.  	 */  	if (flush == BTRFS_RESERVE_FLUSH_ALL) -		avail >>= 3; +		to_add >>= 3;  	else -		avail >>= 1; +		to_add >>= 1; + +	/* +	 * Limit the overcommit to the amount of free space we could possibly +	 * allocate for chunks. +	 */ +	to_add = min(avail, to_add); -	if (used + bytes < space_info->total_bytes + avail) +	if (used + bytes < space_info->total_bytes + to_add)  		return 1;  	return 0;  } -static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, -					       unsigned long nr_pages, -					       enum wb_reason reason) +void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, +				  unsigned long nr_pages)  { -	if (!writeback_in_progress(sb->s_bdi) && -	    down_read_trylock(&sb->s_umount)) { -		writeback_inodes_sb_nr(sb, nr_pages, reason); -		up_read(&sb->s_umount); -		return 1; -	} +	struct super_block *sb = root->fs_info->sb; +	int started; -	return 0; +	/* If we cannot start writeback, just sync all the delalloc files. */ +	started = try_to_writeback_inodes_sb_nr(sb, nr_pages, +						      WB_REASON_FS_FREE_SPACE); +	if (!started) { +		/* +		 * We needn't worry about the filesystem going from r/w to r/o +		 * even though we don't acquire the ->s_umount mutex, because +		 * the filesystem should guarantee that the delalloc inode list +		 * is empty after the filesystem becomes read-only (all dirty +		 * pages are written to the disk). +		 */ +		btrfs_start_delalloc_inodes(root, 0); +		btrfs_wait_ordered_extents(root, 0); +	}  }  /* @@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  	space_info = block_rsv->space_info;  	smp_mb(); -	delalloc_bytes = root->fs_info->delalloc_bytes; +	delalloc_bytes = percpu_counter_sum_positive( +						&root->fs_info->delalloc_bytes);  	if (delalloc_bytes == 0) {  		if (trans)  			return; @@ -3735,10 +3855,7 @@  	while (delalloc_bytes && loops < 3) {  		max_reclaim = min(delalloc_bytes, to_reclaim);  		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; -		writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, -						    nr_pages, -						    WB_REASON_FS_FREE_SPACE); - +		btrfs_writeback_inodes_sb_nr(root, nr_pages);  		/*  		 * We need to wait for the async pages to actually start before  		 * we do anything. 
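
The shrink_delalloc() hunks above are part of a wider swap of a spinlock-protected u64 for a struct percpu_counter, so that hot-path accounting only touches a per-cpu slot. A minimal sketch of the pattern outside btrfs, with invented names (counter, THRESH, BATCH) and the two-argument percpu_counter_init() of this kernel generation:

	#include <linux/percpu_counter.h>

	static struct percpu_counter counter;
	#define THRESH	(32 * 1024 * 1024)	/* hypothetical flush threshold */
	#define BATCH	128			/* per-cpu slack before folding into the shared sum */

	static int counter_setup(void)
	{
		/* allocates the per-cpu slots; pair with percpu_counter_destroy() */
		return percpu_counter_init(&counter, 0);
	}

	static void counter_account(s64 delta)
	{
		/* usually only bumps this cpu's slot: no shared cacheline, no lock */
		__percpu_counter_add(&counter, delta, BATCH);
	}

	static bool counter_over_thresh(void)
	{
		/*
		 * percpu_counter_compare() checks the approximate sum first and
		 * folds every per-cpu slot only when the result is too close to
		 * call, keeping the common case cheap.
		 */
		return percpu_counter_compare(&counter, THRESH) > 0;
	}

percpu_counter_sum_positive(), used above and in the unmount-time report, is the exact (and slower) read; the batch value bounds how far the cheap path can drift, which is presumably why the patch scales dirty_metadata_batch and delalloc_batch with ilog2(nr_cpu_ids).
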
@@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  				break;  		}  		smp_mb(); -		delalloc_bytes = root->fs_info->delalloc_bytes; +		delalloc_bytes = percpu_counter_sum_positive( +						&root->fs_info->delalloc_bytes);  	}  } @@ -3997,7 +4115,7 @@ again:  	 * We make the other tasks wait for the flush only when we can flush  	 * all things.  	 */ -	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { +	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {  		flushing = true;  		space_info->flush = 1;  	} @@ -4030,6 +4148,15 @@ again:  		goto again;  out: +	if (ret == -ENOSPC && +	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { +		struct btrfs_block_rsv *global_rsv = +			&root->fs_info->global_block_rsv; + +		if (block_rsv != global_rsv && +		    !block_rsv_use_bytes(global_rsv, orig_bytes)) +			ret = 0; +	}  	if (flushing) {  		spin_lock(&space_info->lock);  		space_info->flush = 0; @@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)  	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);  } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need to reserve + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different from common + * file/directory operations: they change two fs/file trees and the + * root tree, and the number of items that the qgroup reserves differs + * from the free space reservation. So we cannot use the space + * reservation mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +				     struct btrfs_block_rsv *rsv, +				     int items, +				     u64 *qgroup_reserved)  { -	struct btrfs_root *root = pending->root; -	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); -	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; -	/* -	 * two for root back/forward refs, two for directory entries, -	 * one for root of the snapshot and one for parent inode. 
-	 */ -	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); -	dst_rsv->space_info = src_rsv->space_info; -	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +	u64 num_bytes; +	int ret; + +	if (root->fs_info->quota_enabled) { +		/* One for parent inode, two for dir entries */ +		num_bytes = 3 * root->leafsize; +		ret = btrfs_qgroup_reserve(root, num_bytes); +		if (ret) +			return ret; +	} else { +		num_bytes = 0; +	} + +	*qgroup_reserved = num_bytes; + +	num_bytes = btrfs_calc_trans_metadata_size(root, items); +	rsv->space_info = __find_space_info(root->fs_info, +					    BTRFS_BLOCK_GROUP_METADATA); +	ret = btrfs_block_rsv_add(root, rsv, num_bytes, +				  BTRFS_RESERVE_FLUSH_ALL); +	if (ret) { +		if (*qgroup_reserved) +			btrfs_qgroup_free(root, *qgroup_reserved); +	} + +	return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, +				      struct btrfs_block_rsv *rsv, +				      u64 qgroup_reserved) +{ +	btrfs_block_rsv_release(root, rsv, (u64)-1); +	if (qgroup_reserved) +		btrfs_qgroup_free(root, qgroup_reserved);  }  /** @@ -4534,8 +4702,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	unsigned nr_extents = 0;  	int extra_reserve = 0;  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; -	int ret; +	int ret = 0;  	bool delalloc_lock = true; +	u64 to_free = 0; +	unsigned dropped;  	/* If we are a free space inode we need to not flush since we will be in  	 * the middle of a transaction commit.  We also don't need the delalloc @@ -4582,53 +4752,16 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	if (root->fs_info->quota_enabled) {  		ret = btrfs_qgroup_reserve(root, num_bytes +  					   nr_extents * root->leafsize); -		if (ret) { -			spin_lock(&BTRFS_I(inode)->lock); -			calc_csum_metadata_size(inode, num_bytes, 0); -			spin_unlock(&BTRFS_I(inode)->lock); -			if (delalloc_lock) -				mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); -			return ret; -		} +		if (ret) +			goto out_fail;  	}  	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); -	if (ret) { -		u64 to_free = 0; -		unsigned dropped; - -		spin_lock(&BTRFS_I(inode)->lock); -		dropped = drop_outstanding_extent(inode); -		/* -		 * If the inodes csum_bytes is the same as the original -		 * csum_bytes then we know we haven't raced with any free()ers -		 * so we can just reduce our inodes csum bytes and carry on. -		 * Otherwise we have to do the normal free thing to account for -		 * the case that the free side didn't free up its reserve -		 * because of this outstanding reservation. 
-		 */ -		if (BTRFS_I(inode)->csum_bytes == csum_bytes) -			calc_csum_metadata_size(inode, num_bytes, 0); -		else -			to_free = calc_csum_metadata_size(inode, num_bytes, 0); -		spin_unlock(&BTRFS_I(inode)->lock); -		if (dropped) -			to_free += btrfs_calc_trans_metadata_size(root, dropped); - -		if (to_free) { -			btrfs_block_rsv_release(root, block_rsv, to_free); -			trace_btrfs_space_reservation(root->fs_info, -						      "delalloc", -						      btrfs_ino(inode), -						      to_free, 0); -		} -		if (root->fs_info->quota_enabled) { +	if (unlikely(ret)) { +		if (root->fs_info->quota_enabled)  			btrfs_qgroup_free(root, num_bytes +  						nr_extents * root->leafsize); -		} -		if (delalloc_lock) -			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); -		return ret; +		goto out_fail;  	}  	spin_lock(&BTRFS_I(inode)->lock); @@ -4649,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	block_rsv_add_bytes(block_rsv, to_reserve, 1);  	return 0; + +out_fail: +	spin_lock(&BTRFS_I(inode)->lock); +	dropped = drop_outstanding_extent(inode); +	/* +	 * If the inodes csum_bytes is the same as the original +	 * csum_bytes then we know we haven't raced with any free()ers +	 * so we can just reduce our inodes csum bytes and carry on. +	 * Otherwise we have to do the normal free thing to account for +	 * the case that the free side didn't free up its reserve +	 * because of this outstanding reservation. +	 */ +	if (BTRFS_I(inode)->csum_bytes == csum_bytes) +		calc_csum_metadata_size(inode, num_bytes, 0); +	else +		to_free = calc_csum_metadata_size(inode, num_bytes, 0); +	spin_unlock(&BTRFS_I(inode)->lock); +	if (dropped) +		to_free += btrfs_calc_trans_metadata_size(root, dropped); + +	if (to_free) { +		btrfs_block_rsv_release(root, block_rsv, to_free); +		trace_btrfs_space_reservation(root->fs_info, "delalloc", +					      btrfs_ino(inode), to_free, 0); +	} +	if (delalloc_lock) +		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); +	return ret;  }  /** @@ -4670,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)  	spin_lock(&BTRFS_I(inode)->lock);  	dropped = drop_outstanding_extent(inode); -	to_free = calc_csum_metadata_size(inode, num_bytes, 0); +	if (num_bytes) +		to_free = calc_csum_metadata_size(inode, num_bytes, 0);  	spin_unlock(&BTRFS_I(inode)->lock);  	if (dropped > 0)  		to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4737,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)  	btrfs_free_reserved_data_space(inode, num_bytes);  } -static int update_block_group(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root,  			      u64 bytenr, u64 num_bytes, int alloc)  {  	struct btrfs_block_group_cache *cache = NULL; @@ -4775,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  		 * space back to the block group, otherwise we will leak space.  		 
*/  		if (!alloc && cache->cached == BTRFS_CACHE_NO) -			cache_block_group(cache, trans, NULL, 1); +			cache_block_group(cache, 1);  		byte_in_group = bytenr - cache->key.objectid;  		WARN_ON(byte_in_group > cache->key.offset); @@ -4825,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)  	struct btrfs_block_group_cache *cache;  	u64 bytenr; +	spin_lock(&root->fs_info->block_group_cache_lock); +	bytenr = root->fs_info->first_logical_byte; +	spin_unlock(&root->fs_info->block_group_cache_lock); + +	if (bytenr < (u64)-1) +		return bytenr; +  	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);  	if (!cache)  		return 0; @@ -4875,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,  /*   * this function must be called within transaction   */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, -				    struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,  				    u64 bytenr, u64 num_bytes)  {  	struct btrfs_block_group_cache *cache; @@ -4890,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,  	 * to one because the slow code to read in the free extents does check  	 * the pinned extents.  	 */ -	cache_block_group(cache, trans, root, 1); +	cache_block_group(cache, 1);  	pin_down_extent(root, cache, bytenr, num_bytes, 0); @@ -5287,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  		} -		ret = update_block_group(trans, root, bytenr, num_bytes, 0); +		ret = update_block_group(root, bytenr, num_bytes, 0);  		if (ret) {  			btrfs_abort_transaction(trans, extent_root, ret);  			goto out; @@ -5332,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	if (head->extent_op) {  		if (!head->must_insert_reserved)  			goto out; -		kfree(head->extent_op); +		btrfs_free_delayed_extent_op(head->extent_op);  		head->extent_op = NULL;  	} @@ -5455,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	return ret;  } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, +			struct btrfs_block_group_cache *cache, +			u64 val, u64 num_bytes)  { -	u64 mask = ((u64)root->stripesize - 1); -	u64 ret = (val + mask) & ~mask; +	u64 ret = ALIGN(val, root->stripesize);  	return ret;  } @@ -5478,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,  				u64 num_bytes)  {  	struct btrfs_caching_control *caching_ctl; -	DEFINE_WAIT(wait);  	caching_ctl = get_caching_control(cache);  	if (!caching_ctl) @@ -5495,7 +5662,6 @@ static noinline int  wait_block_group_cache_done(struct btrfs_block_group_cache *cache)  {  	struct btrfs_caching_control *caching_ctl; -	DEFINE_WAIT(wait);  	caching_ctl = get_caching_control(cache);  	if (!caching_ctl) @@ -5509,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)  int __get_raid_index(u64 flags)  { -	int index; -  	if (flags & BTRFS_BLOCK_GROUP_RAID10) -		index = 0; +		return BTRFS_RAID_RAID10;  	else if (flags & BTRFS_BLOCK_GROUP_RAID1) -		index = 1; +		return BTRFS_RAID_RAID1;  	else if (flags & BTRFS_BLOCK_GROUP_DUP) -		index = 2; +		return BTRFS_RAID_DUP;  	else if (flags & BTRFS_BLOCK_GROUP_RAID0) -		index = 3; -	else -		index = 4; +		return BTRFS_RAID_RAID0; +	else if (flags & BTRFS_BLOCK_GROUP_RAID5) +		return BTRFS_RAID_RAID5; +	else if (flags & BTRFS_BLOCK_GROUP_RAID6) +		return BTRFS_RAID_RAID6; -	return index; +	return 
BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */  }  static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -5560,7 +5726,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	int empty_cluster = 2 * 1024 * 1024;  	struct btrfs_space_info *space_info;  	int loop = 0; -	int index = 0; +	int index = __get_raid_index(data);  	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?  		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;  	bool found_uncached_bg = false; @@ -5665,6 +5831,8 @@ search:  		if (!block_group_bits(block_group, data)) {  		    u64 extra = BTRFS_BLOCK_GROUP_DUP |  				BTRFS_BLOCK_GROUP_RAID1 | +				BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6 |  				BTRFS_BLOCK_GROUP_RAID10;  			/* @@ -5680,8 +5848,7 @@ have_block_group:  		cached = block_group_cache_done(block_group);  		if (unlikely(!cached)) {  			found_uncached_bg = true; -			ret = cache_block_group(block_group, trans, -						orig_root, 0); +			ret = cache_block_group(block_group, 0);  			BUG_ON(ret < 0);  			ret = 0;  		} @@ -5694,6 +5861,7 @@ have_block_group:  		 * lets look there  		 */  		if (last_ptr) { +			unsigned long aligned_cluster;  			/*  			 * the refill lock keeps out other  			 * people trying to start a new cluster @@ -5760,11 +5928,15 @@ refill_cluster:  				goto unclustered_alloc;  			} +			aligned_cluster = max_t(unsigned long, +						empty_cluster + empty_size, +					      block_group->full_stripe_len); +  			/* allocate a cluster in this block group */  			ret = btrfs_find_space_cluster(trans, root,  					       block_group, last_ptr,  					       search_start, num_bytes, -					       empty_cluster + empty_size); +					       aligned_cluster);  			if (ret == 0) {  				/*  				 * now pull our allocation out of this @@ -5835,7 +6007,8 @@ unclustered_alloc:  			goto loop;  		}  checks: -		search_start = stripe_align(root, offset); +		search_start = stripe_align(root, used_block_group, +					    offset, num_bytes);  		/* move on to the next group */  		if (search_start + num_bytes > @@ -5986,7 +6159,7 @@ again:  	if (ret == -ENOSPC) {  		if (!final_tried) {  			num_bytes = num_bytes >> 1; -			num_bytes = num_bytes & ~(root->sectorsize - 1); +			num_bytes = round_down(num_bytes, root->sectorsize);  			num_bytes = max(num_bytes, min_alloc_size);  			if (num_bytes == min_alloc_size)  				final_tried = true; @@ -6110,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); -	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); +	ret = update_block_group(root, ins->objectid, ins->offset, 1);  	if (ret) { /* -ENOENT, logic error */  		printk(KERN_ERR "btrfs update block group failed for %llu "  		       "%llu\n", (unsigned long long)ins->objectid, @@ -6174,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_free_path(path); -	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); +	ret = update_block_group(root, ins->objectid, ins->offset, 1);  	if (ret) { /* -ENOENT, logic error */  		printk(KERN_ERR "btrfs update block group failed for %llu "  		       "%llu\n", (unsigned long long)ins->objectid, @@ -6217,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  	u64 num_bytes = ins->offset;  	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); -	cache_block_group(block_group, trans, NULL, 0); +	
cache_block_group(block_group, 0);  	caching_ctl = get_caching_control(block_group);  	if (!caching_ctl) { @@ -6331,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	if (!ret)  		return block_rsv;  	if (ret && !block_rsv->failfast) { -		static DEFINE_RATELIMIT_STATE(_rs, -				DEFAULT_RATELIMIT_INTERVAL, -				/*DEFAULT_RATELIMIT_BURST*/ 2); -		if (__ratelimit(&_rs)) -			WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", -			     ret); +		if (btrfs_test_opt(root, ENOSPC_DEBUG)) { +			static DEFINE_RATELIMIT_STATE(_rs, +					DEFAULT_RATELIMIT_INTERVAL * 10, +					/*DEFAULT_RATELIMIT_BURST*/ 1); +			if (__ratelimit(&_rs)) +				WARN(1, KERN_DEBUG +					"btrfs: block rsv returned %d\n", ret); +		}  		ret = reserve_metadata_bytes(root, block_rsv, blocksize,  					     BTRFS_RESERVE_NO_FLUSH);  		if (!ret) { @@ -6402,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {  		struct btrfs_delayed_extent_op *extent_op; -		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +		extent_op = btrfs_alloc_delayed_extent_op();  		BUG_ON(!extent_op); /* -ENOMEM */  		if (key)  			memcpy(&extent_op->key, key, sizeof(extent_op->key)); @@ -6524,7 +6699,7 @@ reada:  }  /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree.   *   * when wc->stage == UPDATE_BACKREF, this function updates   * back refs for pointers in the block. @@ -6599,7 +6774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  }  /* - * hepler to process tree block pointer. + * helper to process tree block pointer.   *   * when wc->stage == DROP_REFERENCE, this function checks   * reference count of the block pointed to. if the block @@ -6737,7 +6912,7 @@ skip:  }  /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree.   *   * when wc->stage == DROP_REFERENCE, this function drops   * reference count on the block. 
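
This file also converts every kmalloc()/kfree() of a delayed-ref extent_op to btrfs_alloc_delayed_extent_op()/btrfs_free_delayed_extent_op(). The usual shape of such wrappers, a dedicated slab cache plus a NULL-tolerant free, is sketched below with invented names (my_extent_op and friends); the real allocator may differ in detail:

	#include <linux/slab.h>

	struct my_extent_op {			/* stand-in payload */
		u64 flags_to_set;
		int update_flags;
	};

	static struct kmem_cache *my_op_cachep;

	static int my_op_cache_init(void)
	{
		/* one fixed-size pool: denser packing and better debugging than kmalloc() */
		my_op_cachep = kmem_cache_create("my_extent_op",
						 sizeof(struct my_extent_op), 0, 0, NULL);
		return my_op_cachep ? 0 : -ENOMEM;
	}

	static struct my_extent_op *my_alloc_op(void)
	{
		return kmem_cache_alloc(my_op_cachep, GFP_NOFS);
	}

	static void my_free_op(struct my_extent_op *op)
	{
		if (op)				/* kfree()-style NULL tolerance */
			kmem_cache_free(my_op_cachep, op);
	}

Since kmem_cache_free() does not accept NULL, the wrapper's check lets callers free unconditionally on error paths, which matches how the hunks above call it.
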
@@ -6788,11 +6963,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,  						       &wc->flags[level]);  			if (ret < 0) {  				btrfs_tree_unlock_rw(eb, path->locks[level]); +				path->locks[level] = 0;  				return ret;  			}  			BUG_ON(wc->refs[level] == 0);  			if (wc->refs[level] == 1) {  				btrfs_tree_unlock_rw(eb, path->locks[level]); +				path->locks[level] = 0;  				return 1;  			}  		} @@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  		root->fs_info->fs_devices->missing_devices;  	stripped = BTRFS_BLOCK_GROUP_RAID0 | +		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |  		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;  	if (num_devices == 1) { @@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  		index = get_block_group_index(block_group);  	} -	if (index == 0) { +	if (index == BTRFS_RAID_RAID10) {  		dev_min = 4;  		/* Divide by 2 */  		min_free >>= 1; -	} else if (index == 1) { +	} else if (index == BTRFS_RAID_RAID1) {  		dev_min = 2; -	} else if (index == 2) { +	} else if (index == BTRFS_RAID_DUP) {  		/* Multiply by 2 */  		min_free <<= 1; -	} else if (index == 3) { +	} else if (index == BTRFS_RAID_RAID0) {  		dev_min = fs_devices->rw_devices;  		do_div(min_free, dev_min);  	} @@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  		space_info = list_entry(info->space_info.next,  					struct btrfs_space_info,  					list); -		if (space_info->bytes_pinned > 0 || -		    space_info->bytes_reserved > 0 || -		    space_info->bytes_may_use > 0) { -			WARN_ON(1); -			dump_space_info(space_info, 0, 0); +		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { +			if (space_info->bytes_pinned > 0 || +			    space_info->bytes_reserved > 0 || +			    space_info->bytes_may_use > 0) { +				WARN_ON(1); +				dump_space_info(space_info, 0, 0); +			}  		}  		list_del(&space_info->list);  		kfree(space_info); @@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		btrfs_release_path(path);  		cache->flags = btrfs_block_group_flags(&cache->item);  		cache->sectorsize = root->sectorsize; - +		cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       found_key.objectid);  		btrfs_init_free_space_ctl(cache);  		/* @@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		if (!(get_alloc_profile(root, space_info->flags) &  		      (BTRFS_BLOCK_GROUP_RAID10 |  		       BTRFS_BLOCK_GROUP_RAID1 | +		       BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 |  		       BTRFS_BLOCK_GROUP_DUP)))  			continue;  		/* @@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;  	cache->sectorsize = root->sectorsize;  	cache->fs_info = root->fs_info; +	cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       chunk_offset);  	atomic_set(&cache->count, 1);  	spin_lock_init(&cache->lock); @@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)  	u64 extra_flags = chunk_to_extended(flags) &  				BTRFS_EXTENDED_PROFILE_MASK; +	write_seqlock(&fs_info->profiles_lock);  	if (flags & BTRFS_BLOCK_GROUP_DATA)  		fs_info->avail_data_alloc_bits &= ~extra_flags;  	if (flags & BTRFS_BLOCK_GROUP_METADATA)  		fs_info->avail_metadata_alloc_bits &= ~extra_flags;  	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)  		
fs_info->avail_system_alloc_bits &= ~extra_flags; +	write_sequnlock(&fs_info->profiles_lock);  }  int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	spin_lock(&root->fs_info->block_group_cache_lock);  	rb_erase(&block_group->cache_node,  		 &root->fs_info->block_group_cache_tree); + +	if (root->fs_info->first_logical_byte == block_group->key.objectid) +		root->fs_info->first_logical_byte = (u64)-1;  	spin_unlock(&root->fs_info->block_group_cache_lock);  	down_write(&block_group->space_info->groups_sem); @@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)  		if (end - start >= range->minlen) {  			if (!block_group_cache_done(cache)) { -				ret = cache_block_group(cache, NULL, root, 0); +				ret = cache_block_group(cache, 0);  				if (!ret)  					wait_block_group_cache_done(cache);  			} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29ee..f173c5af646 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,7 +4,6 @@  #include <linux/mm.h>  #include <linux/pagemap.h>  #include <linux/page-flags.h> -#include <linux/module.h>  #include <linux/spinlock.h>  #include <linux/blkdev.h>  #include <linux/swap.h> @@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,   */  static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)  { -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 end = start + PAGE_CACHE_SIZE - 1;  	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))  		SetPageUptodate(page); @@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)   */  static void check_page_locked(struct extent_io_tree *tree, struct page *page)  { -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 end = start + PAGE_CACHE_SIZE - 1;  	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))  		unlock_page(page); @@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,  	if (ret)  		err = ret; -	if (did_repair) { -		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, -					rec->start + rec->len - 1, -					EXTENT_DAMAGED, GFP_NOFS); -		if (ret && !err) -			err = ret; -	} +	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, +				rec->start + rec->len - 1, +				EXTENT_DAMAGED, GFP_NOFS); +	if (ret && !err) +		err = ret;  	kfree(rec);  	return err; @@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,  	u64 map_length = 0;  	u64 sector;  	struct btrfs_bio *bbio = NULL; +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	int ret;  	BUG_ON(!mirror_num); +	/* we can't repair anything in raid56 yet */ +	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) +		return 0; +  	bio = bio_alloc(GFP_NOFS, 1);  	if (!bio)  		return -EIO; @@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,  		return -EIO;  	}  	bio->bi_bdev = dev->bdev; -	bio_add_page(bio, page, length, start-page_offset(page)); +	bio_add_page(bio, page, length, start - page_offset(page));  	btrfsic_submit_bio(WRITE_SYNC, bio);  	wait_for_completion(&compl); @@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)  						failrec->failed_mirror);  			did_repair = !ret;  		} +		ret = 0;  	}  out: @@ -2293,8 +2296,7 @@ 
static void end_bio_extent_writepage(struct bio *bio, int err)  		struct page *page = bvec->bv_page;  		tree = &BTRFS_I(page->mapping->host)->io_tree; -		start = ((u64)page->index << PAGE_CACHE_SHIFT) + -			 bvec->bv_offset; +		start = page_offset(page) + bvec->bv_offset;  		end = start + bvec->bv_len - 1;  		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  			 (long int)bio->bi_bdev);  		tree = &BTRFS_I(page->mapping->host)->io_tree; -		start = ((u64)page->index << PAGE_CACHE_SHIFT) + -			bvec->bv_offset; +		start = page_offset(page) + bvec->bv_offset;  		end = start + bvec->bv_len - 1;  		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,  	struct extent_io_tree *tree = bio->bi_private;  	u64 start; -	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; +	start = page_offset(page) + bvec->bv_offset;  	bio->bi_private = NULL; @@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,  	return ret;  } -static int merge_bio(struct extent_io_tree *tree, struct page *page, +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,  		     unsigned long offset, size_t size, struct bio *bio,  		     unsigned long bio_flags)  {  	int ret = 0;  	if (tree->ops && tree->ops->merge_bio_hook) -		ret = tree->ops->merge_bio_hook(page, offset, size, bio, +		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,  						bio_flags);  	BUG_ON(ret < 0);  	return ret; @@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,  				sector;  		if (prev_bio_flags != bio_flags || !contig || -		    merge_bio(tree, page, offset, page_size, bio, bio_flags) || +		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||  		    bio_add_page(bio, page, page_size, offset) < page_size) {  			ret = submit_one_bio(rw, bio, mirror_num,  					     prev_bio_flags); @@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  				   unsigned long *bio_flags)  {  	struct inode *inode = page->mapping->host; -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 page_end = start + PAGE_CACHE_SIZE - 1;  	u64 end;  	u64 cur = start; @@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  		}  	}  	while (cur <= end) { +		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; +  		if (cur >= last_byte) {  			char *userpage;  			struct extent_state *cached = NULL; @@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  		iosize = min(extent_map_end(em) - cur, end - cur + 1);  		cur_end = min(extent_map_end(em) - 1, end); -		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); +		iosize = ALIGN(iosize, blocksize);  		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {  			disk_io_size = em->block_len;  			sector = em->block_start >> 9; @@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  			continue;  		} -		ret = 0; -		if (tree->ops && tree->ops->readpage_io_hook) { -			ret = tree->ops->readpage_io_hook(page, cur, -							  cur + iosize - 1); -		} -		if (!ret) { -			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; -			pnr -= page->index; -			ret = submit_extent_page(READ, tree, page, +		pnr -= page->index; +		ret = submit_extent_page(READ, tree, page,  					 sector, 
disk_io_size, pg_offset,  					 bdev, bio, pnr,  					 end_bio_extent_readpage, mirror_num,  					 *bio_flags,  					 this_bio_flag); -			if (!ret) { -				nr++; -				*bio_flags = this_bio_flag; -			} -		} -		if (ret) { +		if (!ret) { +			nr++; +			*bio_flags = this_bio_flag; +		} else {  			SetPageError(page);  			unlock_extent(tree, cur, cur + iosize - 1);  		} @@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	struct inode *inode = page->mapping->host;  	struct extent_page_data *epd = data;  	struct extent_io_tree *tree = epd->tree; -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 delalloc_start;  	u64 page_end = start + PAGE_CACHE_SIZE - 1;  	u64 end; @@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		BUG_ON(extent_map_end(em) <= cur);  		BUG_ON(end < cur);  		iosize = min(extent_map_end(em) - cur, end - cur + 1); -		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); +		iosize = ALIGN(iosize, blocksize);  		sector = (em->block_start + extent_offset) >> 9;  		bdev = em->bdev;  		block_start = em->block_start; @@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,  		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);  		spin_unlock(&eb->refs_lock);  		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); -		spin_lock(&fs_info->delalloc_lock); -		if (fs_info->dirty_metadata_bytes >= eb->len) -			fs_info->dirty_metadata_bytes -= eb->len; -		else -			WARN_ON(1); -		spin_unlock(&fs_info->delalloc_lock); +		__percpu_counter_add(&fs_info->dirty_metadata_bytes, +				     -eb->len, +				     fs_info->dirty_metadata_batch);  		ret = 1;  	} else {  		spin_unlock(&eb->refs_lock); @@ -3446,15 +3437,9 @@ retry:  			 * swizzled back from swapper_space to tmpfs file  			 * mapping  			 */ -			if (tree->ops && -			    tree->ops->write_cache_pages_lock_hook) { -				tree->ops->write_cache_pages_lock_hook(page, -							       data, flush_fn); -			} else { -				if (!trylock_page(page)) { -					flush_fn(data); -					lock_page(page); -				} +			if (!trylock_page(page)) { +				flush_fn(data); +				lock_page(page);  			}  			if (unlikely(page->mapping != mapping)) { @@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,  			  struct page *page, unsigned long offset)  {  	struct extent_state *cached_state = NULL; -	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); +	u64 start = page_offset(page);  	u64 end = start + PAGE_CACHE_SIZE - 1;  	size_t blocksize = page->mapping->host->i_sb->s_blocksize; -	start += (offset + blocksize - 1) & ~(blocksize - 1); +	start += ALIGN(offset, blocksize);  	if (start > end)  		return 0; @@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,  			     struct extent_io_tree *tree, struct page *page,  			     gfp_t mask)  { -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 end = start + PAGE_CACHE_SIZE - 1;  	int ret = 1; @@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,  			       gfp_t mask)  {  	struct extent_map *em; -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +	u64 start = page_offset(page);  	u64 end = start + PAGE_CACHE_SIZE - 1;  	if ((mask & __GFP_WAIT) && @@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,  		len = last - offset;  		if (len == 0)  			break; -		len = (len + sectorsize - 1) & ~(sectorsize - 1); +		len = 
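The dirty_metadata_bytes hunk trades a globally locked u64 for __percpu_counter_add() with a batch size: each CPU accumulates a private delta and folds it into the shared total only once the delta crosses the batch, so hot paths stop contending on a single lock. A rough single-threaded user-space sketch of that batching idea (the struct, function names and the 4096 batch are illustrative, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

struct batch_counter {
	atomic_llong total;	/* shared, potentially contended */
	long long batch;	/* flush threshold */
};

struct batch_ctx {
	long long local;	/* private delta, cheap to update */
};

static void counter_add(struct batch_counter *c, struct batch_ctx *ctx,
			long long delta)
{
	ctx->local += delta;
	if (ctx->local >= c->batch || ctx->local <= -c->batch) {
		atomic_fetch_add(&c->total, ctx->local);	/* rare */
		ctx->local = 0;
	}
}

int main(void)
{
	struct batch_counter c = { .batch = 4096 };
	struct batch_ctx ctx = { 0 };

	atomic_init(&c.total, 0);
	for (int i = 0; i < 1000; i++)
		counter_add(&c, &ctx, 100);	/* mostly touches ctx only */
	atomic_fetch_add(&c.total, ctx.local);	/* final fold */
	printf("total=%lld\n", (long long)atomic_load(&c.total));
	return 0;
}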
ALIGN(len, sectorsize);  		em = get_extent(inode, NULL, 0, offset, len, 0);  		if (IS_ERR_OR_NULL(em))  			return em; @@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)  	list_del(&eb->leak_list);  	spin_unlock_irqrestore(&leak_lock, flags);  #endif -	if (eb->pages && eb->pages != eb->inline_pages) -		kfree(eb->pages);  	kmem_cache_free(extent_buffer_cache, eb);  } @@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	atomic_set(&eb->refs, 1);  	atomic_set(&eb->io_pages, 0); -	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { -		struct page **pages; -		int num_pages = (len + PAGE_CACHE_SIZE - 1) >> -			PAGE_CACHE_SHIFT; -		pages = kzalloc(num_pages, mask); -		if (!pages) { -			__free_extent_buffer(eb); -			return NULL; -		} -		eb->pages = pages; -	} else { -		eb->pages = eb->inline_pages; -	} +	/* +	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages +	 */ +	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE +		> MAX_INLINE_EXTENT_BUFFER_SIZE); +	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);  	return eb;  } @@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)  static void check_buffer_tree_ref(struct extent_buffer *eb)  { +	int refs;  	/* the ref bit is tricky.  We have to make sure it is set  	 * if we have the buffer dirty.   Otherwise the  	 * code to free a buffer can end up dropping a dirty @@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)  	 * So bump the ref count first, then set the bit.  If someone  	 * beat us to it, drop the ref we added.  	 */ +	refs = atomic_read(&eb->refs); +	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) +		return; +  	spin_lock(&eb->refs_lock);  	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))  		atomic_inc(&eb->refs); @@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)  void free_extent_buffer(struct extent_buffer *eb)  { +	int refs; +	int old;  	if (!eb)  		return; +	while (1) { +		refs = atomic_read(&eb->refs); +		if (refs <= 3) +			break; +		old = atomic_cmpxchg(&eb->refs, refs, refs - 1); +		if (old == refs) +			return; +	} +  	spin_lock(&eb->refs_lock);  	if (atomic_read(&eb->refs) == 2 &&  	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd326..6068a198556 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,10 +72,9 @@ struct extent_io_ops {  	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);  	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);  	extent_submit_bio_hook_t *submit_bio_hook; -	int (*merge_bio_hook)(struct page *page, unsigned long offset, +	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,  			      size_t size, struct bio *bio,  			      unsigned long bio_flags); -	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);  	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);  	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,  				    struct extent_state *state, int mirror); @@ -90,8 +89,6 @@ struct extent_io_ops {  				  struct extent_state *other);  	void (*split_extent_hook)(struct inode *inode,  				  struct extent_state *orig, u64 split); -	int (*write_cache_pages_lock_hook)(struct page *page, void *data, -					   void (*flush_fn)(void *));  };  struct extent_io_tree { @@ -161,8 +158,7 @@ struct extent_buffer {  	 */  	
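check_buffer_tree_ref() and free_extent_buffer() both gain lockless fast paths here: while the refcount is clearly high, a reference can be dropped with atomic_cmpxchg(), and refs_lock is only taken once the count is low enough that the buffer might actually be freed. A compact sketch of that pattern with C11 atomics (the threshold of 3 mirrors the patch; everything else is illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int refs;

/* returns 1 if a reference was dropped without locking,
 * 0 if the caller must take the lock and recheck */
static int put_ref_fast(void)
{
	int old = atomic_load(&refs);

	while (old > 3) {
		/* on failure 'old' is reloaded and the loop retries */
		if (atomic_compare_exchange_weak(&refs, &old, old - 1))
			return 1;
	}
	return 0;	/* slow path: lock, recheck, maybe free */
}

int main(void)
{
	atomic_init(&refs, 10);
	while (put_ref_fast())
		;
	printf("slow path reached at refs=%d\n", atomic_load(&refs));
	return 0;
}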
wait_queue_head_t read_lock_wq;  	wait_queue_head_t lock_wq; -	struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; -	struct page **pages; +	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];  };  static inline void extent_set_compress_type(unsigned long *bio_flags, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f169d6b11d7..2834ca5768e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,6 +1,5 @@  #include <linux/err.h>  #include <linux/slab.h> -#include <linux/module.h>  #include <linux/spinlock.h>  #include <linux/hardirq.h>  #include "ctree.h" @@ -171,6 +170,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)  	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))  		return 0; +	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || +	    test_bit(EXTENT_FLAG_LOGGING, &next->flags)) +		return 0; +  	if (extent_map_end(prev) == next->start &&  	    prev->flags == next->flags &&  	    prev->bdev == next->bdev && @@ -255,7 +258,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,  	if (!em)  		goto out; -	list_move(&em->list, &tree->modified_extents); +	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) +		list_move(&em->list, &tree->modified_extents);  	em->generation = gen;  	clear_bit(EXTENT_FLAG_PINNED, &em->flags);  	em->mod_start = em->start; @@ -280,6 +284,13 @@ out:  } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ +	clear_bit(EXTENT_FLAG_LOGGING, &em->flags); +	if (em->in_tree) +		try_merge_map(tree, em); +} +  /**   * add_extent_mapping - add new extent map to the extent tree   * @tree:	tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e..c6598c89cff 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);  int __init extent_map_init(void);  void extent_map_exit(void);  int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);  struct extent_map *search_extent_mapping(struct extent_map_tree *tree,  					 u64 start, u64 len);  #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef4235..ec160202be3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,  		if (!contig)  			offset = page_offset(bvec->bv_page) + bvec->bv_offset; -		if (!contig && (offset >= ordered->file_offset + ordered->len || -		    offset < ordered->file_offset)) { +		if (offset >= ordered->file_offset + ordered->len || +		    offset < ordered->file_offset) {  			unsigned long bytes_left;  			sums->len = this_sum_bytes;  			this_sum_bytes = 0; @@ -684,6 +684,24 @@ out:  	return ret;  } +static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, +				 struct btrfs_sector_sum *sector_sum, +				 u64 total_bytes, u64 sectorsize) +{ +	u64 tmp = sectorsize; +	u64 next_sector = sector_sum->bytenr; +	struct btrfs_sector_sum *next = sector_sum + 1; + +	while ((tmp + total_bytes) < sums->len) { +		if (next_sector + sectorsize != next->bytenr) +			break; +		tmp += sectorsize; +		next_sector = next->bytenr; +		next++; +	} +	return tmp; +} +  int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct btrfs_ordered_sum *sums) @@ -789,20 +807,32 @@ again:  		goto insert;  	} -	if (csum_offset >= btrfs_item_size_nr(leaf, 
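The new btrfs_sector_sum_left() helper in file-item.c factors out a loop that used to live only in the insert path: starting from the current checksum, it counts how many bytes of checksums cover physically contiguous sectors, stopping at the first gap. That count then drives how far an existing csum item is extended. A user-space sketch of the same walk (struct and field names are simplified stand-ins for the btrfs ones):

#include <stdint.h>
#include <stdio.h>

struct sector_sum { uint64_t bytenr; };	/* one checksummed sector */

static uint64_t sector_sum_left(const struct sector_sum *cur,
				uint64_t total_bytes, uint64_t sums_len,
				uint64_t sectorsize)
{
	uint64_t tmp = sectorsize;
	uint64_t next_sector = cur->bytenr;
	const struct sector_sum *next = cur + 1;

	while (tmp + total_bytes < sums_len) {
		if (next_sector + sectorsize != next->bytenr)
			break;			/* physical gap: stop */
		tmp += sectorsize;
		next_sector = next->bytenr;
		next++;
	}
	return tmp;
}

int main(void)
{
	/* three contiguous 4 KiB sectors, then a gap */
	struct sector_sum sums[] = {
		{ 40960 }, { 45056 }, { 49152 }, { 81920 },
	};

	printf("%llu contiguous bytes\n", (unsigned long long)
	       sector_sum_left(sums, 0, 4 * 4096, 4096));	/* -> 12288 */
	return 0;
}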
path->slots[0]) / +	if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /  	    csum_size) { -		u32 diff = (csum_offset + 1) * csum_size; +		int extend_nr; +		u64 tmp; +		u32 diff; +		u32 free_space; -		/* -		 * is the item big enough already?  we dropped our lock -		 * before and need to recheck -		 */ -		if (diff < btrfs_item_size_nr(leaf, path->slots[0])) -			goto csum; +		if (btrfs_leaf_free_space(root, leaf) < +				 sizeof(struct btrfs_item) + csum_size * 2) +			goto insert; + +		free_space = btrfs_leaf_free_space(root, leaf) - +					 sizeof(struct btrfs_item) - csum_size; +		tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, +					    root->sectorsize); +		tmp >>= root->fs_info->sb->s_blocksize_bits; +		WARN_ON(tmp < 1); + +		extend_nr = max_t(int, 1, (int)tmp); +		diff = (csum_offset + extend_nr) * csum_size; +		diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);  		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); -		if (diff != csum_size) -			goto insert; +		diff = min(free_space, diff); +		diff /= csum_size; +		diff *= csum_size;  		btrfs_extend_item(trans, root, path, diff);  		goto csum; @@ -812,19 +842,14 @@ insert:  	btrfs_release_path(path);  	csum_offset = 0;  	if (found_next) { -		u64 tmp = total_bytes + root->sectorsize; -		u64 next_sector = sector_sum->bytenr; -		struct btrfs_sector_sum *next = sector_sum + 1; +		u64 tmp; -		while (tmp < sums->len) { -			if (next_sector + root->sectorsize != next->bytenr) -				break; -			tmp += root->sectorsize; -			next_sector = next->bytenr; -			next++; -		} -		tmp = min(tmp, next_offset - file_key.offset); +		tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, +					    root->sectorsize);  		tmp >>= root->fs_info->sb->s_blocksize_bits; +		tmp = min(tmp, (next_offset - file_key.offset) >> +					 root->fs_info->sb->s_blocksize_bits); +  		tmp = max((u64)1, tmp);  		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));  		ins_size = csum_size * tmp; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77061bf43ed..af1d0605a5c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,11 +30,11 @@  #include <linux/statfs.h>  #include <linux/compat.h>  #include <linux/slab.h> +#include <linux/btrfs.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h"  #include "btrfs_inode.h" -#include "ioctl.h"  #include "print-tree.h"  #include "tree-log.h"  #include "locking.h" @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	struct btrfs_key key;  	struct btrfs_ioctl_defrag_range_args range;  	int num_defrag; +	int index; +	int ret;  	/* get the inode */  	key.objectid = defrag->root;  	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);  	key.offset = (u64)-1; + +	index = srcu_read_lock(&fs_info->subvol_srcu); +  	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);  	if (IS_ERR(inode_root)) { -		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -		return PTR_ERR(inode_root); +		ret = PTR_ERR(inode_root); +		goto cleanup; +	} +	if (btrfs_root_refs(&inode_root->root_item) == 0) { +		ret = -ENOENT; +		goto cleanup;  	}  	key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	key.offset = 0;  	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);  	if (IS_ERR(inode)) { -		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -		return PTR_ERR(inode); +		ret = PTR_ERR(inode); +		goto cleanup;  	} +	srcu_read_unlock(&fs_info->subvol_srcu, index);  	/* do a chunk of defrag */  	
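The defrag lookup above is now bracketed by srcu_read_lock(&fs_info->subvol_srcu) plus a btrfs_root_refs() check, so a subvolume in the middle of deletion can no longer be resolved and used after its root has started dying. SRCU readers may sleep, so a user-space rwlock is only a loose analogue, but it shows the shape of the pattern (names and the root_refs stand-in are illustrative):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t subvol_lock = PTHREAD_RWLOCK_INITIALIZER;
static int root_refs = 1;	/* stand-in for btrfs_root_refs() */

static int lookup_and_use_root(void)
{
	int ret = 0;

	pthread_rwlock_rdlock(&subvol_lock);	/* ~ srcu_read_lock() */
	if (root_refs == 0) {
		ret = -ENOENT;	/* root is going away: bail out */
		goto out;
	}
	/* ... resolve the root and grab the inode here ... */
out:
	pthread_rwlock_unlock(&subvol_lock);	/* ~ srcu_read_unlock() */
	return ret;
}

int main(void)
{
	printf("lookup: %d\n", lookup_and_use_root());
	root_refs = 0;		/* deletion has started */
	printf("lookup: %d\n", lookup_and_use_root());
	return 0;
}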
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	iput(inode);  	return 0; +cleanup: +	srcu_read_unlock(&fs_info->subvol_srcu, index); +	kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +	return ret;  }  /* @@ -360,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  	atomic_inc(&fs_info->defrag_running);  	while(1) { +		/* Pause the auto defragger. */ +		if (test_bit(BTRFS_FS_STATE_REMOUNTING, +			     &fs_info->fs_state)) +			break; +  		if (!__need_auto_defrag(fs_info->tree_root))  			break; @@ -491,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,  	loff_t isize = i_size_read(inode);  	start_pos = pos & ~((u64)root->sectorsize - 1); -	num_bytes = (write_bytes + pos - start_pos + -		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1); +	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);  	end_of_last_block = start_pos + num_bytes - 1;  	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -1211,7 +1229,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,  	struct extent_state *cached_state = NULL;  	int i;  	unsigned long index = pos >> PAGE_CACHE_SHIFT; -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	int err = 0;  	int faili = 0; @@ -1298,7 +1316,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,  					       struct iov_iter *i,  					       loff_t pos)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct page **pages = NULL;  	unsigned long first_index; @@ -1486,7 +1504,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  				    unsigned long nr_segs, loff_t pos)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	loff_t *ppos = &iocb->ki_pos;  	u64 start_pos; @@ -1530,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  	 * although we have opened a file as writable, we have  	 * to stop this write operation to ensure FS consistency.  	 */ -	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {  		mutex_unlock(&inode->i_mutex);  		err = -EROFS;  		goto out; @@ -1594,9 +1612,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  		if (err < 0 && num_written > 0)  			num_written = err;  	} -out: +  	if (sync)  		atomic_dec(&BTRFS_I(inode)->sync_writers); +out:  	sb_end_write(inode->i_sb);  	current->backing_dev_info = NULL;  	return num_written ? num_written : err; @@ -1612,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)  	 */  	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,  			       &BTRFS_I(inode)->runtime_flags)) { -		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); +		struct btrfs_trans_handle *trans; +		struct btrfs_root *root = BTRFS_I(inode)->root; + +		/* +		 * We need to block on a committing transaction to keep us from +		 * throwing a ordered operation on to the list and causing +		 * something like sync to deadlock trying to flush out this +		 * inode. 
+		 */ +		trans = btrfs_start_transaction(root, 0); +		if (IS_ERR(trans)) +			return PTR_ERR(trans); +		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); +		btrfs_end_transaction(trans, root);  		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)  			filemap_flush(inode->i_mapping);  	} @@ -1639,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret = 0;  	struct btrfs_trans_handle *trans; +	bool full_sync = 0;  	trace_btrfs_sync_file(file, datasync);  	/*  	 * We write the dirty pages in the range and wait until they complete  	 * out of the ->i_mutex. If so, we can flush the dirty pages by -	 * multi-task, and make the performance up. +	 * multi-task, and make the performance up.  See +	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.  	 */  	atomic_inc(&BTRFS_I(inode)->sync_writers); -	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +	ret = filemap_fdatawrite_range(inode->i_mapping, start, end); +	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +			     &BTRFS_I(inode)->runtime_flags)) +		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);  	atomic_dec(&BTRFS_I(inode)->sync_writers);  	if (ret)  		return ret; @@ -1660,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 * range being left.  	 */  	atomic_inc(&root->log_batch); -	btrfs_wait_ordered_range(inode, start, end - start + 1); +	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			     &BTRFS_I(inode)->runtime_flags); +	if (full_sync) +		btrfs_wait_ordered_range(inode, start, end - start + 1);  	atomic_inc(&root->log_batch);  	/* @@ -1727,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	if (ret != BTRFS_NO_LOG_SYNC) {  		if (ret > 0) { +			/* +			 * If we didn't already wait for ordered extents we need +			 * to do that now. 
+			 */ +			if (!full_sync) +				btrfs_wait_ordered_range(inode, start, +							 end - start + 1);  			ret = btrfs_commit_transaction(trans, root);  		} else {  			ret = btrfs_sync_log(trans, root); -			if (ret == 0) +			if (ret == 0) {  				ret = btrfs_end_transaction(trans, root); -			else +			} else { +				if (!full_sync) +					btrfs_wait_ordered_range(inode, start, +								 end - +								 start + 1);  				ret = btrfs_commit_transaction(trans, root); +			}  		}  	} else {  		ret = btrfs_end_transaction(trans, root); @@ -2087,7 +2139,7 @@ out:  static long btrfs_fallocate(struct file *file, int mode,  			    loff_t offset, loff_t len)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct extent_state *cached_state = NULL;  	u64 cur_offset;  	u64 last_byte; @@ -2241,6 +2293,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)  	if (lockend <= lockstart)  		lockend = lockstart + root->sectorsize; +	lockend--;  	len = lockend - lockstart + 1;  	len = max_t(u64, len, root->sectorsize); @@ -2307,9 +2360,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)  					}  				} -				*offset = start; -				free_extent_map(em); -				break; +				if (!test_bit(EXTENT_FLAG_PREALLOC, +					      &em->flags)) { +					*offset = start; +					free_extent_map(em); +					break; +				}  			}  		} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c..1f84fc09c1a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)  	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;  	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); +	max_bitmaps = max(max_bitmaps, 1); +  	BUG_ON(ctl->total_bitmaps > max_bitmaps);  	/* @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,  }  static struct btrfs_free_space * -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +		unsigned long align)  {  	struct btrfs_free_space *entry;  	struct rb_node *node; +	u64 ctl_off; +	u64 tmp; +	u64 align_off;  	int ret;  	if (!ctl->free_space_offset.rb_node) @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)  		if (entry->bytes < *bytes)  			continue; +		/* make sure the space returned is big enough +		 * to match our requested alignment +		 */ +		if (*bytes >= align) { +			ctl_off = entry->offset - ctl->start; +			tmp = ctl_off + align - 1;; +			do_div(tmp, align); +			tmp = tmp * align + ctl->start; +			align_off = tmp - entry->offset; +		} else { +			align_off = 0; +			tmp = entry->offset; +		} + +		if (entry->bytes < *bytes + align_off) +			continue; +  		if (entry->bitmap) { -			ret = search_bitmap(ctl, entry, offset, bytes); -			if (!ret) +			ret = search_bitmap(ctl, entry, &tmp, bytes); +			if (!ret) { +				*offset = tmp;  				return entry; +			}  			continue;  		} -		*offset = entry->offset; -		*bytes = entry->bytes; +		*offset = tmp; +		*bytes = entry->bytes - align_off;  		return entry;  	} @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,  	}  	/* -	 * some block groups are so tiny they can't be enveloped by a bitmap, so -	 * don't even bother to create a bitmap for this +	 * The original block groups from mkfs can be really small, like 8 +	 * megabytes, so 
don't bother with a bitmap for those entries.  However +	 * some block groups can be smaller than what a bitmap would cover but +	 * are still large enough that they could overflow the 32k memory limit, +	 * so allow those block groups to still be allowed to have a bitmap +	 * entry.  	 */ -	if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) +	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)  		return false;  	return true; @@ -1862,11 +1891,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  {  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *info; -	int ret = 0; +	int ret; +	bool re_search = false;  	spin_lock(&ctl->tree_lock);  again: +	ret = 0;  	if (!bytes)  		goto out_lock; @@ -1879,17 +1910,17 @@ again:  		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),  					  1, 0);  		if (!info) { -			/* the tree logging code might be calling us before we -			 * have fully loaded the free space rbtree for this -			 * block group.  So it is possible the entry won't -			 * be in the rbtree yet at all.  The caching code -			 * will make sure not to put it in the rbtree if -			 * the logging code has pinned it. +			/* +			 * If we found a partial bit of our free space in a +			 * bitmap but then couldn't find the other part this may +			 * be a problem, so WARN about it.  			 */ +			WARN_ON(re_search);  			goto out_lock;  		}  	} +	re_search = false;  	if (!info->bitmap) {  		unlink_free_space(ctl, info);  		if (offset == info->offset) { @@ -1935,8 +1966,10 @@ again:  	}  	ret = remove_from_bitmap(ctl, info, &offset, &bytes); -	if (ret == -EAGAIN) +	if (ret == -EAGAIN) { +		re_search = true;  		goto again; +	}  	BUG_ON(ret); /* logic error */  out_lock:  	spin_unlock(&ctl->tree_lock); @@ -2091,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  	struct btrfs_free_space *entry = NULL;  	u64 bytes_search = bytes + empty_size;  	u64 ret = 0; +	u64 align_gap = 0; +	u64 align_gap_len = 0;  	spin_lock(&ctl->tree_lock); -	entry = find_free_space(ctl, &offset, &bytes_search); +	entry = find_free_space(ctl, &offset, &bytes_search, +				block_group->full_stripe_len);  	if (!entry)  		goto out; @@ -2103,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  		if (!entry->bytes)  			free_bitmap(ctl, entry);  	} else { +  		unlink_free_space(ctl, entry); -		entry->offset += bytes; -		entry->bytes -= bytes; +		align_gap_len = offset - entry->offset; +		align_gap = entry->offset; + +		entry->offset = offset + bytes; +		WARN_ON(entry->bytes < bytes + align_gap_len); + +		entry->bytes -= bytes + align_gap_len;  		if (!entry->bytes)  			kmem_cache_free(btrfs_free_space_cachep, entry);  		else @@ -2115,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  out:  	spin_unlock(&ctl->tree_lock); +	if (align_gap_len) +		__btrfs_add_free_space(ctl, align_gap, align_gap_len);  	return ret;  } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86b..c226daefd65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,12 +39,13 @@  #include <linux/slab.h>  #include <linux/ratelimit.h>  #include <linux/mount.h> +#include <linux/btrfs.h> +#include <linux/blkdev.h>  #include "compat.h"  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h"  #include "btrfs_inode.h" -#include "ioctl.h"  #include "print-tree.h"  #include "ordered-data.h"  #include "xattr.h" @@ -54,6 +55,7 @@  #include 
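The find_free_space()/btrfs_find_space_for_alloc() hunks above teach the free-space allocator about alignment: a candidate entry is rounded up to the requested alignment (the block group's full_stripe_len) and whatever is skipped at the front of the entry is handed back as free space afterwards. The arithmetic, with do_div() written as plain 64-bit division, looks like this (function and variable names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* round an entry up to 'align' relative to the block group start and
 * report how many bytes at the front of the entry get skipped */
static uint64_t align_entry(uint64_t entry_offset, uint64_t ctl_start,
			    uint64_t align, uint64_t *align_off)
{
	uint64_t ctl_off = entry_offset - ctl_start;
	uint64_t tmp = (ctl_off + align - 1) / align;	/* round up */

	tmp = tmp * align + ctl_start;	/* first aligned offset */
	*align_off = tmp - entry_offset; /* unused bytes at the front */
	return tmp;
}

int main(void)
{
	uint64_t gap;
	/* entry at 70 KiB in a group starting at 64 KiB, 64 KiB stripes */
	uint64_t off = align_entry(70 << 10, 64 << 10, 64 << 10, &gap);

	printf("aligned offset=%llu skipped=%llu\n",
	       (unsigned long long)off, (unsigned long long)gap);
	return 0;
}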
"locking.h"  #include "free-space-cache.h"  #include "inode-map.h" +#include "backref.h"  struct btrfs_iget_args {  	u64 ino; @@ -88,7 +90,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {  	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,  }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr);  static int btrfs_truncate(struct inode *inode);  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);  static noinline int cow_file_range(struct inode *inode, @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  	u64 isize = i_size_read(inode);  	u64 actual_end = min(end + 1, isize);  	u64 inline_len = actual_end - start; -	u64 aligned_end = (end + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); +	u64 aligned_end = ALIGN(end, root->sectorsize);  	u64 data_len = inline_len;  	int ret; @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  		return 1;  	} +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);  	btrfs_delalloc_release_metadata(inode, end + 1 - start);  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);  	return 0; @@ -389,7 +391,7 @@ again:  	 * a compressed extent to 128k.  	 */  	total_compressed = min(total_compressed, max_uncompressed); -	num_bytes = (end - start + blocksize) & ~(blocksize - 1); +	num_bytes = ALIGN(end - start + 1, blocksize);  	num_bytes = max(blocksize,  num_bytes);  	total_in = 0;  	ret = 0; @@ -488,15 +490,13 @@ cont:  		 * up to a block size boundary so the allocator does sane  		 * things  		 */ -		total_compressed = (total_compressed + blocksize - 1) & -			~(blocksize - 1); +		total_compressed = ALIGN(total_compressed, blocksize);  		/*  		 * one last check to make sure the compression is really a  		 * win, compare the page count read with the blocks on disk  		 */ -		total_in = (total_in + PAGE_CACHE_SIZE - 1) & -			~(PAGE_CACHE_SIZE - 1); +		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);  		if (total_compressed >= total_in) {  			will_compress = 0;  		} else { @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,  	if (list_empty(&async_cow->extents))  		return 0; - +again:  	while (!list_empty(&async_cow->extents)) {  		async_extent = list_entry(async_cow->extents.next,  					  struct async_extent, list); @@ -648,6 +648,8 @@ retry:  						  async_extent->ram_size - 1,  						  btrfs_get_extent,  						  WB_SYNC_ALL); +			else if (ret) +				unlock_page(async_cow->locked_page);  			kfree(async_extent);  			cond_resched();  			continue; @@ -672,6 +674,7 @@ retry:  		if (ret) {  			int i; +  			for (i = 0; i < async_extent->nr_pages; i++) {  				WARN_ON(async_extent->pages[i]->mapping);  				page_cache_release(async_extent->pages[i]); @@ -679,12 +682,10 @@ retry:  			kfree(async_extent->pages);  			async_extent->nr_pages = 0;  			async_extent->pages = NULL; -			unlock_extent(io_tree, async_extent->start, -				      async_extent->start + -				      async_extent->ram_size - 1); +  			if (ret == -ENOSPC)  				goto retry; -			goto out_free; /* JDM: Requeue? 
*/ +			goto out_free;  		}  		/* @@ -696,10 +697,13 @@ retry:  					async_extent->ram_size - 1, 0);  		em = alloc_extent_map(); -		BUG_ON(!em); /* -ENOMEM */ +		if (!em) +			goto out_free_reserve;  		em->start = async_extent->start;  		em->len = async_extent->ram_size;  		em->orig_start = em->start; +		em->mod_start = em->start; +		em->mod_len = em->len;  		em->block_start = ins.objectid;  		em->block_len = ins.offset; @@ -726,6 +730,9 @@ retry:  						async_extent->ram_size - 1, 0);  		} +		if (ret) +			goto out_free_reserve; +  		ret = btrfs_add_ordered_extent_compress(inode,  						async_extent->start,  						ins.objectid, @@ -733,7 +740,8 @@ retry:  						ins.offset,  						BTRFS_ORDERED_COMPRESSED,  						async_extent->compress_type); -		BUG_ON(ret); /* -ENOMEM */ +		if (ret) +			goto out_free_reserve;  		/*  		 * clear dirty, set writeback and unlock the pages. @@ -754,18 +762,30 @@ retry:  				    ins.objectid,  				    ins.offset, async_extent->pages,  				    async_extent->nr_pages); - -		BUG_ON(ret); /* -ENOMEM */  		alloc_hint = ins.objectid + ins.offset;  		kfree(async_extent); +		if (ret) +			goto out;  		cond_resched();  	}  	ret = 0;  out:  	return ret; +out_free_reserve: +	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);  out_free: +	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, +				     async_extent->start, +				     async_extent->start + +				     async_extent->ram_size - 1, +				     NULL, EXTENT_CLEAR_UNLOCK_PAGE | +				     EXTENT_CLEAR_UNLOCK | +				     EXTENT_CLEAR_DELALLOC | +				     EXTENT_CLEAR_DIRTY | +				     EXTENT_SET_WRITEBACK | +				     EXTENT_END_WRITEBACK);  	kfree(async_extent); -	goto out; +	goto again;  }  static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,  	BUG_ON(btrfs_is_free_space_inode(inode)); -	num_bytes = (end - start + blocksize) & ~(blocksize - 1); +	num_bytes = ALIGN(end - start + 1, blocksize);  	num_bytes = max(blocksize,  num_bytes);  	disk_num_bytes = num_bytes; @@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,  		em->orig_start = em->start;  		ram_size = ins.offset;  		em->len = ins.offset; +		em->mod_start = em->start; +		em->mod_len = em->len;  		em->block_start = ins.objectid;  		em->block_len = ins.offset; @@ -1338,6 +1360,8 @@ out_check:  			em->block_start = disk_bytenr;  			em->orig_block_len = disk_num_bytes;  			em->bdev = root->fs_info->fs_devices->latest_bdev; +			em->mod_start = em->start; +			em->mod_len = em->len;  			set_bit(EXTENT_FLAG_PINNED, &em->flags);  			set_bit(EXTENT_FLAG_FILLING, &em->flags);  			em->generation = -1; @@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,  			spin_unlock(&BTRFS_I(inode)->lock);  		} -		spin_lock(&root->fs_info->delalloc_lock); +		__percpu_counter_add(&root->fs_info->delalloc_bytes, len, +				     root->fs_info->delalloc_batch); +		spin_lock(&BTRFS_I(inode)->lock);  		BTRFS_I(inode)->delalloc_bytes += len; -		root->fs_info->delalloc_bytes += len; -		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { -			list_add_tail(&BTRFS_I(inode)->delalloc_inodes, -				      &root->fs_info->delalloc_inodes); +		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, +					 &BTRFS_I(inode)->runtime_flags)) { +			spin_lock(&root->fs_info->delalloc_lock); +			if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +				list_add_tail(&BTRFS_I(inode)->delalloc_inodes, +					      
&root->fs_info->delalloc_inodes); +				set_bit(BTRFS_INODE_IN_DELALLOC_LIST, +					&BTRFS_I(inode)->runtime_flags); +			} +			spin_unlock(&root->fs_info->delalloc_lock);  		} -		spin_unlock(&root->fs_info->delalloc_lock); +		spin_unlock(&BTRFS_I(inode)->lock);  	}  } @@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,  		    && do_list)  			btrfs_free_reserved_data_space(inode, len); -		spin_lock(&root->fs_info->delalloc_lock); -		root->fs_info->delalloc_bytes -= len; +		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len, +				     root->fs_info->delalloc_batch); +		spin_lock(&BTRFS_I(inode)->lock);  		BTRFS_I(inode)->delalloc_bytes -= len; -  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { -			list_del_init(&BTRFS_I(inode)->delalloc_inodes); +		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST, +			     &BTRFS_I(inode)->runtime_flags)) { +			spin_lock(&root->fs_info->delalloc_lock); +			if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +				list_del_init(&BTRFS_I(inode)->delalloc_inodes); +				clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, +					  &BTRFS_I(inode)->runtime_flags); +			} +			spin_unlock(&root->fs_info->delalloc_lock);  		} -		spin_unlock(&root->fs_info->delalloc_lock); +		spin_unlock(&BTRFS_I(inode)->lock);  	}  } @@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure   * we don't create bios that span stripes or chunks   */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,  			 size_t size, struct bio *bio,  			 unsigned long bio_flags)  { @@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  	length = bio->bi_size;  	map_length = length; -	ret = btrfs_map_block(root->fs_info, READ, logical, +	ret = btrfs_map_block(root->fs_info, rw, logical,  			      &map_length, NULL, 0);  	/* Will always return 0 with map_multi == NULL */  	BUG_ON(ret < 0); @@ -1892,6 +1931,640 @@ out:  	return ret;  } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { +	struct rb_node node; +	struct old_sa_defrag_extent *old; +	u64 root_id; +	u64 inum; +	u64 file_pos; +	u64 extent_offset; +	u64 num_bytes; +	u64 generation; +}; + +struct old_sa_defrag_extent { +	struct list_head list; +	struct new_sa_defrag_extent *new; + +	u64 extent_offset; +	u64 bytenr; +	u64 offset; +	u64 len; +	int count; +}; + +struct new_sa_defrag_extent { +	struct rb_root root; +	struct list_head head; +	struct btrfs_path *path; +	struct inode *inode; +	u64 file_pos; +	u64 len; +	u64 bytenr; +	u64 disk_len; +	u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, +			struct sa_defrag_extent_backref *b2) +{ +	if (b1->root_id < b2->root_id) +		return -1; +	else if (b1->root_id > b2->root_id) +		return 1; + +	if (b1->inum < b2->inum) +		return -1; +	else if (b1->inum > b2->inum) +		return 1; + +	if (b1->file_pos < b2->file_pos) +		return -1; +	else if (b1->file_pos > b2->file_pos) +		return 1; + +	/* +	 * [------------------------------] ===> (a range of space) +	 *     |<--->|   |<---->| =============> (fs/file tree A) +	 * |<---------------------------->| ===> (fs/file tree B) +	 * +	 * A range of space can refer to two file extents in one tree while +	 * refer to only one file extent in another tree. 
+	 * +	 * So we may process a disk offset more than one time(two extents in A) +	 * and locate at the same extent(one extent in B), then insert two same +	 * backrefs(both refer to the extent in B). +	 */ +	return 0; +} + +static void backref_insert(struct rb_root *root, +			   struct sa_defrag_extent_backref *backref) +{ +	struct rb_node **p = &root->rb_node; +	struct rb_node *parent = NULL; +	struct sa_defrag_extent_backref *entry; +	int ret; + +	while (*p) { +		parent = *p; +		entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + +		ret = backref_comp(backref, entry); +		if (ret < 0) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	rb_link_node(&backref->node, parent, p); +	rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, +				       void *ctx) +{ +	struct btrfs_file_extent_item *extent; +	struct btrfs_fs_info *fs_info; +	struct old_sa_defrag_extent *old = ctx; +	struct new_sa_defrag_extent *new = old->new; +	struct btrfs_path *path = new->path; +	struct btrfs_key key; +	struct btrfs_root *root; +	struct sa_defrag_extent_backref *backref; +	struct extent_buffer *leaf; +	struct inode *inode = new->inode; +	int slot; +	int ret; +	u64 extent_offset; +	u64 num_bytes; + +	if (BTRFS_I(inode)->root->root_key.objectid == root_id && +	    inum == btrfs_ino(inode)) +		return 0; + +	key.objectid = root_id; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; + +	fs_info = BTRFS_I(inode)->root->fs_info; +	root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(root)) { +		if (PTR_ERR(root) == -ENOENT) +			return 0; +		WARN_ON(1); +		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", +			 inum, offset, root_id); +		return PTR_ERR(root); +	} + +	key.objectid = inum; +	key.type = BTRFS_EXTENT_DATA_KEY; +	if (offset > (u64)-1 << 32) +		key.offset = 0; +	else +		key.offset = offset; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) { +		WARN_ON(1); +		return ret; +	} + +	while (1) { +		cond_resched(); + +		leaf = path->nodes[0]; +		slot = path->slots[0]; + +		if (slot >= btrfs_header_nritems(leaf)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) { +				goto out; +			} else if (ret > 0) { +				ret = 0; +				goto out; +			} +			continue; +		} + +		path->slots[0]++; + +		btrfs_item_key_to_cpu(leaf, &key, slot); + +		if (key.objectid > inum) +			goto out; + +		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) +			continue; + +		extent = btrfs_item_ptr(leaf, slot, +					struct btrfs_file_extent_item); + +		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) +			continue; + +		extent_offset = btrfs_file_extent_offset(leaf, extent); +		if (key.offset - extent_offset != offset) +			continue; + +		num_bytes = btrfs_file_extent_num_bytes(leaf, extent); +		if (extent_offset >= old->extent_offset + old->offset + +		    old->len || extent_offset + num_bytes <= +		    old->extent_offset + old->offset) +			continue; + +		break; +	} + +	backref = kmalloc(sizeof(*backref), GFP_NOFS); +	if (!backref) { +		ret = -ENOENT; +		goto out; +	} + +	backref->root_id = root_id; +	backref->inum = inum; +	backref->file_pos = offset + extent_offset; +	backref->num_bytes = num_bytes; +	backref->extent_offset = extent_offset; +	backref->generation = btrfs_file_extent_generation(leaf, extent); +	backref->old = old; +	backref_insert(&new->root, backref); +	old->count++; +out: +	
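backref_comp() above orders the red-black tree of snapshot-aware-defrag backrefs by (root_id, inum, file_pos) and, as its comment explains, deliberately tolerates equal keys because the same disk range can be reached twice through different trees. A condensed but equivalent comparator, runnable stand-alone (struct name and fields trimmed to the three keys):

#include <stdint.h>
#include <stdio.h>

struct backref { uint64_t root_id, inum, file_pos; };

/* three-level lexicographic compare; 0 means "insert as duplicate" */
static int backref_comp(const struct backref *a, const struct backref *b)
{
	if (a->root_id != b->root_id)
		return a->root_id < b->root_id ? -1 : 1;
	if (a->inum != b->inum)
		return a->inum < b->inum ? -1 : 1;
	if (a->file_pos != b->file_pos)
		return a->file_pos < b->file_pos ? -1 : 1;
	return 0;
}

int main(void)
{
	struct backref a = { 5, 257, 0 }, b = { 5, 257, 4096 };

	printf("%d %d %d\n", backref_comp(&a, &b),	/* -1 */
	       backref_comp(&b, &a),			/*  1 */
	       backref_comp(&a, &a));			/*  0 */
	return 0;
}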
btrfs_release_path(path); +	WARN_ON(ret); +	return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, +				   struct new_sa_defrag_extent *new) +{ +	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; +	struct old_sa_defrag_extent *old, *tmp; +	int ret; + +	new->path = path; + +	list_for_each_entry_safe(old, tmp, &new->head, list) { +		ret = iterate_inodes_from_logical(old->bytenr, fs_info, +						  path, record_one_backref, +						  old); +		BUG_ON(ret < 0 && ret != -ENOENT); + +		/* no backref to be processed for this extent */ +		if (!old->count) { +			list_del(&old->list); +			kfree(old); +		} +	} + +	if (list_empty(&new->head)) +		return false; + +	return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, +			      struct btrfs_file_extent_item *fi, +			      u64 disk_bytenr) +{ +	if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) +		return 0; + +	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) +		return 0; + +	if (btrfs_file_extent_compression(leaf, fi) || +	    btrfs_file_extent_encryption(leaf, fi) || +	    btrfs_file_extent_other_encoding(leaf, fi)) +		return 0; + +	return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, +				 struct sa_defrag_extent_backref *prev, +				 struct sa_defrag_extent_backref *backref) +{ +	struct btrfs_file_extent_item *extent; +	struct btrfs_file_extent_item *item; +	struct btrfs_ordered_extent *ordered; +	struct btrfs_trans_handle *trans; +	struct btrfs_fs_info *fs_info; +	struct btrfs_root *root; +	struct btrfs_key key; +	struct extent_buffer *leaf; +	struct old_sa_defrag_extent *old = backref->old; +	struct new_sa_defrag_extent *new = old->new; +	struct inode *src_inode = new->inode; +	struct inode *inode; +	struct extent_state *cached = NULL; +	int ret = 0; +	u64 start; +	u64 len; +	u64 lock_start; +	u64 lock_end; +	bool merge = false; +	int index; + +	if (prev && prev->root_id == backref->root_id && +	    prev->inum == backref->inum && +	    prev->file_pos + prev->num_bytes == backref->file_pos) +		merge = true; + +	/* step 1: get root */ +	key.objectid = backref->root_id; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; + +	fs_info = BTRFS_I(src_inode)->root->fs_info; +	index = srcu_read_lock(&fs_info->subvol_srcu); + +	root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(root)) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		if (PTR_ERR(root) == -ENOENT) +			return 0; +		return PTR_ERR(root); +	} +	if (btrfs_root_refs(&root->root_item) == 0) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		/* parse ENOENT to 0 */ +		return 0; +	} + +	/* step 2: get inode */ +	key.objectid = backref->inum; +	key.type = BTRFS_INODE_ITEM_KEY; +	key.offset = 0; + +	inode = btrfs_iget(fs_info->sb, &key, root, NULL); +	if (IS_ERR(inode)) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		return 0; +	} + +	srcu_read_unlock(&fs_info->subvol_srcu, index); + +	/* step 3: relink backref */ +	lock_start = backref->file_pos; +	lock_end = backref->file_pos + backref->num_bytes - 1; +	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, +			 0, &cached); + +	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); +	if (ordered) { +		btrfs_put_ordered_extent(ordered); +		goto out_unlock; +	} + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out_unlock; +	} + +	key.objectid = 
backref->inum; +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = backref->file_pos; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) { +		goto out_free_path; +	} else if (ret > 0) { +		ret = 0; +		goto out_free_path; +	} + +	extent = btrfs_item_ptr(path->nodes[0], path->slots[0], +				struct btrfs_file_extent_item); + +	if (btrfs_file_extent_generation(path->nodes[0], extent) != +	    backref->generation) +		goto out_free_path; + +	btrfs_release_path(path); + +	start = backref->file_pos; +	if (backref->extent_offset < old->extent_offset + old->offset) +		start += old->extent_offset + old->offset - +			 backref->extent_offset; + +	len = min(backref->extent_offset + backref->num_bytes, +		  old->extent_offset + old->offset + old->len); +	len -= max(backref->extent_offset, old->extent_offset + old->offset); + +	ret = btrfs_drop_extents(trans, root, inode, start, +				 start + len, 1); +	if (ret) +		goto out_free_path; +again: +	key.objectid = btrfs_ino(inode); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = start; + +	if (merge) { +		struct btrfs_file_extent_item *fi; +		u64 extent_len; +		struct btrfs_key found_key; + +		ret = btrfs_search_slot(trans, root, &key, path, 1, 1); +		if (ret < 0) +			goto out_free_path; + +		path->slots[0]--; +		leaf = path->nodes[0]; +		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + +		fi = btrfs_item_ptr(leaf, path->slots[0], +				    struct btrfs_file_extent_item); +		extent_len = btrfs_file_extent_num_bytes(leaf, fi); + +		if (relink_is_mergable(leaf, fi, new->bytenr) && +		    extent_len + found_key.offset == start) { +			btrfs_set_file_extent_num_bytes(leaf, fi, +							extent_len + len); +			btrfs_mark_buffer_dirty(leaf); +			inode_add_bytes(inode, len); + +			ret = 1; +			goto out_free_path; +		} else { +			merge = false; +			btrfs_release_path(path); +			goto again; +		} +	} + +	ret = btrfs_insert_empty_item(trans, root, path, &key, +					sizeof(*extent)); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_free_path; +	} + +	leaf = path->nodes[0]; +	item = btrfs_item_ptr(leaf, path->slots[0], +				struct btrfs_file_extent_item); +	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); +	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); +	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); +	btrfs_set_file_extent_num_bytes(leaf, item, len); +	btrfs_set_file_extent_ram_bytes(leaf, item, new->len); +	btrfs_set_file_extent_generation(leaf, item, trans->transid); +	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); +	btrfs_set_file_extent_compression(leaf, item, new->compress_type); +	btrfs_set_file_extent_encryption(leaf, item, 0); +	btrfs_set_file_extent_other_encoding(leaf, item, 0); + +	btrfs_mark_buffer_dirty(leaf); +	inode_add_bytes(inode, len); + +	ret = btrfs_inc_extent_ref(trans, root, new->bytenr, +			new->disk_len, 0, +			backref->root_id, backref->inum, +			new->file_pos, 0);	/* start - extent_offset */ +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_free_path; +	} + +	ret = 1; +out_free_path: +	btrfs_release_path(path); +	btrfs_end_transaction(trans, root); +out_unlock: +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, +			     &cached, GFP_NOFS); +	iput(inode); +	return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ +	struct btrfs_path *path; +	struct old_sa_defrag_extent *old, *tmp; +	struct sa_defrag_extent_backref *backref; +	struct sa_defrag_extent_backref *prev 
= NULL; +	struct inode *inode; +	struct btrfs_root *root; +	struct rb_node *node; +	int ret; + +	inode = new->inode; +	root = BTRFS_I(inode)->root; + +	path = btrfs_alloc_path(); +	if (!path) +		return; + +	if (!record_extent_backrefs(path, new)) { +		btrfs_free_path(path); +		goto out; +	} +	btrfs_release_path(path); + +	while (1) { +		node = rb_first(&new->root); +		if (!node) +			break; +		rb_erase(node, &new->root); + +		backref = rb_entry(node, struct sa_defrag_extent_backref, node); + +		ret = relink_extent_backref(path, prev, backref); +		WARN_ON(ret < 0); + +		kfree(prev); + +		if (ret == 1) +			prev = backref; +		else +			prev = NULL; +		cond_resched(); +	} +	kfree(prev); + +	btrfs_free_path(path); + +	list_for_each_entry_safe(old, tmp, &new->head, list) { +		list_del(&old->list); +		kfree(old); +	} +out: +	atomic_dec(&root->fs_info->defrag_running); +	wake_up(&root->fs_info->transaction_wait); + +	kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, +			struct btrfs_ordered_extent *ordered) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_path *path; +	struct btrfs_key key; +	struct old_sa_defrag_extent *old, *tmp; +	struct new_sa_defrag_extent *new; +	int ret; + +	new = kmalloc(sizeof(*new), GFP_NOFS); +	if (!new) +		return NULL; + +	new->inode = inode; +	new->file_pos = ordered->file_offset; +	new->len = ordered->len; +	new->bytenr = ordered->start; +	new->disk_len = ordered->disk_len; +	new->compress_type = ordered->compress_type; +	new->root = RB_ROOT; +	INIT_LIST_HEAD(&new->head); + +	path = btrfs_alloc_path(); +	if (!path) +		goto out_kfree; + +	key.objectid = btrfs_ino(inode); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = new->file_pos; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out_free_path; +	if (ret > 0 && path->slots[0] > 0) +		path->slots[0]--; + +	/* find out all the old extents for the file range */ +	while (1) { +		struct btrfs_file_extent_item *extent; +		struct extent_buffer *l; +		int slot; +		u64 num_bytes; +		u64 offset; +		u64 end; +		u64 disk_bytenr; +		u64 extent_offset; + +		l = path->nodes[0]; +		slot = path->slots[0]; + +		if (slot >= btrfs_header_nritems(l)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				goto out_free_list; +			else if (ret > 0) +				break; +			continue; +		} + +		btrfs_item_key_to_cpu(l, &key, slot); + +		if (key.objectid != btrfs_ino(inode)) +			break; +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			break; +		if (key.offset >= new->file_pos + new->len) +			break; + +		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + +		num_bytes = btrfs_file_extent_num_bytes(l, extent); +		if (key.offset + num_bytes < new->file_pos) +			goto next; + +		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); +		if (!disk_bytenr) +			goto next; + +		extent_offset = btrfs_file_extent_offset(l, extent); + +		old = kmalloc(sizeof(*old), GFP_NOFS); +		if (!old) +			goto out_free_list; + +		offset = max(new->file_pos, key.offset); +		end = min(new->file_pos + new->len, key.offset + num_bytes); + +		old->bytenr = disk_bytenr; +		old->extent_offset = extent_offset; +		old->offset = offset - key.offset; +		old->len = end - offset; +		old->new = new; +		old->count = 0; +		list_add_tail(&old->list, &new->head); +next: +		path->slots[0]++; +		cond_resched(); +	} + +	btrfs_free_path(path); +	atomic_inc(&root->fs_info->defrag_running); + +	return new; + +out_free_list: +	list_for_each_entry_safe(old, tmp, &new->head, list) { 
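record_old_file_extents() above clamps each pre-existing file extent to its intersection with the range being defragmented: offset = max(new->file_pos, key.offset), end = min(new->file_pos + new->len, key.offset + num_bytes), with the result stored relative to the extent's own key offset. A small sketch of that interval intersection (names are simplified from the hunk above):

#include <stdint.h>
#include <stdio.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* trim extent [key_offset, key_offset + num_bytes) to the defrag
 * range [file_pos, file_pos + len) */
static void clamp_old_extent(uint64_t file_pos, uint64_t len,
			     uint64_t key_offset, uint64_t num_bytes,
			     uint64_t *rel_offset, uint64_t *out_len)
{
	uint64_t start = max64(file_pos, key_offset);
	uint64_t end = min64(file_pos + len, key_offset + num_bytes);

	*rel_offset = start - key_offset;	/* old->offset */
	*out_len = end - start;			/* old->len */
}

int main(void)
{
	uint64_t rel, n;

	/* extent [0, 192K) overlapping defrag range [128K, 256K) */
	clamp_old_extent(128 << 10, 128 << 10, 0, 192 << 10, &rel, &n);
	printf("offset=%llu len=%llu\n",	/* 131072 and 65536 */
	       (unsigned long long)rel, (unsigned long long)n);
	return 0;
}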
+		list_del(&old->list); +		kfree(old); +	} +out_free_path: +	btrfs_free_path(path); +out_kfree: +	kfree(new); +	return NULL; +} +  /*   * helper function for btrfs_finish_ordered_io, this   * just reads in some of the csum leaves to prime them into ram @@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  	struct btrfs_trans_handle *trans = NULL;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct extent_state *cached_state = NULL; +	struct new_sa_defrag_extent *new = NULL;  	int compress_type = 0;  	int ret;  	bool nolock; @@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  			 ordered_extent->file_offset + ordered_extent->len - 1,  			 0, &cached_state); +	ret = test_range_bit(io_tree, ordered_extent->file_offset, +			ordered_extent->file_offset + ordered_extent->len - 1, +			EXTENT_DEFRAG, 1, cached_state); +	if (ret) { +		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); +		if (last_snapshot >= BTRFS_I(inode)->generation) +			/* the inode is shared */ +			new = record_old_file_extents(inode, ordered_extent); + +		clear_extent_bit(io_tree, ordered_extent->file_offset, +			ordered_extent->file_offset + ordered_extent->len - 1, +			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); +	} +  	if (nolock)  		trans = btrfs_join_transaction_nolock(root);  	else @@ -2001,17 +2689,33 @@ out:  	if (trans)  		btrfs_end_transaction(trans, root); -	if (ret) +	if (ret) {  		clear_extent_uptodate(io_tree, ordered_extent->file_offset,  				      ordered_extent->file_offset +  				      ordered_extent->len - 1, NULL, GFP_NOFS); +		/* +		 * If the ordered extent had an IOERR or something else went +		 * wrong we need to return the space for this ordered extent +		 * back to the allocator. +		 */ +		if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && +		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) +			btrfs_free_reserved_extent(root, ordered_extent->start, +						   ordered_extent->disk_len); +	} + +  	/*  	 * This needs to be done to make sure anybody waiting knows we are done  	 * updating everything for this ordered extent.  	 */  	btrfs_remove_ordered_extent(inode, ordered_extent); +	/* for snapshot-aware defrag */ +	if (new) +		relink_file_extents(new); +  	/* once for us */  	btrfs_put_ordered_extent(ordered_extent);  	/* once for the tree */ @@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,  static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  			       struct extent_state *state, int mirror)  { -	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); +	size_t offset = start - page_offset(page);  	struct inode *inode = page->mapping->host;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	char *kaddr; @@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  	}  } -enum btrfs_orphan_cleanup_state { -	ORPHAN_CLEANUP_STARTED	= 1, -	ORPHAN_CLEANUP_DONE	= 2, -}; -  /*   * This is called in transaction commit time. 
If there are no orphan   * files in the subvolume, it removes orphan item and frees block_rsv @@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		 */  		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,  			&BTRFS_I(inode)->runtime_flags); +		atomic_inc(&root->orphan_inodes);  		/* if we have links, this was a truncate, lets do that */  		if (inode->i_nlink) { @@ -2478,7 +3178,21 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  				continue;  			}  			nr_truncate++; + +			/* 1 for the orphan item deletion. */ +			trans = btrfs_start_transaction(root, 1); +			if (IS_ERR(trans)) { +				ret = PTR_ERR(trans); +				goto out; +			} +			ret = btrfs_orphan_add(trans, inode); +			btrfs_end_transaction(trans, root); +			if (ret) +				goto out; +  			ret = btrfs_truncate(inode); +			if (ret) +				btrfs_orphan_del(NULL, inode);  		} else {  			nr_unlink++;  		} @@ -2697,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct btrfs_inode_item *item,  			    struct inode *inode)  { -	btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); -	btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); -	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); -	btrfs_set_inode_mode(leaf, item, inode->i_mode); -	btrfs_set_inode_nlink(leaf, item, inode->i_nlink); +	struct btrfs_map_token token; + +	btrfs_init_map_token(&token); -	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), -			       inode->i_atime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), -				inode->i_atime.tv_nsec); +	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); +	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); +	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, +				   &token); +	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); +	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); -	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), -			       inode->i_mtime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), -				inode->i_mtime.tv_nsec); +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), +				     inode->i_atime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), +				      inode->i_atime.tv_nsec, &token); -	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), -			       inode->i_ctime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), -				inode->i_ctime.tv_nsec); +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), +				     inode->i_mtime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), +				      inode->i_mtime.tv_nsec, &token); -	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); -	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); -	btrfs_set_inode_sequence(leaf, item, inode->i_version); -	btrfs_set_inode_transid(leaf, item, trans->transid); -	btrfs_set_inode_rdev(leaf, item, inode->i_rdev); -	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); -	btrfs_set_inode_block_group(leaf, item, 0); +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), +				     inode->i_ctime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), +				      inode->i_ctime.tv_nsec, &token); + +	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), +				     &token); +	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, +					 &token); +	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); +	
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); +	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); +	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); +	btrfs_set_token_inode_block_group(leaf, item, 0, &token);  }  /* @@ -3292,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	u64 extent_num_bytes = 0;  	u64 extent_offset = 0;  	u64 item_end = 0; -	u64 mask = root->sectorsize - 1;  	u32 found_type = (u8)-1;  	int found_extent;  	int del_item; @@ -3316,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	 * extent just the way it is.  	 */  	if (root->ref_cows || root == root->fs_info->tree_root) -		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); +		btrfs_drop_extent_cache(inode, ALIGN(new_size, +					root->sectorsize), (u64)-1, 0);  	/*  	 * This function is also used to drop the items in the log tree before @@ -3395,10 +4116,9 @@ search_again:  			if (!del_item) {  				u64 orig_num_bytes =  					btrfs_file_extent_num_bytes(leaf, fi); -				extent_num_bytes = new_size - -					found_key.offset + root->sectorsize - 1; -				extent_num_bytes = extent_num_bytes & -					~((u64)root->sectorsize - 1); +				extent_num_bytes = ALIGN(new_size - +						found_key.offset, +						root->sectorsize);  				btrfs_set_file_extent_num_bytes(leaf, fi,  							 extent_num_bytes);  				num_dec = (orig_num_bytes - @@ -3634,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  	struct extent_map *em = NULL;  	struct extent_state *cached_state = NULL;  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	u64 mask = root->sectorsize - 1; -	u64 hole_start = (oldsize + mask) & ~mask; -	u64 block_end = (size + mask) & ~mask; +	u64 hole_start = ALIGN(oldsize, root->sectorsize); +	u64 block_end = ALIGN(size, root->sectorsize);  	u64 last_byte;  	u64 cur_offset;  	u64 hole_size; @@ -3665,10 +4384,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  				block_end - cur_offset, 0);  		if (IS_ERR(em)) {  			err = PTR_ERR(em); +			em = NULL;  			break;  		}  		last_byte = min(extent_map_end(em), block_end); -		last_byte = (last_byte + mask) & ~mask; +		last_byte = ALIGN(last_byte, root->sectorsize);  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {  			struct extent_map *hole_em;  			hole_size = last_byte - cur_offset; @@ -3748,16 +4468,27 @@ next:  	return err;  } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans;  	loff_t oldsize = i_size_read(inode); +	loff_t newsize = attr->ia_size; +	int mask = attr->ia_valid;  	int ret;  	if (newsize == oldsize)  		return 0; +	/* +	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a +	 * special case where we need to update the times despite not having +	 * these flags set.  For all other operations the VFS sets these flags +	 * explicitly if it wants a timestamp update. 
+	 */ +	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) +		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); +  	if (newsize > oldsize) {  		truncate_pagecache(inode, oldsize, newsize);  		ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3783,9 +4514,40 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)  			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,  				&BTRFS_I(inode)->runtime_flags); +		/* +		 * 1 for the orphan item we're going to add +		 * 1 for the orphan item deletion. +		 */ +		trans = btrfs_start_transaction(root, 2); +		if (IS_ERR(trans)) +			return PTR_ERR(trans); + +		/* +		 * We need to do this in case we fail at _any_ point during the +		 * actual truncate.  Once we do the truncate_setsize we could +		 * invalidate pages which forces any outstanding ordered io to +		 * be instantly completed which will give us extents that need +		 * to be truncated.  If we fail to get an orphan inode down we +		 * could have left over extents that were never meant to live, +		 * so we need to guarantee from this point on that everything +		 * will be consistent. +		 */ +		ret = btrfs_orphan_add(trans, inode); +		btrfs_end_transaction(trans, root); +		if (ret) +			return ret; +  		/* we don't support swapfiles, so vmtruncate shouldn't fail */  		truncate_setsize(inode, newsize); + +		/* Disable nonlocked read DIO to avoid the endless truncate */ +		btrfs_inode_block_unlocked_dio(inode); +		inode_dio_wait(inode); +		btrfs_inode_resume_unlocked_dio(inode); +  		ret = btrfs_truncate(inode); +		if (ret && inode->i_nlink) +			btrfs_orphan_del(NULL, inode);  	}  	return ret; @@ -3805,7 +4567,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)  		return err;  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { -		err = btrfs_setsize(inode, attr->ia_size); +		err = btrfs_setsize(inode, attr);  		if (err)  			return err;  	} @@ -3855,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)  		goto no_delete;  	} +	ret = btrfs_commit_inode_delayed_inode(inode); +	if (ret) { +		btrfs_orphan_del(NULL, inode); +		goto no_delete; +	} +  	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);  	if (!rsv) {  		btrfs_orphan_del(NULL, inode); @@ -3892,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)  			goto no_delete;  		} -		trans = btrfs_start_transaction_lflush(root, 1); +		trans = btrfs_join_transaction(root);  		if (IS_ERR(trans)) {  			btrfs_orphan_del(NULL, inode);  			btrfs_free_block_rsv(root, rsv); @@ -3906,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)  			break;  		trans->block_rsv = &root->fs_info->trans_block_rsv; -		ret = btrfs_update_inode(trans, root, inode); -		BUG_ON(ret); -  		btrfs_end_transaction(trans, root);  		trans = NULL;  		btrfs_btree_balance_dirty(root); @@ -4262,16 +5027,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  	if (dentry->d_name.len > BTRFS_NAME_LEN)  		return ERR_PTR(-ENAMETOOLONG); -	if (unlikely(d_need_lookup(dentry))) { -		memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); -		kfree(dentry->d_fsdata); -		dentry->d_fsdata = NULL; -		/* This thing is hashed, drop it for now */ -		d_drop(dentry); -	} else { -		ret = btrfs_inode_by_name(dir, dentry, &location); -	} - +	ret = btrfs_inode_by_name(dir, dentry, &location);  	if (ret < 0)  		return ERR_PTR(ret); @@ -4341,11 +5097,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,  	struct dentry *ret;  	ret = d_splice_alias(btrfs_lookup_dentry(dir, 
dentry), dentry); -	if (unlikely(d_need_lookup(dentry))) { -		spin_lock(&dentry->d_lock); -		dentry->d_flags &= ~DCACHE_NEED_LOOKUP; -		spin_unlock(&dentry->d_lock); -	}  	return ret;  } @@ -4356,7 +5107,7 @@ unsigned char btrfs_filetype_table[] = {  static int btrfs_real_readdir(struct file *filp, void *dirent,  			      filldir_t filldir)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_item *item;  	struct btrfs_dir_item *di; @@ -4819,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  		if (btrfs_test_opt(root, NODATASUM))  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;  		if (btrfs_test_opt(root, NODATACOW)) -			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; +			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | +				BTRFS_INODE_NODATASUM;  	}  	insert_inode_hash(inode); @@ -4971,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  		goto out_unlock;  	} -	err = btrfs_update_inode(trans, root, inode); -	if (err) { -		drop_inode = 1; -		goto out_unlock; -	} -  	/*  	* If the active LSM wants to access the inode during  	* d_instantiate it needs these. Smack checks to see @@ -5361,8 +6107,7 @@ again:  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		size_t size;  		size = btrfs_file_extent_inline_len(leaf, item); -		extent_end = (extent_start + size + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); +		extent_end = ALIGN(extent_start + size, root->sectorsize);  	}  	if (start >= extent_end) { @@ -5434,8 +6179,7 @@ again:  		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,  				size - extent_offset);  		em->start = extent_start + extent_offset; -		em->len = (copy_size + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); +		em->len = ALIGN(copy_size, root->sectorsize);  		em->orig_block_len = em->len;  		em->orig_start = em->start;  		if (compress_type) { @@ -5586,10 +6330,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag  		return em;  	if (em) {  		/* -		 * if our em maps to a hole, there might -		 * actually be delalloc bytes behind it +		 * if our em maps to +		 * -  a hole or +		 * -  a pre-alloc extent, +		 * there might actually be delalloc bytes behind it.  		 
*/ -		if (em->block_start != EXTENT_MAP_HOLE) +		if (em->block_start != EXTENT_MAP_HOLE && +		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))  			return em;  		else  			hole_em = em; @@ -5671,6 +6418,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag  			 */  			em->block_start = hole_em->block_start;  			em->block_len = hole_len; +			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) +				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);  		} else {  			em->start = range_start;  			em->len = found; @@ -5909,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,  	em->start = start;  	em->orig_start = orig_start; +	em->mod_start = start; +	em->mod_len = len;  	em->len = len;  	em->block_len = block_len;  	em->block_start = block_start; @@ -5950,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	u64 len = bh_result->b_size;  	struct btrfs_trans_handle *trans;  	int unlock_bits = EXTENT_LOCKED; -	int ret; +	int ret = 0; -	if (create) { -		ret = btrfs_delalloc_reserve_space(inode, len); -		if (ret) -			return ret; +	if (create)  		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; -	} else { +	else  		len = min_t(u64, len, root->sectorsize); -	}  	lockstart = start;  	lockend = start + len - 1; @@ -5971,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))  		return -ENOTBLK; -	if (create) { -		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -				     lockend, EXTENT_DELALLOC, NULL, -				     &cached_state, GFP_NOFS); -		if (ret) -			goto unlock_err; -	} -  	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);  	if (IS_ERR(em)) {  		ret = PTR_ERR(em); @@ -6010,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	if (!create && (em->block_start == EXTENT_MAP_HOLE ||  			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {  		free_extent_map(em); -		ret = 0;  		goto unlock_err;  	} @@ -6108,6 +6846,15 @@ unlock:  		 */  		if (start + len > i_size_read(inode))  			i_size_write(inode, start + len); + +		spin_lock(&BTRFS_I(inode)->lock); +		BTRFS_I(inode)->outstanding_extents++; +		spin_unlock(&BTRFS_I(inode)->lock); + +		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, +				     lockstart + len - 1, EXTENT_DELALLOC, NULL, +				     &cached_state, GFP_NOFS); +		BUG_ON(ret);  	}  	/* @@ -6116,24 +6863,9 @@ unlock:  	 * aren't using if there is any left over space.  	 */  	if (lockstart < lockend) { -		if (create && len < lockend - lockstart) { -			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -					 lockstart + len - 1, -					 unlock_bits | EXTENT_DEFRAG, 1, 0, -					 &cached_state, GFP_NOFS); -			/* -			 * Beside unlock, we also need to cleanup reserved space -			 * for the left range by attaching EXTENT_DO_ACCOUNTING. 
-			 */ -			clear_extent_bit(&BTRFS_I(inode)->io_tree, -					 lockstart + len, lockend, -					 unlock_bits | EXTENT_DO_ACCOUNTING | -					 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); -		} else { -			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -					 lockend, unlock_bits, 1, 0, -					 &cached_state, GFP_NOFS); -		} +		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, +				 lockend, unlock_bits, 1, 0, +				 &cached_state, GFP_NOFS);  	} else {  		free_extent_state(cached_state);  	} @@ -6143,9 +6875,6 @@ unlock:  	return 0;  unlock_err: -	if (create) -		unlock_bits |= EXTENT_DO_ACCOUNTING; -  	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,  			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);  	return ret; @@ -6386,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  	int async_submit = 0;  	map_length = orig_bio->bi_size; -	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, +	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,  			      &map_length, NULL, 0);  	if (ret) {  		bio_put(orig_bio);  		return -EIO;  	} -  	if (map_length >= orig_bio->bi_size) {  		bio = orig_bio;  		goto submit;  	} -	async_submit = 1; +	/* async crcs make it difficult to collect full stripe writes. */ +	if (btrfs_get_alloc_profile(root, 1) & +	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) +		async_submit = 0; +	else +		async_submit = 1; +  	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);  	if (!bio)  		return -ENOMEM; @@ -6440,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  			bio->bi_end_io = btrfs_end_dio_bio;  			map_length = orig_bio->bi_size; -			ret = btrfs_map_block(root->fs_info, READ, +			ret = btrfs_map_block(root->fs_info, rw,  					      start_sector << 9,  					      &map_length, NULL, 0);  			if (ret) { @@ -6583,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; +	size_t count = 0; +	int flags = 0; +	bool wakeup = true; +	bool relock = false; +	ssize_t ret;  	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,  			    offset, nr_segs))  		return 0; -	return __blockdev_direct_IO(rw, iocb, inode, -		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, -		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, -		   btrfs_submit_direct, 0); +	atomic_inc(&inode->i_dio_count); +	smp_mb__after_atomic_inc(); + +	if (rw & WRITE) { +		count = iov_length(iov, nr_segs); +		/* +		 * If the write DIO is beyond the EOF, we need to update +		 * the isize, but it is protected by i_mutex. So we can +		 * not unlock the i_mutex in this case. 
+		 */ +		if (offset + count <= inode->i_size) { +			mutex_unlock(&inode->i_mutex); +			relock = true; +		} +		ret = btrfs_delalloc_reserve_space(inode, count); +		if (ret) +			goto out; +	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +				     &BTRFS_I(inode)->runtime_flags))) { +		inode_dio_done(inode); +		flags = DIO_LOCKING | DIO_SKIP_HOLES; +		wakeup = false; +	} + +	ret = __blockdev_direct_IO(rw, iocb, inode, +			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, +			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, +			btrfs_submit_direct, flags); +	if (rw & WRITE) { +		if (ret < 0 && ret != -EIOCBQUEUED) +			btrfs_delalloc_release_space(inode, count); +		else if (ret >= 0 && (size_t)ret < count) +			btrfs_delalloc_release_space(inode, +						     count - (size_t)ret); +		else +			btrfs_delalloc_release_metadata(inode, 0); +	} +out: +	if (wakeup) +		inode_dio_done(inode); +	if (relock) +		mutex_lock(&inode->i_mutex); + +	return ret;  }  #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC) @@ -6695,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  		return;  	}  	lock_extent_bits(tree, page_start, page_end, 0, &cached_state); -	ordered = btrfs_lookup_ordered_extent(inode, -					   page_offset(page)); +	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));  	if (ordered) {  		/*  		 * IO on this page will never be started, so we need @@ -6751,7 +7529,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = fdentry(vma->vm_file)->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_ordered_extent *ordered; @@ -6929,11 +7707,9 @@ static int btrfs_truncate(struct inode *inode)  	/*  	 * 1 for the truncate slack space -	 * 1 for the orphan item we're going to add -	 * 1 for the orphan item deletion  	 * 1 for updating the inode.  	 */ -	trans = btrfs_start_transaction(root, 4); +	trans = btrfs_start_transaction(root, 2);  	if (IS_ERR(trans)) {  		err = PTR_ERR(trans);  		goto out; @@ -6944,12 +7720,6 @@ static int btrfs_truncate(struct inode *inode)  				      min_size);  	BUG_ON(ret); -	ret = btrfs_orphan_add(trans, inode); -	if (ret) { -		btrfs_end_transaction(trans, root); -		goto out; -	} -  	/*  	 * setattr is responsible for setting the ordered_data_close flag,  	 * but that is only tested during the last file release.  That @@ -7018,12 +7788,6 @@ static int btrfs_truncate(struct inode *inode)  		ret = btrfs_orphan_del(trans, inode);  		if (ret)  			err = ret; -	} else if (ret && inode->i_nlink > 0) { -		/* -		 * Failed to do the truncate, remove us from the in memory -		 * orphan list. 
-		 */ -		ret = btrfs_orphan_del(NULL, inode);  	}  	if (trans) { @@ -7190,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; +	/* the snap/subvol tree is being deleted */  	if (btrfs_root_refs(&root->root_item) == 0 && -	    !btrfs_is_free_space_inode(inode) +	    root != root->fs_info->tree_root)  		return 1;  	else  		return generic_drop_inode(inode); @@ -7273,40 +8038,22 @@ fail:  static int btrfs_getattr(struct vfsmount *mnt,  			 struct dentry *dentry, struct kstat *stat)  { +	u64 delalloc_bytes;  	struct inode *inode = dentry->d_inode;  	u32 blocksize = inode->i_sb->s_blocksize;  	generic_fillattr(inode, stat);  	stat->dev = BTRFS_I(inode)->root->anon_dev;  	stat->blksize = PAGE_CACHE_SIZE; + +	spin_lock(&BTRFS_I(inode)->lock); +	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; +	spin_unlock(&BTRFS_I(inode)->lock);  	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + -		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; +			ALIGN(delalloc_bytes, blocksize)) >> 9;  	return 0;  } -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. - */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ -	struct btrfs_inode *b_dir = BTRFS_I(dir); -	struct btrfs_inode *b_inode = BTRFS_I(inode); - -	if (b_dir->flags & BTRFS_INODE_NODATACOW) -		b_inode->flags |= BTRFS_INODE_NODATACOW; -	else -		b_inode->flags &= ~BTRFS_INODE_NODATACOW; - -	if (b_dir->flags & BTRFS_INODE_COMPRESS) { -		b_inode->flags |= BTRFS_INODE_COMPRESS; -		b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; -	} else { -		b_inode->flags &= ~(BTRFS_INODE_COMPRESS | -				    BTRFS_INODE_NOCOMPRESS); -	} -} -  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  			   struct inode *new_dir, struct dentry *new_dentry)  { @@ -7472,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  		}  	} -	fixup_inode_flags(new_dir, old_inode); -  	ret = btrfs_add_link(trans, new_dir, old_inode,  			     new_dentry->d_name.name,  			     new_dentry->d_name.len, 0, index); @@ -7545,41 +8290,57 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)   */  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  { -	struct list_head *head = &root->fs_info->delalloc_inodes;  	struct btrfs_inode *binode;  	struct inode *inode;  	struct btrfs_delalloc_work *work, *next;  	struct list_head works; +	struct list_head splice;  	int ret = 0;  	if (root->fs_info->sb->s_flags & MS_RDONLY)  		return -EROFS;  	INIT_LIST_HEAD(&works); +	INIT_LIST_HEAD(&splice);  	spin_lock(&root->fs_info->delalloc_lock); -	while (!list_empty(head)) { -		binode = list_entry(head->next, struct btrfs_inode, +	list_splice_init(&root->fs_info->delalloc_inodes, &splice); +	while (!list_empty(&splice)) { +		binode = list_entry(splice.next, struct btrfs_inode,  				    delalloc_inodes); + +		list_del_init(&binode->delalloc_inodes); +  		inode = igrab(&binode->vfs_inode); -		if (!inode) -			list_del_init(&binode->delalloc_inodes); +		if (!inode) { +			clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, +				  &binode->runtime_flags); +			continue; +		} + +		list_add_tail(&binode->delalloc_inodes, +			      &root->fs_info->delalloc_inodes);  		spin_unlock(&root->fs_info->delalloc_lock); -		if (inode) { -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); -			if (!work) { -				ret = -ENOMEM; -				goto out; -			} -			list_add_tail(&work->list, &works); -			
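/*
 * The btrfs_start_delalloc_inodes() hunk above stops iterating the shared
 * fs_info->delalloc_inodes list in place.  It splices the whole list onto a
 * private head (list_splice_init) while delalloc_lock is held, walks the
 * private copy, and requeues each entry it is still working on, so writers
 * can keep adding inodes without racing the walker and nothing is lost if
 * the walk bails out.  A small userspace sketch of the splice pattern, with
 * a pthread mutex standing in for the spinlock and the requeue/error paths
 * of the real code left out:
 */
#include <pthread.h>
#include <stdio.h>

struct node { int val; struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared;	/* producers prepend under the lock */

static void drain(void)
{
	struct node *splice, *n;

	/* take the entire list in O(1) while holding the lock ... */
	pthread_mutex_lock(&lock);
	splice = shared;
	shared = NULL;
	pthread_mutex_unlock(&lock);

	/* ... then do the slow per-entry work with the lock dropped */
	for (n = splice; n; n = n->next)
		printf("flushing inode %d\n", n->val);
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

	shared = &a;
	drain();
	return 0;
}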
btrfs_queue_worker(&root->fs_info->flush_workers, -					   &work->work); + +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); +		if (unlikely(!work)) { +			ret = -ENOMEM; +			goto out;  		} +		list_add_tail(&work->list, &works); +		btrfs_queue_worker(&root->fs_info->flush_workers, +				   &work->work); +  		cond_resched();  		spin_lock(&root->fs_info->delalloc_lock);  	}  	spin_unlock(&root->fs_info->delalloc_lock); +	list_for_each_entry_safe(work, next, &works, list) { +		list_del_init(&work->list); +		btrfs_wait_and_free_delalloc_work(work); +	} +  	/* the filemap_flush will queue IO into the worker threads, but  	 * we have to make sure the IO is actually started and that  	 * ordered extents get created before we return @@ -7592,11 +8353,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));  	}  	atomic_dec(&root->fs_info->async_submit_draining); +	return 0;  out:  	list_for_each_entry_safe(work, next, &works, list) {  		list_del_init(&work->list);  		btrfs_wait_and_free_delalloc_work(work);  	} + +	if (!list_empty_careful(&splice)) { +		spin_lock(&root->fs_info->delalloc_lock); +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes); +		spin_unlock(&root->fs_info->delalloc_lock); +	}  	return ret;  } @@ -7748,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  			}  		} -		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, -					   0, *alloc_hint, &ins, 1); +		ret = btrfs_reserve_extent(trans, root, +					   min(num_bytes, 256ULL * 1024 * 1024), +					   min_size, 0, *alloc_hint, &ins, 1);  		if (ret) {  			if (own_trans)  				btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4b4516770f0..c83086fdda0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -42,12 +42,12 @@  #include <linux/slab.h>  #include <linux/blkdev.h>  #include <linux/uuid.h> +#include <linux/btrfs.h>  #include "compat.h"  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h"  #include "btrfs_inode.h" -#include "ioctl.h"  #include "print-tree.h"  #include "volumes.h"  #include "locking.h" @@ -152,7 +152,7 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)  static int btrfs_ioctl_getflags(struct file *file, void __user *arg)  { -	struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); +	struct btrfs_inode *ip = BTRFS_I(file_inode(file));  	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);  	if (copy_to_user(arg, &flags, sizeof(flags))) @@ -177,7 +177,7 @@ static int check_flags(unsigned int flags)  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_inode *ip = BTRFS_I(inode);  	struct btrfs_root *root = ip->root;  	struct btrfs_trans_handle *trans; @@ -310,7 +310,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  static int btrfs_ioctl_getversion(struct file *file, int __user *arg)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	return put_user(inode->i_generation, arg);  } @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)  	return 0;  } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int create_subvol(struct inode *dir,  				  struct dentry *dentry,  				  char *name, int namelen,  				  u64 *async_transid, 
-				  struct btrfs_qgroup_inherit **inherit) +				  struct btrfs_qgroup_inherit *inherit)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_key key;  	struct btrfs_root_item root_item;  	struct btrfs_inode_item *inode_item;  	struct extent_buffer *leaf; +	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_root *new_root; -	struct dentry *parent = dentry->d_parent; -	struct inode *dir; +	struct btrfs_block_rsv block_rsv;  	struct timespec cur_time = CURRENT_TIME;  	int ret;  	int err;  	u64 objectid;  	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;  	u64 index = 0; +	u64 qgroup_reserved;  	uuid_le new_uuid;  	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);  	if (ret)  		return ret; -	dir = parent->d_inode; - +	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);  	/* -	 * 1 - inode item -	 * 2 - refs -	 * 1 - root item -	 * 2 - dir items +	 * The same as the snapshot creation, please see the comment +	 * of create_snapshot().  	 */ -	trans = btrfs_start_transaction(root, 6); -	if (IS_ERR(trans)) -		return PTR_ERR(trans); +	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, +					       7, &qgroup_reserved); +	if (ret) +		return ret; -	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, -				   inherit ? *inherit : NULL); +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out; +	} +	trans->block_rsv = &block_rsv; +	trans->bytes_reserved = block_rsv.size; + +	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);  	if (ret)  		goto fail; @@ -515,8 +521,9 @@ static noinline int create_subvol(struct btrfs_root *root,  	BUG_ON(ret); -	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));  fail: +	trans->block_rsv = NULL; +	trans->bytes_reserved = 0;  	if (async_transid) {  		*async_transid = trans->transid;  		err = btrfs_commit_transaction_async(trans, root, 1); @@ -525,12 +532,18 @@ fail:  	}  	if (err && !ret)  		ret = err; + +	if (!ret) +		d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); +out: +	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);  	return ret;  } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, -			   char *name, int namelen, u64 *async_transid, -			   bool readonly, struct btrfs_qgroup_inherit **inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, +			   struct dentry *dentry, char *name, int namelen, +			   u64 *async_transid, bool readonly, +			   struct btrfs_qgroup_inherit *inherit)  {  	struct inode *inode;  	struct btrfs_pending_snapshot *pending_snapshot; @@ -546,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,  	btrfs_init_block_rsv(&pending_snapshot->block_rsv,  			     BTRFS_BLOCK_RSV_TEMP); +	/* +	 * 1 - parent dir inode +	 * 2 - dir entries +	 * 1 - root item +	 * 2 - root ref/backref +	 * 1 - root of snapshot +	 */ +	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, +					&pending_snapshot->block_rsv, 7, +					&pending_snapshot->qgroup_reserved); +	if (ret) +		goto out; +  	pending_snapshot->dentry = dentry;  	pending_snapshot->root = root;  	pending_snapshot->readonly = readonly; -	if (inherit) { -		pending_snapshot->inherit = *inherit; -		*inherit = NULL;	/* take responsibility to free it */ -	} +	pending_snapshot->dir = dir; +	pending_snapshot->inherit = inherit; -	trans = btrfs_start_transaction(root->fs_info->extent_root, 6); +	trans = btrfs_start_transaction(root, 0);  	if (IS_ERR(trans)) {  		ret = 
PTR_ERR(trans);  		goto fail;  	} -	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); -	BUG_ON(ret); -  	spin_lock(&root->fs_info->trans_lock);  	list_add(&pending_snapshot->list,  		 &trans->transaction->pending_snapshots); @@ -599,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,  	d_instantiate(dentry, inode);  	ret = 0;  fail: +	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, +					 &pending_snapshot->block_rsv, +					 pending_snapshot->qgroup_reserved); +out:  	kfree(pending_snapshot);  	return ret;  } @@ -692,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,  				   char *name, int namelen,  				   struct btrfs_root *snap_src,  				   u64 *async_transid, bool readonly, -				   struct btrfs_qgroup_inherit **inherit) +				   struct btrfs_qgroup_inherit *inherit)  {  	struct inode *dir  = parent->dentry->d_inode;  	struct dentry *dentry; @@ -729,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,  		goto out_up_read;  	if (snap_src) { -		error = create_snapshot(snap_src, dentry, name, namelen, +		error = create_snapshot(snap_src, dir, dentry, name, namelen,  					async_transid, readonly, inherit);  	} else { -		error = create_subvol(BTRFS_I(dir)->root, dentry, -				      name, namelen, async_transid, inherit); +		error = create_subvol(dir, dentry, name, namelen, +				      async_transid, inherit);  	}  	if (!error)  		fsnotify_mkdir(dir, dentry); @@ -815,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,  	while(1) {  		ret = btrfs_search_forward(root, &min_key, &max_key, -					   path, 0, newer_than); +					   path, newer_than);  		if (ret != 0)  			goto none;  		if (min_key.objectid != ino) @@ -1203,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		if (!(inode->i_sb->s_flags & MS_ACTIVE))  			break; +		if (btrfs_defrag_cancelled(root->fs_info)) { +			printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); +			ret = -EAGAIN; +			break; +		} +  		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,  					 extent_thresh, &last_len, &skip,  					 &defrag_end, range->flags & @@ -1317,7 +1348,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	u64 new_size;  	u64 old_size;  	u64 devid = 1; -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_vol_args *vol_args;  	struct btrfs_trans_handle *trans;  	struct btrfs_device *device = NULL; @@ -1326,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	int ret = 0;  	int mod = 0; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; -  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -1339,7 +1367,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,  			1)) {  		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); -		return -EINPROGRESS; +		mnt_drop_write_file(file); +		return -EINVAL;  	}  	mutex_lock(&root->fs_info->volume_mutex); @@ -1359,21 +1388,27 @@ static noinline int btrfs_ioctl_resize(struct file *file,  		*devstr = '\0';  		devstr = vol_args->name;  		devid = simple_strtoull(devstr, &end, 10); +		if (!devid) { +			ret = -EINVAL; +			goto out_free; +		}  		printk(KERN_INFO "btrfs: resizing devid %llu\n",  		       (unsigned long long)devid);  	} +  	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);  	if (!device) {  		printk(KERN_INFO "btrfs: 
resizer unable to find device %llu\n",  		       (unsigned long long)devid); -		ret = -EINVAL; +		ret = -ENODEV;  		goto out_free;  	} -	if (device->fs_devices && device->fs_devices->seeding) { + +	if (!device->writeable) {  		printk(KERN_INFO "btrfs: resizer unable to apply on " -		       "seeding device %llu\n", +		       "readonly device %llu\n",  		       (unsigned long long)devid); -		ret = -EINVAL; +		ret = -EPERM;  		goto out_free;  	} @@ -1395,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	}  	if (device->is_tgtdev_for_dev_replace) { -		ret = -EINVAL; +		ret = -EPERM;  		goto out_free;  	} @@ -1443,15 +1478,15 @@ out_free:  	kfree(vol_args);  out:  	mutex_unlock(&root->fs_info->volume_mutex); -	mnt_drop_write_file(file);  	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +	mnt_drop_write_file(file);  	return ret;  }  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  				char *name, unsigned long fd, int subvol,  				u64 *transid, bool readonly, -				struct btrfs_qgroup_inherit **inherit) +				struct btrfs_qgroup_inherit *inherit)  {  	int namelen;  	int ret = 0; @@ -1483,8 +1518,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  			goto out_drop_write;  		} -		src_inode = src.file->f_path.dentry->d_inode; -		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { +		src_inode = file_inode(src.file); +		if (src_inode->i_sb != file_inode(file)->i_sb) {  			printk(KERN_INFO "btrfs: Snapshot src from "  			       "another FS\n");  			ret = -EINVAL; @@ -1560,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,  	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,  					      vol_args->fd, subvol, ptr, -					      readonly, &inherit); +					      readonly, inherit);  	if (ret == 0 && ptr &&  	    copy_to_user(arg + @@ -1576,7 +1611,7 @@ out:  static noinline int btrfs_ioctl_subvol_getflags(struct file *file,  						void __user *arg)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret = 0;  	u64 flags = 0; @@ -1598,7 +1633,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,  static noinline int btrfs_ioctl_subvol_setflags(struct file *file,  					      void __user *arg)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans;  	u64 root_flags; @@ -1857,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,  	path->keep_locks = 1;  	while(1) { -		ret = btrfs_search_forward(root, &key, &max_key, path, 0, +		ret = btrfs_search_forward(root, &key, &max_key, path,  					   sk->min_transid);  		if (ret != 0) {  			if (ret > 0) @@ -1892,7 +1927,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,  	if (IS_ERR(args))  		return PTR_ERR(args); -	inode = fdentry(file)->d_inode; +	inode = file_inode(file);  	ret = search_ioctl(inode, args);  	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))  		ret = -EFAULT; @@ -2002,7 +2037,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,  	if (IS_ERR(args))  		return PTR_ERR(args); -	inode = fdentry(file)->d_inode; +	inode = file_inode(file);  	if (args->treeid == 0)  		args->treeid = BTRFS_I(inode)->root->root_key.objectid; @@ -2029,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	struct btrfs_root *dest = NULL;  	
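/*
 * btrfs_ioctl_snap_destroy(), whose reworked body follows, is reached from
 * userspace via BTRFS_IOC_SNAP_DESTROY on a file descriptor for the
 * snapshot's parent directory, with the snapshot's name passed in
 * btrfs_ioctl_vol_args.  A minimal caller sketch; the ABI definitions are
 * copied from the ioctl.h shown removed at the end of this diff (after this
 * series userspace picks them up from <linux/btrfs.h> instead):
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_PATH_NAME_MAX 4087
struct btrfs_ioctl_vol_args {
	long long fd;			/* __s64; unused by snap_destroy */
	char name[BTRFS_PATH_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_DESTROY \
	_IOW(BTRFS_IOCTL_MAGIC, 15, struct btrfs_ioctl_vol_args)

int main(int argc, char **argv)
{
	struct btrfs_ioctl_vol_args args;
	int dirfd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <parent-dir> <snapshot-name>\n",
			argv[0]);
		return 1;
	}
	dirfd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (dirfd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	strncpy(args.name, argv[2], BTRFS_PATH_NAME_MAX);
	if (ioctl(dirfd, BTRFS_IOC_SNAP_DESTROY, &args) < 0) {
		perror("BTRFS_IOC_SNAP_DESTROY");
		close(dirfd);
		return 1;
	}
	close(dirfd);
	return 0;
}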
struct btrfs_ioctl_vol_args *vol_args;  	struct btrfs_trans_handle *trans; +	struct btrfs_block_rsv block_rsv; +	u64 qgroup_reserved;  	int namelen;  	int ret;  	int err = 0; @@ -2095,13 +2132,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);  		if (err)  			goto out_dput; - -		/* check if subvolume may be deleted by a non-root user */ -		err = btrfs_may_delete(dir, dentry, 1); -		if (err) -			goto out_dput;  	} +	/* check if subvolume may be deleted by a user */ +	err = btrfs_may_delete(dir, dentry, 1); +	if (err) +		goto out_dput; +  	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {  		err = -EINVAL;  		goto out_dput; @@ -2118,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	if (err)  		goto out_up_write; +	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); +	/* +	 * One for dir inode, two for dir entries, two for root +	 * ref/backref. +	 */ +	err = btrfs_subvolume_reserve_metadata(root, &block_rsv, +					       5, &qgroup_reserved); +	if (err) +		goto out_up_write; +  	trans = btrfs_start_transaction(root, 0);  	if (IS_ERR(trans)) {  		err = PTR_ERR(trans); -		goto out_up_write; +		goto out_release;  	} -	trans->block_rsv = &root->fs_info->global_block_rsv; +	trans->block_rsv = &block_rsv; +	trans->bytes_reserved = block_rsv.size;  	ret = btrfs_unlink_subvol(trans, root, dir,  				dest->root_key.objectid, @@ -2153,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  		}  	}  out_end_trans: +	trans->block_rsv = NULL; +	trans->bytes_reserved = 0;  	ret = btrfs_end_transaction(trans, root);  	if (ret && !err)  		err = ret;  	inode->i_flags |= S_DEAD; +out_release: +	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);  out_up_write:  	up_write(&root->fs_info->subvol_sem);  out_unlock: @@ -2165,6 +2217,12 @@ out_unlock:  		shrink_dcache_sb(root->fs_info->sb);  		btrfs_invalidate_inodes(dest);  		d_delete(dentry); + +		/* the last ref */ +		if (dest->cache_inode) { +			iput(dest->cache_inode); +			dest->cache_inode = NULL; +		}  	}  out_dput:  	dput(dentry); @@ -2178,24 +2236,25 @@ out:  static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_ioctl_defrag_range_args *range;  	int ret; -	if (btrfs_root_readonly(root)) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret;  	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,  			1)) {  		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); -		return -EINPROGRESS; +		mnt_drop_write_file(file); +		return -EINVAL;  	} -	ret = mnt_want_write_file(file); -	if (ret) { -		atomic_set(&root->fs_info->mutually_exclusive_operation_running, -			   0); -		return ret; + +	if (btrfs_root_readonly(root)) { +		ret = -EROFS; +		goto out;  	}  	switch (inode->i_mode & S_IFMT) { @@ -2204,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  			ret = -EPERM;  			goto out;  		} -		ret = btrfs_defrag_root(root, 0); +		ret = btrfs_defrag_root(root);  		if (ret)  			goto out; -		ret = btrfs_defrag_root(root->fs_info->extent_root, 0); +		ret = btrfs_defrag_root(root->fs_info->extent_root);  		break;  	case S_IFREG:  		if (!(file->f_mode & FMODE_WRITE)) { @@ -2237,7 +2296,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  		
	/* the rest are all set to zero by kzalloc */  			range->len = (u64)-1;  		} -		ret = btrfs_defrag_file(fdentry(file)->d_inode, file, +		ret = btrfs_defrag_file(file_inode(file), file,  					range, 0, 0);  		if (ret > 0)  			ret = 0; @@ -2247,8 +2306,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  		ret = -EINVAL;  	}  out: -	mnt_drop_write_file(file);  	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +	mnt_drop_write_file(file);  	return ret;  } @@ -2263,7 +2322,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)  	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,  			1)) {  		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); -		return -EINPROGRESS; +		return -EINVAL;  	}  	mutex_lock(&root->fs_info->volume_mutex); @@ -2285,7 +2344,7 @@ out:  static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_vol_args *vol_args;  	int ret; @@ -2300,7 +2359,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  			1)) {  		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");  		mnt_drop_write_file(file); -		return -EINPROGRESS; +		return -EINVAL;  	}  	mutex_lock(&root->fs_info->volume_mutex); @@ -2316,8 +2375,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  	kfree(vol_args);  out:  	mutex_unlock(&root->fs_info->volume_mutex); -	mnt_drop_write_file(file);  	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +	mnt_drop_write_file(file);  	return ret;  } @@ -2408,7 +2467,7 @@ out:  static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  				       u64 off, u64 olen, u64 destoff)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct fd src_file;  	struct inode *src; @@ -2454,7 +2513,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  	if (src_file.file->f_path.mnt != file->f_path.mnt)  		goto out_fput; -	src = src_file.file->f_dentry->d_inode; +	src = file_inode(src_file.file);  	ret = -EINVAL;  	if (src == inode) @@ -2816,7 +2875,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)   */  static long btrfs_ioctl_trans_start(struct file *file)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans;  	int ret; @@ -2856,7 +2915,7 @@ out:  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_root *new_root;  	struct btrfs_dir_item *di; @@ -3080,7 +3139,7 @@ out:   */  long btrfs_ioctl_trans_end(struct file *file)  { -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans; @@ -3104,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,  	u64 transid;  	int ret; -	trans = btrfs_attach_transaction(root); +	trans = btrfs_attach_transaction_barrier(root);  	if (IS_ERR(trans)) {  		if (PTR_ERR(trans) != -ENOENT)  			
return PTR_ERR(trans); @@ -3142,7 +3201,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,  static long btrfs_ioctl_scrub(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_scrub_args *sa;  	int ret; @@ -3282,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)  	struct inode_fs_paths *ipath = NULL;  	struct btrfs_path *path; -	if (!capable(CAP_SYS_ADMIN)) +	if (!capable(CAP_DAC_READ_SEARCH))  		return -EPERM;  	path = btrfs_alloc_path(); @@ -3433,12 +3492,12 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,  static long btrfs_ioctl_balance(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_ioctl_balance_args *bargs;  	struct btrfs_balance_control *bctl; +	bool need_unlock; /* for mut. excl. ops lock */  	int ret; -	int need_to_clear_lock = 0;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -3447,14 +3506,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)  	if (ret)  		return ret; -	mutex_lock(&fs_info->volume_mutex); +again: +	if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { +		mutex_lock(&fs_info->volume_mutex); +		mutex_lock(&fs_info->balance_mutex); +		need_unlock = true; +		goto locked; +	} + +	/* +	 * mut. excl. ops lock is locked.  Three possibilities: +	 *   (1) some other op is running +	 *   (2) balance is running +	 *   (3) balance is paused -- special case (think resume) +	 */  	mutex_lock(&fs_info->balance_mutex); +	if (fs_info->balance_ctl) { +		/* this is either (2) or (3) */ +		if (!atomic_read(&fs_info->balance_running)) { +			mutex_unlock(&fs_info->balance_mutex); +			if (!mutex_trylock(&fs_info->volume_mutex)) +				goto again; +			mutex_lock(&fs_info->balance_mutex); + +			if (fs_info->balance_ctl && +			    !atomic_read(&fs_info->balance_running)) { +				/* this is (3) */ +				need_unlock = false; +				goto locked; +			} + +			mutex_unlock(&fs_info->balance_mutex); +			mutex_unlock(&fs_info->volume_mutex); +			goto again; +		} else { +			/* this is (2) */ +			mutex_unlock(&fs_info->balance_mutex); +			ret = -EINPROGRESS; +			goto out; +		} +	} else { +		/* this is (1) */ +		mutex_unlock(&fs_info->balance_mutex); +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +		ret = -EINVAL; +		goto out; +	} + +locked: +	BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));  	if (arg) {  		bargs = memdup_user(arg, sizeof(*bargs));  		if (IS_ERR(bargs)) {  			ret = PTR_ERR(bargs); -			goto out; +			goto out_unlock;  		}  		if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3474,13 +3580,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)  		bargs = NULL;  	} -	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, -			1)) { -		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +	if (fs_info->balance_ctl) {  		ret = -EINPROGRESS;  		goto out_bargs;  	} -	need_to_clear_lock = 1;  	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);  	if (!bctl) { @@ -3501,11 +3604,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)  	}  do_balance: -	ret = btrfs_balance(bctl, bargs);  	/* -	 * bctl is freed in __cancel_balance or in 
free_fs_info if -	 * restriper was paused all the way until unmount +	 * Ownership of bctl and mutually_exclusive_operation_running +	 * goes to btrfs_balance.  bctl is freed in __cancel_balance, +	 * or, if restriper was paused all the way until unmount, in +	 * free_fs_info.  mutually_exclusive_operation_running is +	 * cleared in __cancel_balance.  	 */ +	need_unlock = false; + +	ret = btrfs_balance(bctl, bargs); +  	if (arg) {  		if (copy_to_user(arg, bargs, sizeof(*bargs)))  			ret = -EFAULT; @@ -3513,12 +3622,12 @@ do_balance:  out_bargs:  	kfree(bargs); -out: -	if (need_to_clear_lock) -		atomic_set(&root->fs_info->mutually_exclusive_operation_running, -			   0); +out_unlock:  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex); +	if (need_unlock) +		atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out:  	mnt_drop_write_file(file);  	return ret;  } @@ -3573,7 +3682,7 @@ out:  static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_quota_ctl_args *sa;  	struct btrfs_trans_handle *trans = NULL;  	int ret; @@ -3632,7 +3741,7 @@ drop_write:  static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_qgroup_assign_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3679,7 +3788,7 @@ drop_write:  static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_qgroup_create_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3698,6 +3807,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)  		goto drop_write;  	} +	if (!sa->qgroupid) { +		ret = -EINVAL; +		goto out; +	} +  	trans = btrfs_join_transaction(root);  	if (IS_ERR(trans)) {  		ret = PTR_ERR(trans); @@ -3725,7 +3839,7 @@ drop_write:  static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	struct btrfs_ioctl_qgroup_limit_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3775,7 +3889,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,  					    void __user *arg)  {  	struct btrfs_ioctl_received_subvol_args *sa = NULL; -	struct inode *inode = fdentry(file)->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_root_item *root_item = &root->root_item;  	struct btrfs_trans_handle *trans; @@ -3852,10 +3966,69 @@ out:  	return ret;  } +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	const char *label = root->fs_info->super_copy->label; +	size_t len = strnlen(label, BTRFS_LABEL_SIZE); +	int ret; + +	if (len == BTRFS_LABEL_SIZE) { +		pr_warn("btrfs: label is too long, return the first %zu bytes\n", +			--len); +	} + +	mutex_lock(&root->fs_info->volume_mutex); +	ret = copy_to_user(arg, label, len); +	mutex_unlock(&root->fs_info->volume_mutex); + +	return ret ? 
-EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_super_block *super_block = root->fs_info->super_copy; +	struct btrfs_trans_handle *trans; +	char label[BTRFS_LABEL_SIZE]; +	int ret; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (copy_from_user(label, arg, sizeof(label))) +		return -EFAULT; + +	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { +		pr_err("btrfs: unable to set label with more than %d bytes\n", +		       BTRFS_LABEL_SIZE - 1); +		return -EINVAL; +	} + +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; + +	mutex_lock(&root->fs_info->volume_mutex); +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out_unlock; +	} + +	strcpy(super_block->label, label); +	ret = btrfs_end_transaction(trans, root); + +out_unlock: +	mutex_unlock(&root->fs_info->volume_mutex); +	mnt_drop_write_file(file); +	return ret; +} +  long btrfs_ioctl(struct file *file, unsigned int  		cmd, unsigned long arg)  { -	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; +	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;  	void __user *argp = (void __user *)arg;  	switch (cmd) { @@ -3952,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int  		return btrfs_ioctl_qgroup_limit(file, argp);  	case BTRFS_IOC_DEV_REPLACE:  		return btrfs_ioctl_dev_replace(root, argp); +	case BTRFS_IOC_GET_FSLABEL: +		return btrfs_ioctl_get_fslabel(file, argp); +	case BTRFS_IOC_SET_FSLABEL: +		return btrfs_ioctl_set_fslabel(file, argp);  	}  	return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9cc8c2..00000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Copyright (C) 2007 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __IOCTL_ -#define __IOCTL_ -#include <linux/ioctl.h> - -#define BTRFS_IOCTL_MAGIC 0x94 -#define BTRFS_VOL_NAME_MAX 255 - -/* this should be 4k */ -#define BTRFS_PATH_NAME_MAX 4087 -struct btrfs_ioctl_vol_args { -	__s64 fd; -	char name[BTRFS_PATH_NAME_MAX + 1]; -}; - -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 - -#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0) -#define BTRFS_SUBVOL_RDONLY		(1ULL << 1) -#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2) -#define BTRFS_FSID_SIZE 16 -#define BTRFS_UUID_SIZE 16 - -#define BTRFS_QGROUP_INHERIT_SET_LIMITS	(1ULL << 0) - -struct btrfs_qgroup_limit { -	__u64	flags; -	__u64	max_rfer; -	__u64	max_excl; -	__u64	rsv_rfer; -	__u64	rsv_excl; -}; - -struct btrfs_qgroup_inherit { -	__u64	flags; -	__u64	num_qgroups; -	__u64	num_ref_copies; -	__u64	num_excl_copies; -	struct btrfs_qgroup_limit lim; -	__u64	qgroups[0]; -}; - -struct btrfs_ioctl_qgroup_limit_args { -	__u64	qgroupid; -	struct btrfs_qgroup_limit lim; -}; - -#define BTRFS_SUBVOL_NAME_MAX 4039 -struct btrfs_ioctl_vol_args_v2 { -	__s64 fd; -	__u64 transid; -	__u64 flags; -	union { -		struct { -			__u64 size; -			struct btrfs_qgroup_inherit __user *qgroup_inherit; -		}; -		__u64 unused[4]; -	}; -	char name[BTRFS_SUBVOL_NAME_MAX + 1]; -}; - -/* - * structure to report errors and progress to userspace, either as a - * result of a finished scrub, a canceled scrub or a progress inquiry - */ -struct btrfs_scrub_progress { -	__u64 data_extents_scrubbed;	/* # of data extents scrubbed */ -	__u64 tree_extents_scrubbed;	/* # of tree extents scrubbed */ -	__u64 data_bytes_scrubbed;	/* # of data bytes scrubbed */ -	__u64 tree_bytes_scrubbed;	/* # of tree bytes scrubbed */ -	__u64 read_errors;		/* # of read errors encountered (EIO) */ -	__u64 csum_errors;		/* # of failed csum checks */ -	__u64 verify_errors;		/* # of occurences, where the metadata -					 * of a tree block did not match the -					 * expected values, like generation or -					 * logical */ -	__u64 no_csum;			/* # of 4k data block for which no csum -					 * is present, probably the result of -					 * data written with nodatasum */ -	__u64 csum_discards;		/* # of csum for which no data was found -					 * in the extent tree. */ -	__u64 super_errors;		/* # of bad super blocks encountered */ -	__u64 malloc_errors;		/* # of internal kmalloc errors. These -					 * will likely cause an incomplete -					 * scrub */ -	__u64 uncorrectable_errors;	/* # of errors where either no intact -					 * copy was found or the writeback -					 * failed */ -	__u64 corrected_errors;		/* # of errors corrected */ -	__u64 last_physical;		/* last physical address scrubbed. In -					 * case a scrub was aborted, this can -					 * be used to restart the scrub */ -	__u64 unverified_errors;	/* # of occurences where a read for a -					 * full (64k) bio failed, but the re- -					 * check succeeded for each 4k piece. -					 * Intermittent error. 
*/ -}; - -#define BTRFS_SCRUB_READONLY	1 -struct btrfs_ioctl_scrub_args { -	__u64 devid;				/* in */ -	__u64 start;				/* in */ -	__u64 end;				/* in */ -	__u64 flags;				/* in */ -	struct btrfs_scrub_progress progress;	/* out */ -	/* pad to 1k */ -	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0 -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1 -struct btrfs_ioctl_dev_replace_start_params { -	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */ -	__u64 cont_reading_from_srcdev_mode;	/* in, see #define -						 * above */ -	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */ -	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4 -struct btrfs_ioctl_dev_replace_status_params { -	__u64 replace_state;	/* out, see #define above */ -	__u64 progress_1000;	/* out, 0 <= x <= 1000 */ -	__u64 time_started;	/* out, seconds since 1-Jan-1970 */ -	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */ -	__u64 num_write_errors;	/* out */ -	__u64 num_uncorrectable_read_errors;	/* out */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2 -struct btrfs_ioctl_dev_replace_args { -	__u64 cmd;	/* in */ -	__u64 result;	/* out */ - -	union { -		struct btrfs_ioctl_dev_replace_start_params start; -		struct btrfs_ioctl_dev_replace_status_params status; -	};	/* in/out */ - -	__u64 spare[64]; -}; - -struct btrfs_ioctl_dev_info_args { -	__u64 devid;				/* in/out */ -	__u8 uuid[BTRFS_UUID_SIZE];		/* in/out */ -	__u64 bytes_used;			/* out */ -	__u64 total_bytes;			/* out */ -	__u64 unused[379];			/* pad to 4k */ -	__u8 path[BTRFS_DEVICE_PATH_NAME_MAX];	/* out */ -}; - -struct btrfs_ioctl_fs_info_args { -	__u64 max_id;				/* out */ -	__u64 num_devices;			/* out */ -	__u8 fsid[BTRFS_FSID_SIZE];		/* out */ -	__u64 reserved[124];			/* pad to 1k */ -}; - -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE		1 -#define BTRFS_BALANCE_CTL_CANCEL	2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { -	__u64 profiles; -	__u64 usage; -	__u64 devid; -	__u64 pstart; -	__u64 pend; -	__u64 vstart; -	__u64 vend; - -	__u64 target; - -	__u64 flags; - -	__u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { -	__u64 expected;		/* estimated # of chunks that will be -				 * relocated to fulfill the request */ -	__u64 considered;	/* # of chunks we have considered so far */ -	__u64 completed;	/* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING	(1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ	(1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ	(1ULL << 2) - -struct btrfs_ioctl_balance_args { -	__u64 flags;				/* in/out */ -	__u64 state;				/* out */ - -	struct btrfs_balance_args data;		/* in/out */ -	struct btrfs_balance_args meta;		/* in/out */ -	struct 
btrfs_balance_args sys;		/* in/out */ - -	struct btrfs_balance_progress stat;	/* out */ - -	__u64 unused[72];			/* pad to 1k */ -}; - -#define BTRFS_INO_LOOKUP_PATH_MAX 4080 -struct btrfs_ioctl_ino_lookup_args { -	__u64 treeid; -	__u64 objectid; -	char name[BTRFS_INO_LOOKUP_PATH_MAX]; -}; - -struct btrfs_ioctl_search_key { -	/* which root are we searching.  0 is the tree of tree roots */ -	__u64 tree_id; - -	/* keys returned will be >= min and <= max */ -	__u64 min_objectid; -	__u64 max_objectid; - -	/* keys returned will be >= min and <= max */ -	__u64 min_offset; -	__u64 max_offset; - -	/* max and min transids to search for */ -	__u64 min_transid; -	__u64 max_transid; - -	/* keys returned will be >= min and <= max */ -	__u32 min_type; -	__u32 max_type; - -	/* -	 * how many items did userland ask for, and how many are we -	 * returning -	 */ -	__u32 nr_items; - -	/* align to 64 bits */ -	__u32 unused; - -	/* some extra for later */ -	__u64 unused1; -	__u64 unused2; -	__u64 unused3; -	__u64 unused4; -}; - -struct btrfs_ioctl_search_header { -	__u64 transid; -	__u64 objectid; -	__u64 offset; -	__u32 type; -	__u32 len; -}; - -#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) -/* - * the buf is an array of search headers where - * each header is followed by the actual item - * the type field is expanded to 32 bits for alignment - */ -struct btrfs_ioctl_search_args { -	struct btrfs_ioctl_search_key key; -	char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; -}; - -struct btrfs_ioctl_clone_range_args { -  __s64 src_fd; -  __u64 src_offset, src_length; -  __u64 dest_offset; -}; - -/* flags for the defrag range ioctl */ -#define BTRFS_DEFRAG_RANGE_COMPRESS 1 -#define BTRFS_DEFRAG_RANGE_START_IO 2 - -struct btrfs_ioctl_space_info { -	__u64 flags; -	__u64 total_bytes; -	__u64 used_bytes; -}; - -struct btrfs_ioctl_space_args { -	__u64 space_slots; -	__u64 total_spaces; -	struct btrfs_ioctl_space_info spaces[0]; -}; - -struct btrfs_data_container { -	__u32	bytes_left;	/* out -- bytes not needed to deliver output */ -	__u32	bytes_missing;	/* out -- additional bytes needed for result */ -	__u32	elem_cnt;	/* out */ -	__u32	elem_missed;	/* out */ -	__u64	val[0];		/* out */ -}; - -struct btrfs_ioctl_ino_path_args { -	__u64				inum;		/* in */ -	__u64				size;		/* in */ -	__u64				reserved[4]; -	/* struct btrfs_data_container	*fspath;	   out */ -	__u64				fspath;		/* out */ -}; - -struct btrfs_ioctl_logical_ino_args { -	__u64				logical;	/* in */ -	__u64				size;		/* in */ -	__u64				reserved[4]; -	/* struct btrfs_data_container	*inodes;	out   */ -	__u64				inodes; -}; - -enum btrfs_dev_stat_values { -	/* disk I/O failure stats */ -	BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ -	BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ -	BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ - -	/* stats for indirect indications for I/O failures */ -	BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or -					 * contents is illegal: this is an -					 * indication that the block was damaged -					 * during read or write, or written to -					 * wrong location or read from wrong -					 * location */ -	BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not -					 * been written */ - -	BTRFS_DEV_STAT_VALUES_MAX -}; - -/* Reset statistics after reading; needs SYS_ADMIN capability */ -#define	BTRFS_DEV_STATS_RESET		(1ULL << 0) - -struct btrfs_ioctl_get_dev_stats { -	__u64 devid;				/* in */ -	__u64 nr_items;				/* in/out */ -	
__u64 flags;				/* in/out */ - -	/* out values: */ -	__u64 values[BTRFS_DEV_STAT_VALUES_MAX]; - -	__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ -}; - -#define BTRFS_QUOTA_CTL_ENABLE	1 -#define BTRFS_QUOTA_CTL_DISABLE	2 -#define BTRFS_QUOTA_CTL_RESCAN	3 -struct btrfs_ioctl_quota_ctl_args { -	__u64 cmd; -	__u64 status; -}; - -struct btrfs_ioctl_qgroup_assign_args { -	__u64 assign; -	__u64 src; -	__u64 dst; -}; - -struct btrfs_ioctl_qgroup_create_args { -	__u64 create; -	__u64 qgroupid; -}; -struct btrfs_ioctl_timespec { -	__u64 sec; -	__u32 nsec; -}; - -struct btrfs_ioctl_received_subvol_args { -	char	uuid[BTRFS_UUID_SIZE];	/* in */ -	__u64	stransid;		/* in */ -	__u64	rtransid;		/* out */ -	struct btrfs_ioctl_timespec stime; /* in */ -	struct btrfs_ioctl_timespec rtime; /* out */ -	__u64	flags;			/* in */ -	__u64	reserved[16];		/* in */ -}; - -struct btrfs_ioctl_send_args { -	__s64 send_fd;			/* in */ -	__u64 clone_sources_count;	/* in */ -	__u64 __user *clone_sources;	/* in */ -	__u64 parent_root;		/* in */ -	__u64 flags;			/* in */ -	__u64 reserved[4];		/* in */ -}; - -#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ -				   struct btrfs_ioctl_vol_args) -/* trans start and trans end are dangerous, and only for - * use by applications that know how to avoid the - * resulting deadlocks - */ -#define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6) -#define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7) -#define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8) - -#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int) -#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ -				   struct btrfs_ioctl_vol_args) - -#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ -				  struct btrfs_ioctl_clone_range_args) - -#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ -				   struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ -				struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ -				struct btrfs_ioctl_defrag_range_args) -#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ -				   struct btrfs_ioctl_search_args) -#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ -				   struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) -#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ -				    struct btrfs_ioctl_space_args) -#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) -#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ -				   struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ -				   struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) -#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) -#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ -			      struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_SCRUB_CANCEL 
_IO(BTRFS_IOCTL_MAGIC, 28) -#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ -				       struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ -				 struct btrfs_ioctl_dev_info_args) -#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ -			       struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ -				   struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ -					struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ -					struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ -					struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ -				struct btrfs_ioctl_received_subvol_args) -#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) -#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ -				     struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ -			       struct btrfs_ioctl_quota_ctl_args) -#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ -			       struct btrfs_ioctl_qgroup_assign_args) -#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ -			       struct btrfs_ioctl_qgroup_create_args) -#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ -			       struct btrfs_ioctl_qgroup_limit_args) -#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ -				      struct btrfs_ioctl_get_dev_stats) -#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ -				    struct btrfs_ioctl_dev_replace_args) - -#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c6604..e95df435d89 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -113,11 +113,10 @@ again:  		read_unlock(&eb->lock);  		return;  	} -	read_unlock(&eb->lock); -	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); -	read_lock(&eb->lock);  	if (atomic_read(&eb->blocking_writers)) {  		read_unlock(&eb->lock); +		wait_event(eb->write_lock_wq, +			   atomic_read(&eb->blocking_writers) == 0);  		goto again;  	}  	atomic_inc(&eb->read_locks); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f1073129704..dc08d77b717 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	entry->file_offset = file_offset;  	entry->start = start;  	entry->len = len; +	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && +	    !(type == BTRFS_ORDERED_NOCOW)) +		entry->csum_bytes_left = disk_len;  	entry->disk_len = disk_len;  	entry->bytes_left = len;  	entry->inode = igrab(inode); @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	INIT_LIST_HEAD(&entry->root_extent_list);  	INIT_LIST_HEAD(&entry->work_list);  	init_completion(&entry->completion); +	INIT_LIST_HEAD(&entry->log_list);  	trace_btrfs_ordered_extent_add(inode, entry); @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,  	tree = &BTRFS_I(inode)->ordered_tree;  	spin_lock_irq(&tree->lock);  	list_add_tail(&sum->list, &entry->list); +	WARN_ON(entry->csum_bytes_left < sum->len); +	entry->csum_bytes_left -= sum->len; +	if (entry->csum_bytes_left == 0) +		wake_up(&entry->wait);  	spin_unlock_irq(&tree->lock);  } @@ -405,6 
+413,66 @@ out:  	return ret == 0;  } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +{ +	struct btrfs_ordered_inode_tree *tree; +	struct btrfs_ordered_extent *ordered; +	struct rb_node *n; +	int index = log->log_transid % 2; + +	tree = &BTRFS_I(inode)->ordered_tree; +	spin_lock_irq(&tree->lock); +	for (n = rb_first(&tree->tree); n; n = rb_next(n)) { +		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); +		spin_lock(&log->log_extents_lock[index]); +		if (list_empty(&ordered->log_list)) { +			list_add_tail(&ordered->log_list, &log->logged_list[index]); +			atomic_inc(&ordered->refs); +		} +		spin_unlock(&log->log_extents_lock[index]); +	} +	spin_unlock_irq(&tree->lock); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ +	struct btrfs_ordered_extent *ordered; +	int index = transid % 2; + +	spin_lock_irq(&log->log_extents_lock[index]); +	while (!list_empty(&log->logged_list[index])) { +		ordered = list_first_entry(&log->logged_list[index], +					   struct btrfs_ordered_extent, +					   log_list); +		list_del_init(&ordered->log_list); +		spin_unlock_irq(&log->log_extents_lock[index]); +		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, +						   &ordered->flags)); +		btrfs_put_ordered_extent(ordered); +		spin_lock_irq(&log->log_extents_lock[index]); +	} +	spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ +	struct btrfs_ordered_extent *ordered; +	int index = transid % 2; + +	spin_lock_irq(&log->log_extents_lock[index]); +	while (!list_empty(&log->logged_list[index])) { +		ordered = list_first_entry(&log->logged_list[index], +					   struct btrfs_ordered_extent, +					   log_list); +		list_del_init(&ordered->log_list); +		spin_unlock_irq(&log->log_extents_lock[index]); +		btrfs_put_ordered_extent(ordered); +		spin_lock_irq(&log->log_extents_lock[index]); +	} +	spin_unlock_irq(&log->log_extents_lock[index]); +} +  /*   * used to drop a reference on an ordered extent.  This will free   * the extent if the last reference is dropped @@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)   * extra check to make sure the ordered operation list really is empty   * before we return   */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, int wait)  {  	struct btrfs_inode *btrfs_inode;  	struct inode *inode; +	struct btrfs_transaction *cur_trans = trans->transaction;  	struct list_head splice;  	struct list_head works;  	struct btrfs_delalloc_work *work, *next; @@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)  	mutex_lock(&root->fs_info->ordered_operations_mutex);  	spin_lock(&root->fs_info->ordered_extent_lock); -again: -	list_splice_init(&root->fs_info->ordered_operations, &splice); - +	list_splice_init(&cur_trans->ordered_operations, &splice);  	while (!list_empty(&splice)) { -  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,  				   ordered_operations); -  		inode = &btrfs_inode->vfs_inode;  		list_del_init(&btrfs_inode->ordered_operations); @@ -574,24 +640,22 @@ again:  		 * the inode may be getting freed (in sys_unlink path).  		 
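
The three helpers added above keep per-log-transaction lists of ordered extents, alternating between two lists selected by transid % 2, so one log transaction can be waited on or freed while the next one accumulates entries. A stripped-down sketch of that double-buffering pattern (names here are illustrative, not kernel API):

struct two_phase_log {
	struct list_head logged_list[2];
	spinlock_t	 lock[2];
	u64		 transid;
};

static void log_one(struct two_phase_log *log, struct list_head *entry)
{
	int index = log->transid % 2;	/* same selector as above */

	spin_lock(&log->lock[index]);
	if (list_empty(entry))		/* not yet on either list */
		list_add_tail(entry, &log->logged_list[index]);
	spin_unlock(&log->lock[index]);
}
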
*/  		inode = igrab(inode); - -		if (!wait && inode) { -			list_add_tail(&BTRFS_I(inode)->ordered_operations, -			      &root->fs_info->ordered_operations); -		} -  		if (!inode)  			continue; + +		if (!wait) +			list_add_tail(&BTRFS_I(inode)->ordered_operations, +				      &cur_trans->ordered_operations);  		spin_unlock(&root->fs_info->ordered_extent_lock);  		work = btrfs_alloc_delalloc_work(inode, wait, 1);  		if (!work) { +			spin_lock(&root->fs_info->ordered_extent_lock);  			if (list_empty(&BTRFS_I(inode)->ordered_operations))  				list_add_tail(&btrfs_inode->ordered_operations,  					      &splice); -			spin_lock(&root->fs_info->ordered_extent_lock);  			list_splice_tail(&splice, -					 &root->fs_info->ordered_operations); +					 &cur_trans->ordered_operations);  			spin_unlock(&root->fs_info->ordered_extent_lock);  			ret = -ENOMEM;  			goto out; @@ -603,9 +667,6 @@ again:  		cond_resched();  		spin_lock(&root->fs_info->ordered_extent_lock);  	} -	if (wait && !list_empty(&root->fs_info->ordered_operations)) -		goto again; -  	spin_unlock(&root->fs_info->ordered_extent_lock);  out:  	list_for_each_entry_safe(work, next, &works, list) { @@ -836,9 +897,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  	 * if the disk i_size is already at the inode->i_size, or  	 * this ordered extent is inside the disk i_size, we're done  	 */ -	if (disk_i_size == i_size || offset <= disk_i_size) { +	if (disk_i_size == i_size) +		goto out; + +	/* +	 * We still need to update disk_i_size if outstanding_isize is greater +	 * than disk_i_size. +	 */ +	if (offset <= disk_i_size && +	    (!ordered || ordered->outstanding_isize <= disk_i_size))  		goto out; -	}  	/*  	 * walk backward from this ordered extent to disk_i_size. @@ -870,7 +938,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  			break;  		if (test->file_offset >= i_size)  			break; -		if (test->file_offset >= disk_i_size) { +		if (entry_end(test) > disk_i_size) {  			/*  			 * we don't update disk_i_size now, so record this  			 * undealt i_size. Or we will not know the real @@ -967,6 +1035,7 @@ out:  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root, struct inode *inode)  { +	struct btrfs_transaction *cur_trans = trans->transaction;  	u64 last_mod;  	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -981,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  	spin_lock(&root->fs_info->ordered_extent_lock);  	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {  		list_add_tail(&BTRFS_I(inode)->ordered_operations, -			      &root->fs_info->ordered_operations); +			      &cur_trans->ordered_operations);  	}  	spin_unlock(&root->fs_info->ordered_extent_lock);  } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe..8eadfe406cd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -79,6 +79,8 @@ struct btrfs_ordered_sum {  #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent  				       * has done its due diligence in updating  				       * the isize. 
 */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this
+				       ordered extent */
 
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
+	/* number of bytes that still need csumming */
+	u64 csum_bytes_left;
+
 	/*
 	 * the end of the ordered extent which is behind it but
 	 * didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
+	/* If we need to wait on this to be done */
+	struct list_head log_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a..920957ecb27 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       btrfs_dev_extent_chunk_offset(l, dev_extent),
 			       (unsigned long long)
 			       btrfs_dev_extent_length(l, dev_extent));
+			break;
 		case BTRFS_DEV_STATS_KEY:
 			printk(KERN_INFO "\t\tdevice stats\n");
 			break;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8..aee4b1cc3d9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/btrfs.h>
 
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "locking.h"
 #include "ulist.h"
-#include "ioctl.h"
 #include "backref.h"
 
 /* TODO XXX FIXME
@@ -379,6 +379,13 @@ next1:
 		ret = add_relation_rb(fs_info, found_key.objectid,
 				      found_key.offset);
+		if (ret == -ENOENT) {
+			printk(KERN_WARNING
+				"btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+				(unsigned long long)found_key.objectid,
+				(unsigned long long)found_key.offset);
+			ret = 0;	/* ignore the error */
+		}
 		if (ret)
 			goto out;
 next2:
@@ -613,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 	key.offset = qgroupid;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret > 0)
 		ret = -ENOENT;
@@ -654,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
 	key.offset = qgroup->qgroupid;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret =
btrfs_search_slot(trans, root, &key, path, 0, 1);  	if (ret > 0)  		ret = -ENOENT; @@ -695,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,  	key.offset = 0;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM; +  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);  	if (ret > 0)  		ret = -ENOENT; @@ -725,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,  {  	struct btrfs_path *path;  	struct btrfs_key key; +	struct extent_buffer *leaf = NULL;  	int ret; - -	if (!root) -		return -EINVAL; +	int nr = 0;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	while (1) { -		key.objectid = 0; -		key.offset = 0; -		key.type = 0; +	path->leave_spinning = 1; -		path->leave_spinning = 1; +	key.objectid = 0; +	key.offset = 0; +	key.type = 0; + +	while (1) {  		ret = btrfs_search_slot(trans, root, &key, path, -1, 1); -		if (ret > 0) { -			if (path->slots[0] == 0) -				break; -			path->slots[0]--; -		} else if (ret < 0) { +		if (ret < 0) +			goto out; +		leaf = path->nodes[0]; +		nr = btrfs_header_nritems(leaf); +		if (!nr)  			break; -		} - -		ret = btrfs_del_item(trans, root, path); +		/* +		 * delete the leaf one by one +		 * since the whole tree is going +		 * to be deleted. +		 */ +		path->slots[0] = 0; +		ret = btrfs_del_items(trans, root, path, 0, nr);  		if (ret)  			goto out; +  		btrfs_release_path(path);  	}  	ret = 0; @@ -840,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,  	int ret = 0;  	spin_lock(&fs_info->qgroup_lock); +	if (!fs_info->quota_root) { +		spin_unlock(&fs_info->qgroup_lock); +		return 0; +	}  	fs_info->quota_enabled = 0;  	fs_info->pending_quota_state = 0;  	quota_root = fs_info->quota_root; @@ -956,17 +978,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,  			struct btrfs_fs_info *fs_info, u64 qgroupid)  {  	struct btrfs_root *quota_root; +	struct btrfs_qgroup *qgroup;  	int ret = 0;  	quota_root = fs_info->quota_root;  	if (!quota_root)  		return -EINVAL; +	/* check if there are no relations to this qgroup */ +	spin_lock(&fs_info->qgroup_lock); +	qgroup = find_qgroup_rb(fs_info, qgroupid); +	if (qgroup) { +		if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { +			spin_unlock(&fs_info->qgroup_lock); +			return -EBUSY; +		} +	} +	spin_unlock(&fs_info->qgroup_lock); +  	ret = del_qgroup_item(trans, quota_root, qgroupid);  	spin_lock(&fs_info->qgroup_lock);  	del_qgroup_rb(quota_root->fs_info, qgroupid); -  	spin_unlock(&fs_info->qgroup_lock);  	return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 00000000000..9a79fb790ad --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2100 @@ +/* + * Copyright (C) 2012 Fusion-io  All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/hash.h> +#include <linux/list_sort.h> +#include <linux/raid/xor.h> +#include <linux/vmalloc.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT	1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT		2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT	3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { +	struct btrfs_fs_info *fs_info; +	struct btrfs_bio *bbio; + +	/* +	 * logical block numbers for the start of each stripe +	 * The last one or two are p/q.  These are sorted, +	 * so raid_map[0] is the start of our full stripe +	 */ +	u64 *raid_map; + +	/* while we're doing rmw on a stripe +	 * we put it into a hash table so we can +	 * lock the stripe and merge more rbios +	 * into it. +	 */ +	struct list_head hash_list; + +	/* +	 * LRU list for the stripe cache +	 */ +	struct list_head stripe_cache; + +	/* +	 * for scheduling work in the helper threads +	 */ +	struct btrfs_work work; + +	/* +	 * bio list and bio_list_lock are used +	 * to add more bios into the stripe +	 * in hopes of avoiding the full rmw +	 */ +	struct bio_list bio_list; +	spinlock_t bio_list_lock; + +	/* also protected by the bio_list_lock, the +	 * plug list is used by the plugging code +	 * to collect partial bios while plugged.  The +	 * stripe locking code also uses it to hand off +	 * the stripe lock to the next pending IO +	 */ +	struct list_head plug_list; + +	/* +	 * flags that tell us if it is safe to +	 * merge with this bio +	 */ +	unsigned long flags; + +	/* size of each individual stripe on disk */ +	int stripe_len; + +	/* number of data stripes (no p/q) */ +	int nr_data; + +	/* +	 * set if we're doing a parity rebuild +	 * for a read from higher up, which is handled +	 * differently from a parity rebuild as part of +	 * rmw +	 */ +	int read_rebuild; + +	/* first bad stripe */ +	int faila; + +	/* second bad stripe (for raid6 use) */ +	int failb; + +	/* +	 * number of pages needed to represent the full +	 * stripe +	 */ +	int nr_pages; + +	/* +	 * size of all the bios in the bio_list.  This +	 * helps us decide if the rbio maps to a full +	 * stripe or not +	 */ +	int bio_list_bytes; + +	atomic_t refs; + +	/* +	 * these are two arrays of pointers.  We allocate the +	 * rbio big enough to hold them both and setup their +	 * locations when the rbio is allocated +	 */ + +	/* pointers to pages that we allocated for +	 * reading/writing stripes directly from the disk (including P/Q) +	 */ +	struct page **stripe_pages; + +	/* +	 * pointers to the pages in the bio_list.  
Stored +	 * here for faster lookup +	 */ +	struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ +	struct btrfs_stripe_hash_table *table; +	struct btrfs_stripe_hash_table *x; +	struct btrfs_stripe_hash *cur; +	struct btrfs_stripe_hash *h; +	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; +	int i; +	int table_size; + +	if (info->stripe_hash_table) +		return 0; + +	/* +	 * The table is large, starting with order 4 and can go as high as +	 * order 7 in case lock debugging is turned on. +	 * +	 * Try harder to allocate and fallback to vmalloc to lower the chance +	 * of a failing mount. +	 */ +	table_size = sizeof(*table) + sizeof(*h) * num_entries; +	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); +	if (!table) { +		table = vzalloc(table_size); +		if (!table) +			return -ENOMEM; +	} + +	spin_lock_init(&table->cache_lock); +	INIT_LIST_HEAD(&table->stripe_cache); + +	h = table->table; + +	for (i = 0; i < num_entries; i++) { +		cur = h + i; +		INIT_LIST_HEAD(&cur->hash_list); +		spin_lock_init(&cur->lock); +		init_waitqueue_head(&cur->wait); +	} + +	x = cmpxchg(&info->stripe_hash_table, NULL, table); +	if (x) { +		if (is_vmalloc_addr(x)) +			vfree(x); +		else +			kfree(x); +	} +	return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array.  We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ +	int i; +	char *s; +	char *d; +	int ret; + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		return; + +	for (i = 0; i < rbio->nr_pages; i++) { +		if (!rbio->bio_pages[i]) +			continue; + +		s = kmap(rbio->bio_pages[i]); +		d = kmap(rbio->stripe_pages[i]); + +		memcpy(d, s, PAGE_CACHE_SIZE); + +		kunmap(rbio->bio_pages[i]); +		kunmap(rbio->stripe_pages[i]); +		SetPageUptodate(rbio->stripe_pages[i]); +	} +	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ +	u64 num = rbio->raid_map[0]; + +	/* +	 * we shift down quite a bit.  We're using byte +	 * addressing, and most of the lower bits are zeros. +	 * This tends to upset hash_64, and it consistently +	 * returns just one or two different values. +	 * +	 * shifting off the lower bits fixes things. 
+	 */ +	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ +	int i; +	struct page *s; +	struct page *d; + +	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) +		return; + +	for (i = 0; i < dest->nr_pages; i++) { +		s = src->stripe_pages[i]; +		if (!s || !PageUptodate(s)) { +			continue; +		} + +		d = dest->stripe_pages[i]; +		if (d) +			__free_page(d); + +		dest->stripe_pages[i] = s; +		src->stripe_pages[i] = NULL; +	} +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination.  The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, +		       struct btrfs_raid_bio *victim) +{ +	bio_list_merge(&dest->bio_list, &victim->bio_list); +	dest->bio_list_bytes += victim->bio_list_bytes; +	bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache.  The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ +	int bucket = rbio_bucket(rbio); +	struct btrfs_stripe_hash_table *table; +	struct btrfs_stripe_hash *h; +	int freeit = 0; + +	/* +	 * check the bit again under the hash table lock. +	 */ +	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; +	h = table->table + bucket; + +	/* hold the lock for the bucket because we may be +	 * removing it from the hash table +	 */ +	spin_lock(&h->lock); + +	/* +	 * hold the lock for the bio list because we need +	 * to make sure the bio list is empty +	 */ +	spin_lock(&rbio->bio_list_lock); + +	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { +		list_del_init(&rbio->stripe_cache); +		table->cache_size -= 1; +		freeit = 1; + +		/* if the bio list isn't empty, this rbio is +		 * still involved in an IO.  We take it out +		 * of the cache list, and drop the ref that +		 * was held for the list. 
+		 * +		 * If the bio_list was empty, we also remove +		 * the rbio from the hash_table, and drop +		 * the corresponding ref +		 */ +		if (bio_list_empty(&rbio->bio_list)) { +			if (!list_empty(&rbio->hash_list)) { +				list_del_init(&rbio->hash_list); +				atomic_dec(&rbio->refs); +				BUG_ON(!list_empty(&rbio->plug_list)); +			} +		} +	} + +	spin_unlock(&rbio->bio_list_lock); +	spin_unlock(&h->lock); + +	if (freeit) +		__free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; + +	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	__remove_rbio_from_cache(rbio); +	spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; +	struct btrfs_raid_bio *rbio; + +	table = info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	while (!list_empty(&table->stripe_cache)) { +		rbio = list_entry(table->stripe_cache.next, +				  struct btrfs_raid_bio, +				  stripe_cache); +		__remove_rbio_from_cache(rbio); +	} +	spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ +	if (!info->stripe_hash_table) +		return; +	btrfs_clear_rbio_cache(info); +	if (is_vmalloc_addr(info->stripe_hash_table)) +		vfree(info->stripe_hash_table); +	else +		kfree(info->stripe_hash_table); +	info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache.  It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; + +	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	spin_lock(&rbio->bio_list_lock); + +	/* bump our ref if we were not in the list before */ +	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) +		atomic_inc(&rbio->refs); + +	if (!list_empty(&rbio->stripe_cache)){ +		list_move(&rbio->stripe_cache, &table->stripe_cache); +	} else { +		list_add(&rbio->stripe_cache, &table->stripe_cache); +		table->cache_size += 1; +	} + +	spin_unlock(&rbio->bio_list_lock); + +	if (table->cache_size > RBIO_CACHE_SIZE) { +		struct btrfs_raid_bio *found; + +		found = list_entry(table->stripe_cache.prev, +				  struct btrfs_raid_bio, +				  stripe_cache); + +		if (found != rbio) +			__remove_rbio_from_cache(found); +	} + +	spin_unlock_irqrestore(&table->cache_lock, flags); +	return; +} + +/* + * helper function to run the xor_blocks api.  It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. 
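
Both the hash-table allocation earlier and the teardown above lean on the same pairing: try kzalloc() with __GFP_NOWARN first, fall back to vzalloc(), and later choose the matching free via is_vmalloc_addr(). The pattern in isolation (a condensed sketch of what the code above does, not a new API):

static void *zalloc_large(size_t size)
{
	void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

	return p ? p : vzalloc(size);
}

static void free_large(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}
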
+ */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ +	int src_off = 0; +	int xor_src_cnt = 0; +	void *dest = pages[src_cnt]; + +	while(src_cnt > 0) { +		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); +		xor_blocks(xor_src_cnt, len, dest, pages + src_off); + +		src_cnt -= xor_src_cnt; +		src_off += xor_src_cnt; +	} +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ +	unsigned long size = rbio->bio_list_bytes; +	int ret = 1; + +	if (size != rbio->nr_data * rbio->stripe_len) +		ret = 0; + +	BUG_ON(size > rbio->nr_data * rbio->stripe_len); +	return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&rbio->bio_list_lock, flags); +	ret = __rbio_is_full(rbio); +	spin_unlock_irqrestore(&rbio->bio_list_lock, flags); +	return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, +			  struct btrfs_raid_bio *cur) +{ +	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || +	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) +		return 0; + +	/* +	 * we can't merge with cached rbios, since the +	 * idea is that when we merge the destination +	 * rbio is going to run our IO for us.  We can +	 * steal from cached rbio's though, other functions +	 * handle that. +	 */ +	if (test_bit(RBIO_CACHE_BIT, &last->flags) || +	    test_bit(RBIO_CACHE_BIT, &cur->flags)) +		return 0; + +	if (last->raid_map[0] != +	    cur->raid_map[0]) +		return 0; + +	/* reads can't merge with writes */ +	if (last->read_rebuild != +	    cur->read_rebuild) { +		return 0; +	} + +	return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ +	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; +	return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ +	if (rbio->nr_data + 1 == rbio->bbio->num_stripes) +		return NULL; + +	index += ((rbio->nr_data + 1) * rbio->stripe_len) >> +		PAGE_CACHE_SHIFT; +	return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock.  rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet.  The rbio is given + * the lock and 0 is returned.  The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner.  The rbio is freed and the IO will + * start automatically along with the existing rbio.  1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list.  When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. 
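
To make the index arithmetic in rbio_pstripe_page() and rbio_qstripe_page() above concrete: with 4k pages, a 64k stripe_len and nr_data == 2, entries 0-15 of stripe_pages[] hold data stripe 0, 16-31 hold data stripe 1, 32-47 hold P, and 48-63 hold Q when it exists. The same computation, written out as a worked sketch:

static struct page *parity_page_example(struct btrfs_raid_bio *rbio, int n)
{
	/* skip past all the data-stripe pages, exactly as
	 * rbio_pstripe_page() does above */
	int index = n + ((rbio->nr_data * rbio->stripe_len) >>
			 PAGE_CACHE_SHIFT);

	return rbio->stripe_pages[index];
}
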
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission.  If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *pending;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
+	int ret = 0;
+	int walk = 0;
+
+	spin_lock_irqsave(&h->lock, flags);
+	list_for_each_entry(cur, &h->hash_list, hash_list) {
+		walk++;
+		if (cur->raid_map[0] == rbio->raid_map[0]) {
+			spin_lock(&cur->bio_list_lock);
+
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
+			/* can we merge into the lock owner? */
+			if (rbio_can_merge(cur, rbio)) {
+				merge_rbio(cur, rbio);
+				spin_unlock(&cur->bio_list_lock);
+				freeit = rbio;
+				ret = 1;
+				goto out;
+			}
+
+
+			/*
+			 * we couldn't merge with the running
+			 * rbio, see if we can merge with the
+			 * pending ones.  We don't have to
+			 * check for rmw_locked because there
+			 * is no way they are inside finish_rmw
+			 * right now
+			 */
+			list_for_each_entry(pending, &cur->plug_list,
+					    plug_list) {
+				if (rbio_can_merge(pending, rbio)) {
+					merge_rbio(pending, rbio);
+					spin_unlock(&cur->bio_list_lock);
+					freeit = rbio;
+					ret = 1;
+					goto out;
+				}
+			}
+
+			/* no merging, put us on the tail of the plug list,
+			 * our rbio will be started when the currently
+			 * running rbio unlocks
+			 */
+			list_add_tail(&rbio->plug_list, &cur->plug_list);
+			spin_unlock(&cur->bio_list_lock);
+			ret = 1;
+			goto out;
+		}
+	}
+lockit:
+	atomic_inc(&rbio->refs);
+	list_add(&rbio->hash_list, &h->hash_list);
+out:
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
+	if (freeit)
+		__free_raid_bio(freeit);
+	return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed.  If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bucket;
+	struct btrfs_stripe_hash *h;
+	unsigned long flags;
+	int keep_cache = 0;
+
+	bucket = rbio_bucket(rbio);
+	h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
+	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
+
+		list_del_init(&rbio->hash_list);
+		atomic_dec(&rbio->refs);
+
+		/*
+		 * we use the plug list to hold all the rbios
+		 * waiting for the chance to lock this stripe.
+		 * hand the lock over to one of them.
+		 */
+		if (!list_empty(&rbio->plug_list)) {
+			struct btrfs_raid_bio *next;
+			struct list_head *head = rbio->plug_list.next;
+
+			next = list_entry(head, struct btrfs_raid_bio,
+					  plug_list);
+
+			list_del_init(&rbio->plug_list);
+
+			list_add(&next->hash_list, &h->hash_list);
+			atomic_inc(&next->refs);
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+
+			if (next->read_rebuild)
+				async_read_rebuild(next);
+			else {
+				steal_rbio(rbio, next);
+				async_rmw_stripe(next);
+			}
+
+			goto done_nolock;
+		} else if (waitqueue_active(&h->wait)) {
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+			wake_up(&h->wait);
+			goto done_nolock;
+		}
+	}
+done:
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	int i;
+
+	WARN_ON(atomic_read(&rbio->refs) < 0);
+	if (!atomic_dec_and_test(&rbio->refs))
+		return;
+
+	WARN_ON(!list_empty(&rbio->stripe_cache));
+	WARN_ON(!list_empty(&rbio->hash_list));
+	WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i]) {
+			__free_page(rbio->stripe_pages[i]);
+			rbio->stripe_pages[i] = NULL;
+		}
+	}
+	kfree(rbio->raid_map);
+	kfree(rbio->bbio);
+	kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	unlock_stripe(rbio);
+	__free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+	struct bio *cur = bio_list_get(&rbio->bio_list);
+	struct bio *next;
+	free_raid_bio(rbio);
+
+	while (cur) {
+		next = cur->bi_next;
+		cur->bi_next = NULL;
+		if (uptodate)
+			set_bit(BIO_UPTODATE, &cur->bi_flags);
+		bio_endio(cur, err);
+		cur = next;
+	}
+}
+
+/*
+ * end io function used by finish_rmw.  When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	/* OK, we have written all the stripes we need to. */
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+	return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else.  This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+				 int index, int pagenr, int bio_list_only)
+{
+	int chunk_page;
+	struct page *p = NULL;
+
+	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	p = rbio->bio_pages[chunk_page];
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	if (p || bio_list_only)
+		return p;
+
+	return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+	unsigned long nr = stripe_len * nr_stripes;
+	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio.  Note
+ * that this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	int nr_data = 0;
+	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+	void *p;
+
+	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+			GFP_NOFS);
+	if (!rbio) {
+		kfree(raid_map);
+		kfree(bbio);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	bio_list_init(&rbio->bio_list);
+	INIT_LIST_HEAD(&rbio->plug_list);
+	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
+	INIT_LIST_HEAD(&rbio->hash_list);
+	rbio->bbio = bbio;
+	rbio->raid_map = raid_map;
+	rbio->fs_info = root->fs_info;
+	rbio->stripe_len = stripe_len;
+	rbio->nr_pages = num_pages;
+	rbio->faila = -1;
+	rbio->failb = -1;
+	atomic_set(&rbio->refs, 1);
+
+	/*
+	 * the stripe_pages and bio_pages array point to the extra
+	 * memory we allocated past the end of the rbio
+	 */
+	p = rbio + 1;
+	rbio->stripe_pages = p;
+	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+		nr_data = bbio->num_stripes - 2;
+	else
+		nr_data = bbio->num_stripes - 1;
+
+	rbio->nr_data = nr_data;
+	return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+		ClearPageUptodate(page);
+	}
+	return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+	for (; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page =
alloc_page(GFP_NOFS | __GFP_HIGHMEM); +		if (!page) +			return -ENOMEM; +		rbio->stripe_pages[i] = page; +	} +	return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +int rbio_add_io_page(struct btrfs_raid_bio *rbio, +		     struct bio_list *bio_list, +		     struct page *page, +		     int stripe_nr, +		     unsigned long page_index, +		     unsigned long bio_max_len) +{ +	struct bio *last = bio_list->tail; +	u64 last_end = 0; +	int ret; +	struct bio *bio; +	struct btrfs_bio_stripe *stripe; +	u64 disk_start; + +	stripe = &rbio->bbio->stripes[stripe_nr]; +	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + +	/* if the device is missing, just fail this stripe */ +	if (!stripe->dev->bdev) +		return fail_rbio_index(rbio, stripe_nr); + +	/* see if we can add this page onto our existing bio */ +	if (last) { +		last_end = (u64)last->bi_sector << 9; +		last_end += last->bi_size; + +		/* +		 * we can't merge these if they are from different +		 * devices or if they are not contiguous +		 */ +		if (last_end == disk_start && stripe->dev->bdev && +		    test_bit(BIO_UPTODATE, &last->bi_flags) && +		    last->bi_bdev == stripe->dev->bdev) { +			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); +			if (ret == PAGE_CACHE_SIZE) +				return 0; +		} +	} + +	/* put a new bio on the list */ +	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); +	if (!bio) +		return -ENOMEM; + +	bio->bi_size = 0; +	bio->bi_bdev = stripe->dev->bdev; +	bio->bi_sector = disk_start >> 9; +	set_bit(BIO_UPTODATE, &bio->bi_flags); + +	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); +	bio_list_add(bio_list, bio); +	return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk.  This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction.  The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ +	if (rbio->faila >= 0 || rbio->failb >= 0) { +		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); +		__raid56_parity_recover(rbio); +	} else { +		finish_rmw(rbio); +	} +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ +	int index; +	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); +	index += page; +	return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result.  This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. 
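
The merge test buried in rbio_add_io_page() above reduces to: same device, and the new page must start at the exact byte where the tail bio ends (the code also re-checks the tail bio's BIO_UPTODATE flag). Pulled out for clarity, with the 512-byte sector shift made explicit (illustrative helper, not kernel API):

static int bio_can_take_page(struct bio *last, struct block_device *bdev,
			     u64 disk_start)
{
	/* byte offset just past the end of the tail bio */
	u64 last_end = ((u64)last->bi_sector << 9) + last->bi_size;

	return last_end == disk_start && last->bi_bdev == bdev;
}
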
+ * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ +	struct bio *bio; +	u64 start; +	unsigned long stripe_offset; +	unsigned long page_index; +	struct page *p; +	int i; + +	spin_lock_irq(&rbio->bio_list_lock); +	bio_list_for_each(bio, &rbio->bio_list) { +		start = (u64)bio->bi_sector << 9; +		stripe_offset = start - rbio->raid_map[0]; +		page_index = stripe_offset >> PAGE_CACHE_SHIFT; + +		for (i = 0; i < bio->bi_vcnt; i++) { +			p = bio->bi_io_vec[i].bv_page; +			rbio->bio_pages[page_index + i] = p; +		} +	} +	spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations.  We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_bio *bbio = rbio->bbio; +	void *pointers[bbio->num_stripes]; +	int stripe_len = rbio->stripe_len; +	int nr_data = rbio->nr_data; +	int stripe; +	int pagenr; +	int p_stripe = -1; +	int q_stripe = -1; +	struct bio_list bio_list; +	struct bio *bio; +	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; +	int ret; + +	bio_list_init(&bio_list); + +	if (bbio->num_stripes - rbio->nr_data == 1) { +		p_stripe = bbio->num_stripes - 1; +	} else if (bbio->num_stripes - rbio->nr_data == 2) { +		p_stripe = bbio->num_stripes - 2; +		q_stripe = bbio->num_stripes - 1; +	} else { +		BUG(); +	} + +	/* at this point we either have a full stripe, +	 * or we've read the full stripe from the drive. +	 * recalculate the parity and write the new results. +	 * +	 * We're not allowed to add any new bios to the +	 * bio list here, anyone else that wants to +	 * change this stripe needs to do their own rmw. +	 */ +	spin_lock_irq(&rbio->bio_list_lock); +	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +	spin_unlock_irq(&rbio->bio_list_lock); + +	atomic_set(&rbio->bbio->error, 0); + +	/* +	 * now that we've set rmw_locked, run through the +	 * bio list one last time and map the page pointers +	 * +	 * We don't cache full rbios because we're assuming +	 * the higher layers are unlikely to use this area of +	 * the disk again soon.  If they do use it again, +	 * hopefully they will send another full bio. +	 */ +	index_rbio_pages(rbio); +	if (!rbio_is_full(rbio)) +		cache_rbio_pages(rbio); +	else +		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + +	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +		struct page *p; +		/* first collect one page from each data stripe */ +		for (stripe = 0; stripe < nr_data; stripe++) { +			p = page_in_rbio(rbio, stripe, pagenr, 0); +			pointers[stripe] = kmap(p); +		} + +		/* then add the parity stripe */ +		p = rbio_pstripe_page(rbio, pagenr); +		SetPageUptodate(p); +		pointers[stripe++] = kmap(p); + +		if (q_stripe != -1) { + +			/* +			 * raid6, add the qstripe and call the +			 * library function to fill in our p/q +			 */ +			p = rbio_qstripe_page(rbio, pagenr); +			SetPageUptodate(p); +			pointers[stripe++] = kmap(p); + +			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, +						pointers); +		} else { +			/* raid5 */ +			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); +			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); +		} + + +		for (stripe = 0; stripe < bbio->num_stripes; stripe++) +			kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); +	} + +	/* +	 * time to start writing.  
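
In the parity loop above, raid5 parity is just the byte-wise XOR of the data pages: the first data page is memcpy'd into P and the remaining ones are xored in through run_xor(). Raid6 instead hands every pointer to raid6_call.gen_syndrome(), which fills both P and Q. The raid5 case, shrunk to two data buffers (a sketch):

static void raid5_parity_sketch(const u8 *d0, const u8 *d1, u8 *p, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		p[i] = d0[i] ^ d1[i];	/* lose any one of d0, d1 or p and
					 * it is the XOR of the other two */
}
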
Make bios for everything from the +	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore +	 * everything else. +	 */ +	for (stripe = 0; stripe < bbio->num_stripes; stripe++) { +		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +			struct page *page; +			if (stripe < rbio->nr_data) { +				page = page_in_rbio(rbio, stripe, pagenr, 1); +				if (!page) +					continue; +			} else { +			       page = rbio_stripe_page(rbio, stripe, pagenr); +			} + +			ret = rbio_add_io_page(rbio, &bio_list, +				       page, stripe, pagenr, rbio->stripe_len); +			if (ret) +				goto cleanup; +		} +	} + +	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); +	BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + +	while (1) { +		bio = bio_list_pop(&bio_list); +		if (!bio) +			break; + +		bio->bi_private = rbio; +		bio->bi_end_io = raid_write_end_io; +		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); +		submit_bio(WRITE, bio); +	} +	return; + +cleanup: +	rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio.  Used to figure out which + * stripe has failed.  This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, +			   struct bio *bio) +{ +	u64 physical = bio->bi_sector; +	u64 stripe_start; +	int i; +	struct btrfs_bio_stripe *stripe; + +	physical <<= 9; + +	for (i = 0; i < rbio->bbio->num_stripes; i++) { +		stripe = &rbio->bbio->stripes[i]; +		stripe_start = stripe->physical; +		if (physical >= stripe_start && +		    physical < stripe_start + rbio->stripe_len) { +			return i; +		} +	} +	return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping).  Used to figure out which stripe has + * failed.  This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, +				   struct bio *bio) +{ +	u64 logical = bio->bi_sector; +	u64 stripe_start; +	int i; + +	logical <<= 9; + +	for (i = 0; i < rbio->nr_data; i++) { +		stripe_start = rbio->raid_map[i]; +		if (logical >= stripe_start && +		    logical < stripe_start + rbio->stripe_len) { +			return i; +		} +	} +	return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ +	unsigned long flags; +	int ret = 0; + +	spin_lock_irqsave(&rbio->bio_list_lock, flags); + +	/* we already know this stripe is bad, move on */ +	if (rbio->faila == failed || rbio->failb == failed) +		goto out; + +	if (rbio->faila == -1) { +		/* first failure on this rbio */ +		rbio->faila = failed; +		atomic_inc(&rbio->bbio->error); +	} else if (rbio->failb == -1) { +		/* second failure on this rbio */ +		rbio->failb = failed; +		atomic_inc(&rbio->bbio->error); +	} else { +		ret = -EIO; +	} +out: +	spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + +	return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, +			   struct bio *bio) +{ +	int failed = find_bio_stripe(rbio, bio); + +	if (failed < 0) +		return -EIO; + +	return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate.  
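
Both find_bio_stripe() and find_logical_bio_stripe() above are linear scans for the half-open interval [start, start + stripe_len) containing the bio's first byte; only the address space differs (physical sectors vs logical block numbers). The containment test on its own:

static int addr_in_stripe(u64 addr, u64 stripe_start, u64 stripe_len)
{
	return addr >= stripe_start && addr < stripe_start + stripe_len;
}
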
It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ +	int i; +	struct page *p; + +	for (i = 0; i < bio->bi_vcnt; i++) { +		p = bio->bi_io_vec[i].bv_page; +		SetPageUptodate(p); +	} +} + +/* + * end io for the read phase of the rmw cycle.  All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ +	struct btrfs_raid_bio *rbio = bio->bi_private; + +	if (err) +		fail_bio_stripe(rbio, bio); +	else +		set_bio_pages_uptodate(bio); + +	bio_put(bio); + +	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) +		return; + +	err = 0; +	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) +		goto cleanup; + +	/* +	 * this will normally call finish_rmw to start our write +	 * but if there are any failed stripes we'll reconstruct +	 * from parity first +	 */ +	validate_rbio_for_rmw(rbio); +	return; + +cleanup: + +	rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ +	rbio->work.flags = 0; +	rbio->work.func = rmw_work; + +	btrfs_queue_worker(&rbio->fs_info->rmw_workers, +			   &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ +	rbio->work.flags = 0; +	rbio->work.func = read_rebuild_work; + +	btrfs_queue_worker(&rbio->fs_info->rmw_workers, +			   &rbio->work); +} + +/* + * the stripe must be locked by the caller.  It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ +	int bios_to_read = 0; +	struct btrfs_bio *bbio = rbio->bbio; +	struct bio_list bio_list; +	int ret; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	int pagenr; +	int stripe; +	struct bio *bio; + +	bio_list_init(&bio_list); + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		goto cleanup; + +	index_rbio_pages(rbio); + +	atomic_set(&rbio->bbio->error, 0); +	/* +	 * build a list of bios to read all the missing parts of this +	 * stripe +	 */ +	for (stripe = 0; stripe < rbio->nr_data; stripe++) { +		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +			struct page *page; +			/* +			 * we want to find all the pages missing from +			 * the rbio and read them from the disk.  If +			 * page_in_rbio finds a page in the bio list +			 * we don't need to read it off the stripe. +			 */ +			page = page_in_rbio(rbio, stripe, pagenr, 1); +			if (page) +				continue; + +			page = rbio_stripe_page(rbio, stripe, pagenr); +			/* +			 * the bio cache may have handed us an uptodate +			 * page.  If so, be happy and use it +			 */ +			if (PageUptodate(page)) +				continue; + +			ret = rbio_add_io_page(rbio, &bio_list, page, +				       stripe, pagenr, rbio->stripe_len); +			if (ret) +				goto cleanup; +		} +	} + +	bios_to_read = bio_list_size(&bio_list); +	if (!bios_to_read) { +		/* +		 * this can happen if others have merged with +		 * us, it means there is nothing left to read. +		 * But if there are missing devices it may not be +		 * safe to do the full stripe write yet. +		 */ +		goto finish; +	} + +	/* +	 * the bbio may be freed once we submit the last bio.  
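Two conversions recur throughout the new raid56 code: bi_sector counts 512-byte block-layer sectors (hence the << 9 shifts), and page counts come from a round-up divide by the page size. Shown standalone, with 4KiB pages assumed purely for the example:

	#define EX_SECTOR_SHIFT	9
	#define EX_PAGE_SHIFT	12			/* assumption: 4KiB pages */
	#define EX_PAGE_SIZE	(1ULL << EX_PAGE_SHIFT)

	static unsigned long long ex_sector_to_bytes(unsigned long long sector)
	{
		return sector << EX_SECTOR_SHIFT;	/* 512-byte sectors */
	}

	/* round-up divide, as in the nr_pages computations above */
	static unsigned long long ex_stripe_pages(unsigned long long stripe_len)
	{
		return (stripe_len + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;
	}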
Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_rmw_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return 0;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+
+finish:
+	validate_rbio_for_rmw(rbio);
+	return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = alloc_rbio_parity_pages(rbio);
+	if (ret)
+		return ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		finish_rmw(rbio);
+	return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		async_rmw_stripe(rbio);
+	return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come in to create
+ * a full stripe.  So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+	/* head off into rmw land if we don't have a full stripe */
+	if (!rbio_is_full(rbio))
+		return partial_stripe_write(rbio);
+	return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging callbacks to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list.  When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+	struct blk_plug_cb cb;
+	struct btrfs_fs_info *info;
+	struct list_head rbio_list;
+	struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+						 plug_list);
+	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+						 plug_list);
+	u64 a_sector = ra->bio_list.head->bi_sector;
+	u64 b_sector = rb->bio_list.head->bi_sector;
+
+	if (a_sector < b_sector)
+		return -1;
+	if (a_sector > b_sector)
+		return 1;
+	return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *last = NULL;
+
+	/*
+	 * sort our plug list then try to merge
+	 * everything we can in hopes of creating full
+	 * stripes.
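plug_cmp() above follows the standard three-way comparator contract used by list_sort(); the same shape drives qsort() outside the kernel. A sketch over a hypothetical struct keyed the same way, by the first bio's starting sector:

	#include <stdlib.h>

	struct ex_rbio {			/* hypothetical stand-in */
		unsigned long long start_sector;
	};

	static int ex_rbio_cmp(const void *a, const void *b)
	{
		const struct ex_rbio *ra = a;
		const struct ex_rbio *rb = b;

		if (ra->start_sector < rb->start_sector)
			return -1;
		if (ra->start_sector > rb->start_sector)
			return 1;
		return 0;
	}

	/* usage: qsort(rbios, n, sizeof(*rbios), ex_rbio_cmp); */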
+	 */ +	list_sort(NULL, &plug->rbio_list, plug_cmp); +	while (!list_empty(&plug->rbio_list)) { +		cur = list_entry(plug->rbio_list.next, +				 struct btrfs_raid_bio, plug_list); +		list_del_init(&cur->plug_list); + +		if (rbio_is_full(cur)) { +			/* we have a full stripe, send it down */ +			full_stripe_write(cur); +			continue; +		} +		if (last) { +			if (rbio_can_merge(last, cur)) { +				merge_rbio(last, cur); +				__free_raid_bio(cur); +				continue; + +			} +			__raid56_parity_write(last); +		} +		last = cur; +	} +	if (last) { +		__raid56_parity_write(last); +	} +	kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ +	struct btrfs_plug_cb *plug; +	plug = container_of(work, struct btrfs_plug_cb, work); +	run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ +	struct btrfs_plug_cb *plug; +	plug = container_of(cb, struct btrfs_plug_cb, cb); + +	if (from_schedule) { +		plug->work.flags = 0; +		plug->work.func = unplug_work; +		btrfs_queue_worker(&plug->info->rmw_workers, +				   &plug->work); +		return; +	} +	run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +			struct btrfs_bio *bbio, u64 *raid_map, +			u64 stripe_len) +{ +	struct btrfs_raid_bio *rbio; +	struct btrfs_plug_cb *plug = NULL; +	struct blk_plug_cb *cb; + +	rbio = alloc_rbio(root, bbio, raid_map, stripe_len); +	if (IS_ERR(rbio)) { +		kfree(raid_map); +		kfree(bbio); +		return PTR_ERR(rbio); +	} +	bio_list_add(&rbio->bio_list, bio); +	rbio->bio_list_bytes = bio->bi_size; + +	/* +	 * don't plug on full rbios, just get them out the door +	 * as quickly as we can +	 */ +	if (rbio_is_full(rbio)) +		return full_stripe_write(rbio); + +	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, +			       sizeof(*plug)); +	if (cb) { +		plug = container_of(cb, struct btrfs_plug_cb, cb); +		if (!plug->info) { +			plug->info = root->fs_info; +			INIT_LIST_HEAD(&plug->rbio_list); +		} +		list_add_tail(&rbio->plug_list, &plug->rbio_list); +	} else { +		return __raid56_parity_write(rbio); +	} +	return 0; +} + +/* + * all parity reconstruction happens here.  We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. 
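run_plug() above is a sorted-merge sweep: after list_sort() it keeps one pending rbio, merges neighbours into it while rbio_can_merge() allows, sends full stripes down immediately, and flushes whatever could not be merged. The generic loop shape, with stand-in merge and flush steps:

	struct ex_item {
		long long start, len;
	};

	static int ex_can_merge(const struct ex_item *a, const struct ex_item *b)
	{
		return a->start + a->len == b->start;	/* contiguous ranges */
	}

	static void ex_flush(struct ex_item *it)
	{
		(void)it;				/* submit downward */
	}

	static void ex_sweep(struct ex_item *sorted, int n)
	{
		struct ex_item *last = NULL;
		int i;

		for (i = 0; i < n; i++) {
			struct ex_item *cur = &sorted[i];

			if (last && ex_can_merge(last, cur)) {
				last->len += cur->len;	/* cur is consumed */
				continue;
			}
			if (last)
				ex_flush(last);
			last = cur;
		}
		if (last)
			ex_flush(last);
	}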
+ */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ +	int pagenr, stripe; +	void **pointers; +	int faila = -1, failb = -1; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	struct page *page; +	int err; +	int i; + +	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), +			   GFP_NOFS); +	if (!pointers) { +		err = -ENOMEM; +		goto cleanup_io; +	} + +	faila = rbio->faila; +	failb = rbio->failb; + +	if (rbio->read_rebuild) { +		spin_lock_irq(&rbio->bio_list_lock); +		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +		spin_unlock_irq(&rbio->bio_list_lock); +	} + +	index_rbio_pages(rbio); + +	for (pagenr = 0; pagenr < nr_pages; pagenr++) { +		/* setup our array of pointers with pages +		 * from each stripe +		 */ +		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { +			/* +			 * if we're rebuilding a read, we have to use +			 * pages from the bio list +			 */ +			if (rbio->read_rebuild && +			    (stripe == faila || stripe == failb)) { +				page = page_in_rbio(rbio, stripe, pagenr, 0); +			} else { +				page = rbio_stripe_page(rbio, stripe, pagenr); +			} +			pointers[stripe] = kmap(page); +		} + +		/* all raid6 handling here */ +		if (rbio->raid_map[rbio->bbio->num_stripes - 1] == +		    RAID6_Q_STRIPE) { + +			/* +			 * single failure, rebuild from parity raid5 +			 * style +			 */ +			if (failb < 0) { +				if (faila == rbio->nr_data) { +					/* +					 * Just the P stripe has failed, without +					 * a bad data or Q stripe. +					 * TODO, we should redo the xor here. +					 */ +					err = -EIO; +					goto cleanup; +				} +				/* +				 * a single failure in raid6 is rebuilt +				 * in the pstripe code below +				 */ +				goto pstripe; +			} + +			/* make sure our ps and qs are in order */ +			if (faila > failb) { +				int tmp = failb; +				failb = faila; +				faila = tmp; +			} + +			/* if the q stripe is failed, do a pstripe reconstruction +			 * from the xors. +			 * If both the q stripe and the P stripe are failed, we're +			 * here due to a crc mismatch and we can't give them the +			 * data they want +			 */ +			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { +				if (rbio->raid_map[faila] == RAID5_P_STRIPE) { +					err = -EIO; +					goto cleanup; +				} +				/* +				 * otherwise we have one bad data stripe and +				 * a good P stripe.  raid5! +				 */ +				goto pstripe; +			} + +			if (rbio->raid_map[failb] == RAID5_P_STRIPE) { +				raid6_datap_recov(rbio->bbio->num_stripes, +						  PAGE_SIZE, faila, pointers); +			} else { +				raid6_2data_recov(rbio->bbio->num_stripes, +						  PAGE_SIZE, faila, failb, +						  pointers); +			} +		} else { +			void *p; + +			/* rebuild from P stripe here (raid5 or raid6) */ +			BUG_ON(failb != -1); +pstripe: +			/* Copy parity block into failed block to start with */ +			memcpy(pointers[faila], +			       pointers[rbio->nr_data], +			       PAGE_CACHE_SIZE); + +			/* rearrange the pointer array */ +			p = pointers[faila]; +			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) +				pointers[stripe] = pointers[stripe + 1]; +			pointers[rbio->nr_data - 1] = p; + +			/* xor in the rest */ +			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); +		} +		/* if we're doing this rebuild as part of an rmw, go through +		 * and set all of our private rbio pages in the +		 * failed stripes as uptodate.  This way finish_rmw will +		 * know they can be trusted.  
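The pstripe path above is parity generation run backwards: the missing data block is the XOR of the parity block and every surviving data block. The kernel rotates the pointer array so the existing run_xor() helper can be reused in place; a direct restatement of the arithmetic:

	#include <stddef.h>
	#include <string.h>

	/* rebuild data[faila] from parity plus the surviving data blocks */
	static void raid5_rebuild(unsigned char **data, int nr_data, int faila,
				  const unsigned char *parity, size_t len)
	{
		int stripe;
		size_t i;

		memcpy(data[faila], parity, len);
		for (stripe = 0; stripe < nr_data; stripe++) {
			if (stripe == faila)
				continue;
			for (i = 0; i < len; i++)
				data[faila][i] ^= data[stripe][i];
		}
	}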
If this was a read reconstruction, +		 * other endio functions will fiddle the uptodate bits +		 */ +		if (!rbio->read_rebuild) { +			for (i = 0;  i < nr_pages; i++) { +				if (faila != -1) { +					page = rbio_stripe_page(rbio, faila, i); +					SetPageUptodate(page); +				} +				if (failb != -1) { +					page = rbio_stripe_page(rbio, failb, i); +					SetPageUptodate(page); +				} +			} +		} +		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { +			/* +			 * if we're rebuilding a read, we have to use +			 * pages from the bio list +			 */ +			if (rbio->read_rebuild && +			    (stripe == faila || stripe == failb)) { +				page = page_in_rbio(rbio, stripe, pagenr, 0); +			} else { +				page = rbio_stripe_page(rbio, stripe, pagenr); +			} +			kunmap(page); +		} +	} + +	err = 0; +cleanup: +	kfree(pointers); + +cleanup_io: + +	if (rbio->read_rebuild) { +		if (err == 0) +			cache_rbio_pages(rbio); +		else +			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + +		rbio_orig_end_io(rbio, err, err == 0); +	} else if (err == 0) { +		rbio->faila = -1; +		rbio->failb = -1; +		finish_rmw(rbio); +	} else { +		rbio_orig_end_io(rbio, err, 0); +	} +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ +	struct btrfs_raid_bio *rbio = bio->bi_private; + +	/* +	 * we only read stripe pages off the disk, set them +	 * up to date if there were no errors +	 */ +	if (err) +		fail_bio_stripe(rbio, bio); +	else +		set_bio_pages_uptodate(bio); +	bio_put(bio); + +	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) +		return; + +	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) +		rbio_orig_end_io(rbio, -EIO, 0); +	else +		__raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ +	int bios_to_read = 0; +	struct btrfs_bio *bbio = rbio->bbio; +	struct bio_list bio_list; +	int ret; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	int pagenr; +	int stripe; +	struct bio *bio; + +	bio_list_init(&bio_list); + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		goto cleanup; + +	atomic_set(&rbio->bbio->error, 0); + +	/* +	 * read everything that hasn't failed.  Thanks to the +	 * stripe cache, it is possible that some or all of these +	 * pages are going to be uptodate. +	 */ +	for (stripe = 0; stripe < bbio->num_stripes; stripe++) { +		if (rbio->faila == stripe || +		    rbio->failb == stripe) +			continue; + +		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +			struct page *p; + +			/* +			 * the rmw code may have already read this +			 * page in +			 */ +			p = rbio_stripe_page(rbio, stripe, pagenr); +			if (PageUptodate(p)) +				continue; + +			ret = rbio_add_io_page(rbio, &bio_list, +				       rbio_stripe_page(rbio, stripe, pagenr), +				       stripe, pagenr, rbio->stripe_len); +			if (ret < 0) +				goto cleanup; +		} +	} + +	bios_to_read = bio_list_size(&bio_list); +	if (!bios_to_read) { +		/* +		 * we might have no bios to read just because the pages +		 * were up to date, or we might have no bios to read because +		 * the devices were gone. 
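raid_recover_end_io() above, like raid_rmw_end_io() and raid_write_end_io() before it, uses the last-one-out completion idiom: every finished bio decrements stripes_pending, and only the caller that reaches zero carries on with the rbio. A C11-atomics restatement of what atomic_dec_and_test() provides:

	#include <stdatomic.h>

	static atomic_int ex_stripes_pending;

	/* nonzero only for the completion that drops the count to zero */
	static int ex_last_completion(void)
	{
		/* atomic_fetch_sub returns the value before subtraction */
		return atomic_fetch_sub(&ex_stripes_pending, 1) == 1;
	}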
+		 */
+		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+			__raid_recover_end_io(rbio);
+			goto out;
+		} else {
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_recover_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+out:
+	return 0;
+
+cleanup:
+	if (rbio->read_rebuild)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers.  This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len, int mirror_num)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		return PTR_ERR(rbio);
+	}
+
+	rbio->read_rebuild = 1;
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		BUG();
+		kfree(rbio);
+		return -EIO;
+	}
+
+	/*
+	 * reconstruct from the q stripe if they are
+	 * asking for mirror 3
+	 */
+	if (mirror_num == 3)
+		rbio->failb = bbio->num_stripes - 2;
+
+	ret = lock_stripe_add(rbio);
+
+	/*
+	 * __raid56_parity_recover will end the bio with
+	 * any errors it hits.  We don't want to return
+	 * its error value up the stack because our caller
+	 * will end up calling bio_endio with any nonzero
+	 * return
+	 */
+	if (ret == 0)
+		__raid56_parity_recover(rbio);
+	/*
+	 * our rbio has been added to the list of
+	 * rbios that will be handled after the
+	 * current lock owner is done
+	 */
+	return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	__raid56_parity_recover(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 00000000000..ea5d73bfdfb
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
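The header starting here tags the parity slots of raid_map with sentinel "logical" addresses (RAID5_P_STRIPE and RAID6_Q_STRIPE, defined a few lines below), so parity stripes need no real logical mapping and is_parity_stripe() is a plain compare. An illustrative map for one raid6 stripe across four devices, with invented addresses and a 64KiB stripe_len assumed:

	#define EX_RAID5_P	((unsigned long long)-2)
	#define EX_RAID6_Q	((unsigned long long)-1)

	static unsigned long long ex_raid_map[4] = {
		0x100000,	/* data stripe 0: logical start */
		0x110000,	/* data stripe 1: start + 64KiB stripe_len */
		EX_RAID5_P,	/* P slot */
		EX_RAID6_Q,	/* Q slot */
	};

	static int ex_is_parity(unsigned long long v)
	{
		return v == EX_RAID5_P || v == EX_RAID6_Q;
	}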
+ */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ +	if (map->type & BTRFS_BLOCK_GROUP_RAID5) +		return 1; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID6) +		return 2; +	else +		return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ +	return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\ +			     ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, +				 struct btrfs_bio *bbio, u64 *raid_map, +				 u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +			       struct btrfs_bio *bbio, u64 *raid_map, +			       u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 300e09ac365..50695dc5e2a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,  			}  		} -		page_start = (u64)page->index << PAGE_CACHE_SHIFT; +		page_start = page_offset(page);  		page_end = page_start + PAGE_CACHE_SIZE - 1;  		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); @@ -3472,7 +3472,7 @@ out:  }  /* - * hepler to find all tree blocks that reference a given data extent + * helper to find all tree blocks that reference a given data extent   */  static noinline_for_stack  int add_data_references(struct reloc_control *rc, @@ -3566,7 +3566,7 @@ int add_data_references(struct reloc_control *rc,  }  /* - * hepler to find next unprocessed extent + * helper to find next unprocessed extent   */  static noinline_for_stack  int find_next_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c..53c3501fa4c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@  #include "dev-replace.h"  #include "check-integrity.h"  #include "rcu-string.h" +#include "raid56.h"  /*   * This is only the first step towards a full-features scrub. 
It reads all @@ -580,20 +581,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)  	int corrected = 0;  	struct btrfs_key key;  	struct inode *inode = NULL; +	struct btrfs_fs_info *fs_info;  	u64 end = offset + PAGE_SIZE - 1;  	struct btrfs_root *local_root; +	int srcu_index;  	key.objectid = root;  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; -	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); -	if (IS_ERR(local_root)) + +	fs_info = fixup->root->fs_info; +	srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + +	local_root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(local_root)) { +		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);  		return PTR_ERR(local_root); +	}  	key.type = BTRFS_INODE_ITEM_KEY;  	key.objectid = inum;  	key.offset = 0; -	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); +	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); +	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);  	if (IS_ERR(inode))  		return PTR_ERR(inode); @@ -606,7 +616,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)  	}  	if (PageUptodate(page)) { -		struct btrfs_fs_info *fs_info;  		if (PageDirty(page)) {  			/*  			 * we need to write the data to the defect sector. the @@ -2246,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,  	struct btrfs_device *extent_dev;  	int extent_mirror_num; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) { +		if (num >= nr_data_stripes(map)) { +			return 0; +		} +	} +  	nstripes = length;  	offset = 0;  	do_div(nstripes, map->stripe_len); @@ -2700,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,  	int	ret;  	struct btrfs_root *root = sctx->dev_root; -	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) +	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))  		return -EIO;  	gen = root->fs_info->last_trans_committed; @@ -3180,18 +3196,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)  	u64 physical_for_dev_replace;  	u64 len;  	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; +	int srcu_index;  	key.objectid = root;  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; + +	srcu_index = srcu_read_lock(&fs_info->subvol_srcu); +  	local_root = btrfs_read_fs_root_no_name(fs_info, &key); -	if (IS_ERR(local_root)) +	if (IS_ERR(local_root)) { +		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);  		return PTR_ERR(local_root); +	}  	key.type = BTRFS_INODE_ITEM_KEY;  	key.objectid = inum;  	key.offset = 0;  	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); +	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);  	if (IS_ERR(inode))  		return PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad4..f7a8b861058 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -85,6 +85,7 @@ struct send_ctx {  	u32 send_max_size;  	u64 total_send_size;  	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; +	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */  	struct vfsmount *mnt; @@ -1814,8 +1815,10 @@ static int name_cache_insert(struct send_ctx *sctx,  			(unsigned long)nce->ino);  	if (!nce_head) {  		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); -		if (!nce_head) +		if (!nce_head) { +			kfree(nce);  			return -ENOMEM; +		}  		INIT_LIST_HEAD(nce_head);  		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); @@ -3707,6 +3710,39 @@ out:  	
return ret;  } +/* + * Send an update extent command to user space. + */ +static int send_update_extent(struct send_ctx *sctx, +			      u64 offset, u32 len) +{ +	int ret = 0; +	struct fs_path *p; + +	p = fs_path_alloc(sctx); +	if (!p) +		return -ENOMEM; + +	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); +	if (ret < 0) +		goto out; + +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); +	if (ret < 0) +		goto out; + +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); +	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); +	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + +	ret = send_cmd(sctx); + +tlv_put_failure: +out: +	fs_path_free(sctx, p); +	return ret; +} +  static int send_write_or_clone(struct send_ctx *sctx,  			       struct btrfs_path *path,  			       struct btrfs_key *key, @@ -3742,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,  		goto out;  	} -	if (!clone_root) { +	if (clone_root) { +		ret = send_clone(sctx, offset, len, clone_root); +	} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { +		ret = send_update_extent(sctx, offset, len); +	} else {  		while (pos < len) {  			l = len - pos;  			if (l > BTRFS_SEND_READ_SIZE) @@ -3755,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,  			pos += ret;  		}  		ret = 0; -	} else { -		ret = send_clone(sctx, offset, len, clone_root);  	} -  out:  	return ret;  } @@ -4534,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	struct btrfs_fs_info *fs_info;  	struct btrfs_ioctl_send_args *arg = NULL;  	struct btrfs_key key; -	struct file *filp = NULL;  	struct send_ctx *sctx = NULL;  	u32 i;  	u64 *clone_sources_tmp = NULL; @@ -4542,7 +4578,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root; +	send_root = BTRFS_I(file_inode(mnt_file))->root;  	fs_info = send_root->fs_info;  	arg = memdup_user(arg_, sizeof(*arg)); @@ -4559,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		goto out;  	} +	if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { +		ret = -EINVAL; +		goto out; +	} +  	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);  	if (!sctx) {  		ret = -ENOMEM; @@ -4570,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);  	INIT_LIST_HEAD(&sctx->name_cache_list); +	sctx->flags = arg->flags; +  	sctx->send_filp = fget(arg->send_fd);  	if (IS_ERR(sctx->send_filp)) {  		ret = PTR_ERR(sctx->send_filp); @@ -4671,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		goto out;  out: -	if (filp) -		fput(filp);  	kfree(arg);  	vfree(clone_sources_tmp); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4e..8bb18f7ccaa 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -86,6 +86,7 @@ enum btrfs_send_cmd {  	BTRFS_SEND_C_UTIMES,  	BTRFS_SEND_C_END, +	BTRFS_SEND_C_UPDATE_EXTENT,  	__BTRFS_SEND_C_MAX,  };  #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86..68a29a1ea06 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,13 +41,13 @@  #include <linux/slab.h>  #include <linux/cleancache.h>  #include <linux/ratelimit.h> +#include <linux/btrfs.h>  #include "compat.h"  #include "delayed-inode.h"  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h"  #include "btrfs_inode.h" -#include "ioctl.h"  #include "print-tree.h"  #include "xattr.h"  
#include "volumes.h" @@ -63,8 +63,7 @@  static const struct super_operations btrfs_super_ops;  static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, -				      char nbuf[16]) +static const char *btrfs_decode_error(int errno, char nbuf[16])  {  	char *errstr = NULL; @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)  	 * today we only save the error info into ram.  Long term we'll  	 * also send it down to the disk  	 */ -	fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);  }  static void save_error_info(struct btrfs_fs_info *fs_info) @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)  	if (sb->s_flags & MS_RDONLY)  		return; -	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		sb->s_flags |= MS_RDONLY;  		printk(KERN_INFO "btrfs is forced readonly\n");  		/* @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,  	struct super_block *sb = fs_info->sb;  	char nbuf[16];  	const char *errstr; -	va_list args; -	va_start(args, fmt);  	/*  	 * Special case: if the error is EROFS, and we're already @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,  	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))    		return; -  	errstr = btrfs_decode_error(fs_info, errno, nbuf); +  	errstr = btrfs_decode_error(errno, nbuf);  	if (fmt) { -		struct va_format vaf = { -			.fmt = fmt, -			.va = &args, -		}; +		struct va_format vaf; +		va_list args; + +		va_start(args, fmt); +		vaf.fmt = fmt; +		vaf.va = &args;  		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",  			sb->s_id, function, line, errstr, &vaf); +		va_end(args);  	} else {  		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",  			sb->s_id, function, line, errstr); @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,  		save_error_info(fs_info);  		btrfs_handle_error(fs_info);  	} -	va_end(args);  }  static const char * const logtypes[] = { @@ -261,13 +260,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,  		char nbuf[16];  		const char *errstr; -		errstr = btrfs_decode_error(root->fs_info, errno, nbuf); +		errstr = btrfs_decode_error(errno, nbuf);  		btrfs_printk(root->fs_info,  			     "%s:%d: Aborting unused transaction(%s).\n",  			     function, line, errstr);  		return;  	} -	trans->transaction->aborted = errno; +	ACCESS_ONCE(trans->transaction->aborted) = errno;  	__btrfs_std_error(root->fs_info, function, line, errno, NULL);  }  /* @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,  	va_start(args, fmt);  	vaf.va = &args; -	errstr = btrfs_decode_error(fs_info, errno, nbuf); -	if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) +	errstr = btrfs_decode_error(errno, nbuf); +	if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))  		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",  			s_id, function, line, &vaf, errstr); @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  		case Opt_compress_force:  		case Opt_compress_force_type:  			compress_force = true; +			/* Fallthrough */  		case Opt_compress:  		case Opt_compress_type:  			if (token == Opt_compress || @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char 
*options)
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
+				mutex_lock(&info->chunk_mutex);
 				info->alloc_start = memparse(num, NULL);
+				mutex_unlock(&info->chunk_mutex);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_wait_ordered_extents(root, 0);
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
 		/* no transaction, don't bother */
 		if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 			      new_pool_size);
 }
 
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
+					 unsigned long old_opts, int flags)
+{
+	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (flags & MS_RDONLY))) {
+		/* wait for any defraggers to finish */
+		wait_event(fs_info->transaction_wait,
+			   (atomic_read(&fs_info->defrag_running) == 0));
+		if (flags & MS_RDONLY)
+			sync_filesystem(fs_info->sb);
+	}
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+					 unsigned long old_opts)
+{
+	/*
+	 * We need to clean up all defragable inodes if the autodefragment
+	 * is disabled or the fs is remounted read-only.
+	 */
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (fs_info->sb->s_flags & MS_RDONLY))) {
+		btrfs_cleanup_defrag_inodes(fs_info);
+	}
+
+	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
 	int ret;
 
+	btrfs_remount_prepare(fs_info, old_opts, *flags);
+
 	ret = btrfs_parse_options(root, data);
 	if (ret) {
 		ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		fs_info->thread_pool_size, old_thread_pool_size);
 
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
-		return 0;
+		goto out;
 
 	if (*flags & MS_RDONLY) {
 		/*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		}
 		sb->s_flags &= ~MS_RDONLY;
 	}
-
+out:
+	btrfs_remount_cleanup(fs_info, old_opts);
 	return 0;
 
 restore:
@@ -1289,10 +1326,13 @@ restore:
 	fs_info->mount_opt = old_opts;
 	fs_info->compress_type = old_compress_type;
 	fs_info->max_inline = old_max_inline;
+	mutex_lock(&fs_info->chunk_mutex);
 	fs_info->alloc_start = old_alloc_start;
+	mutex_unlock(&fs_info->chunk_mutex);
 	btrfs_resize_thread_pool(fs_info,
 		old_thread_pool_size, fs_info->thread_pool_size);
 	fs_info->metadata_ratio = old_metadata_ratio;
+	btrfs_remount_cleanup(fs_info, old_opts);
 	return ret;
 }
 
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
 		/* no transaction, don't bother */
 		if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_delayed_inode;
 
-	err = 
btrfs_interface_init(); +	err = btrfs_delayed_ref_init();  	if (err)  		goto free_auto_defrag; +	err = btrfs_interface_init(); +	if (err) +		goto free_delayed_ref; +  	err = register_filesystem(&btrfs_fs_type);  	if (err)  		goto unregister_ioctl; @@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void)  unregister_ioctl:  	btrfs_interface_exit(); +free_delayed_ref: +	btrfs_delayed_ref_exit();  free_auto_defrag:  	btrfs_auto_defrag_exit();  free_delayed_inode: @@ -1720,6 +1766,7 @@ free_compress:  static void __exit exit_btrfs_fs(void)  {  	btrfs_destroy_cachep(); +	btrfs_delayed_ref_exit();  	btrfs_auto_defrag_exit();  	btrfs_delayed_inode_exit();  	ordered_data_exit(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d73..5b326cd60a4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,7 +21,6 @@  #include <linux/spinlock.h>  #include <linux/completion.h>  #include <linux/buffer_head.h> -#include <linux/module.h>  #include <linux/kobject.h>  #include "ctree.h" diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea..e52da6fb116 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)  	if (atomic_dec_and_test(&transaction->use_count)) {  		BUG_ON(!list_empty(&transaction->list));  		WARN_ON(transaction->delayed_refs.root.rb_node); -		memset(transaction, 0, sizeof(*transaction));  		kmem_cache_free(btrfs_transaction_cachep, transaction);  	}  } @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)  	root->commit_root = btrfs_root_node(root);  } +static inline int can_join_transaction(struct btrfs_transaction *trans, +				       int type) +{ +	return !(trans->in_commit && +		 type != TRANS_JOIN && +		 type != TRANS_JOIN_NOLOCK); +} +  /*   * either allocate a new transaction or hop into the existing one   */ @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)  	spin_lock(&fs_info->trans_lock);  loop:  	/* The file system has been taken offline. No new transactions. 
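can_join_transaction() above encodes the commit barrier: once a transaction has entered commit, only TRANS_JOIN and TRANS_JOIN_NOLOCK may still hop in, and everyone else backs off with -EBUSY (handled by the retry loop further down). The predicate restated with invented constants:

	enum ex_trans_type { EX_START, EX_ATTACH, EX_JOIN, EX_JOIN_NOLOCK };

	static int ex_can_join(int in_commit, enum ex_trans_type type)
	{
		/* refuse only when committing and not a JOIN/JOIN_NOLOCK */
		return !(in_commit && type != EX_JOIN && type != EX_JOIN_NOLOCK);
	}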
*/ -	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		spin_unlock(&fs_info->trans_lock);  		return -EROFS;  	} @@ -86,6 +93,10 @@ loop:  			spin_unlock(&fs_info->trans_lock);  			return cur_trans->aborted;  		} +		if (!can_join_transaction(cur_trans, type)) { +			spin_unlock(&fs_info->trans_lock); +			return -EBUSY; +		}  		atomic_inc(&cur_trans->use_count);  		atomic_inc(&cur_trans->num_writers);  		cur_trans->num_joined++; @@ -112,9 +123,8 @@ loop:  		 * to redo the trans_no_join checks above  		 */  		kmem_cache_free(btrfs_transaction_cachep, cur_trans); -		cur_trans = fs_info->running_transaction;  		goto loop; -	} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		spin_unlock(&fs_info->trans_lock);  		kmem_cache_free(btrfs_transaction_cachep, cur_trans);  		return -EROFS; @@ -156,8 +166,12 @@ loop:  	spin_lock_init(&cur_trans->commit_lock);  	spin_lock_init(&cur_trans->delayed_refs.lock); +	atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); +	atomic_set(&cur_trans->delayed_refs.ref_seq, 0); +	init_waitqueue_head(&cur_trans->delayed_refs.wait);  	INIT_LIST_HEAD(&cur_trans->pending_snapshots); +	INIT_LIST_HEAD(&cur_trans->ordered_operations);  	list_add_tail(&cur_trans->list, &fs_info->trans_list);  	extent_io_tree_init(&cur_trans->dirty_pages,  			     fs_info->btree_inode->i_mapping); @@ -302,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,  	int ret;  	u64 qgroup_reserved = 0; -	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) +	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))  		return ERR_PTR(-EROFS);  	if (current->journal_info) { @@ -333,12 +347,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,  					  &root->fs_info->trans_block_rsv,  					  num_bytes, flush);  		if (ret) -			return ERR_PTR(ret); +			goto reserve_fail;  	}  again:  	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); -	if (!h) -		return ERR_PTR(-ENOMEM); +	if (!h) { +		ret = -ENOMEM; +		goto alloc_fail; +	}  	/*  	 * If we are JOIN_NOLOCK we're already committing a transaction and @@ -358,18 +374,17 @@ again:  	do {  		ret = join_transaction(root, type); -		if (ret == -EBUSY) +		if (ret == -EBUSY) {  			wait_current_trans(root); +			if (unlikely(type == TRANS_ATTACH)) +				ret = -ENOENT; +		}  	} while (ret == -EBUSY);  	if (ret < 0) {  		/* We must get the transaction if we are JOIN_NOLOCK. 
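The do/while above retries joining while a commit is in flight; the new wrinkle is that TRANS_ATTACH callers turn the -EBUSY into -ENOENT after waiting, since the transaction they meant to attach to no longer exists. Skeleton of the loop with stand-in stubs:

	#include <errno.h>

	static int ex_join_once(void)     { return 0; } /* join_transaction() stand-in */
	static void ex_wait_current(void) { }           /* wait_current_trans() stand-in */

	static int ex_join(int attach)
	{
		int ret;

		do {
			ret = ex_join_once();
			if (ret == -EBUSY) {
				ex_wait_current();
				if (attach)
					ret = -ENOENT; /* don't wait for a new transaction */
			}
		} while (ret == -EBUSY);

		return ret;
	}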
*/
 	BUG_ON(type == TRANS_JOIN_NOLOCK);
-
-		if (type < TRANS_JOIN_NOLOCK)
-			sb_end_intwrite(root->fs_info->sb);
-		kmem_cache_free(btrfs_trans_handle_cachep, h);
-		return ERR_PTR(ret);
+		goto join_fail;
 	}
 
 	cur_trans = root->fs_info->running_transaction;
@@ -385,9 +400,10 @@ again:
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
 	h->aborted = 0;
-	h->qgroup_reserved = qgroup_reserved;
+	h->qgroup_reserved = 0;
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
+	h->allocating_chunk = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
@@ -403,6 +419,7 @@ again:
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
+	h->qgroup_reserved = qgroup_reserved;
 
 got_it:
 	btrfs_record_root_in_trans(h, root);
@@ -410,6 +427,19 @@ got_it:
 	if (!current->journal_info && type != TRANS_USERSPACE)
 		current->journal_info = h;
 	return h;
+
+join_fail:
+	if (type < TRANS_JOIN_NOLOCK)
+		sb_end_intwrite(root->fs_info->sb);
+	kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+	if (num_bytes)
+		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+					num_bytes);
+reserve_fail:
+	if (qgroup_reserved)
+		btrfs_qgroup_free(root, qgroup_reserved);
+	return ERR_PTR(ret);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
@@ -441,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function returns -ENOENT, it just means there is no
+ * running transaction.  But it is possible that the inactive transaction
+ * is still in memory, not fully on disk.  If you need to make sure there
+ * is no inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ *     btrfs_attach_transaction_barrier()
+ */
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
 	return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the difference is that this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ +	struct btrfs_trans_handle *trans; + +	trans = start_transaction(root, 0, TRANS_ATTACH, 0); +	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) +		btrfs_wait_for_commit(root, 0); + +	return trans; +} +  /* wait for a transaction commit to be fully complete */  static noinline void wait_for_commit(struct btrfs_root *root,  				    struct btrfs_transaction *commit) @@ -577,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	if (!list_empty(&trans->new_bgs))  		btrfs_create_pending_block_groups(trans, root); -	while (count < 2) { +	while (count < 1) {  		unsigned long cur = trans->delayed_ref_updates;  		trans->delayed_ref_updates = 0;  		if (cur && @@ -589,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		}  		count++;  	} +  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; @@ -634,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		btrfs_run_delayed_iputs(root);  	if (trans->aborted || -	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))  		err = -EIO; -	}  	assert_qgroups_uptodate(trans); -	memset(trans, 0, sizeof(*trans));  	kmem_cache_free(btrfs_trans_handle_cachep, trans);  	return err;  } @@ -686,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  	struct extent_state *cached_state = NULL;  	u64 start = 0;  	u64 end; +	struct blk_plug plug; +	blk_start_plug(&plug);  	while (!find_first_extent_bit(dirty_pages, start, &start, &end,  				      mark, &cached_state)) {  		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -700,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  	}  	if (err)  		werr = err; +	blk_finish_plug(&plug);  	return werr;  } @@ -950,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,  }  /* - * defrag a given btree.  If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged.   
*/ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root)  {  	struct btrfs_fs_info *info = root->fs_info;  	struct btrfs_trans_handle *trans; @@ -967,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)  		if (IS_ERR(trans))  			return PTR_ERR(trans); -		ret = btrfs_defrag_leaves(trans, root, cacheonly); +		ret = btrfs_defrag_leaves(trans, root);  		btrfs_end_transaction(trans, root);  		btrfs_btree_balance_dirty(info->tree_root); @@ -975,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)  		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)  			break; + +		if (btrfs_defrag_cancelled(root->fs_info)) { +			printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); +			ret = -EAGAIN; +			break; +		}  	}  	root->defrag_running = 0;  	return ret; @@ -997,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	struct inode *parent_inode;  	struct btrfs_path *path;  	struct btrfs_dir_item *dir_item; -	struct dentry *parent;  	struct dentry *dentry;  	struct extent_buffer *tmp;  	struct extent_buffer *old; @@ -1012,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	path = btrfs_alloc_path();  	if (!path) {  		ret = pending->error = -ENOMEM; -		goto path_alloc_fail; +		return ret;  	}  	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); @@ -1052,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	rsv = trans->block_rsv;  	trans->block_rsv = &pending->block_rsv; +	trans->bytes_reserved = trans->block_rsv->reserved;  	dentry = pending->dentry; -	parent = dget_parent(dentry); -	parent_inode = parent->d_inode; +	parent_inode = pending->dir;  	parent_root = BTRFS_I(parent_inode)->root;  	record_root_in_trans(trans, parent_root); @@ -1203,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	if (ret)  		btrfs_abort_transaction(trans, root, ret);  fail: -	dput(parent);  	trans->block_rsv = rsv; +	trans->bytes_reserved = 0;  no_free_objectid:  	kfree(new_root_item);  root_item_alloc_fail:  	btrfs_free_path(path); -path_alloc_fail: -	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);  	return ret;  } @@ -1296,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,  struct btrfs_async_commit {  	struct btrfs_trans_handle *newtrans;  	struct btrfs_root *root; -	struct delayed_work work; +	struct work_struct work;  };  static void do_async_commit(struct work_struct *work)  {  	struct btrfs_async_commit *ac = -		container_of(work, struct btrfs_async_commit, work.work); +		container_of(work, struct btrfs_async_commit, work);  	/*  	 * We've got freeze protection passed with the transaction. 
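do_async_commit() above, like the raid56 work handlers earlier in this commit, recovers its wrapping structure from an embedded work member via container_of(). The same pointer arithmetic in portable, runnable C:

	#include <stddef.h>
	#include <stdio.h>

	#define ex_container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct ex_work { int token; };
	struct ex_async { int id; struct ex_work work; };

	int main(void)
	{
		struct ex_async ac = { .id = 42 };
		struct ex_work *w = &ac.work;	/* what gets queued */
		struct ex_async *back = ex_container_of(w, struct ex_async, work);

		printf("%d\n", back->id);	/* prints 42 */
		return 0;
	}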
@@ -1330,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,  	if (!ac)  		return -ENOMEM; -	INIT_DELAYED_WORK(&ac->work, do_async_commit); +	INIT_WORK(&ac->work, do_async_commit);  	ac->root = root;  	ac->newtrans = btrfs_join_transaction(root);  	if (IS_ERR(ac->newtrans)) { @@ -1354,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,  			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],  			1, _THIS_IP_); -	schedule_delayed_work(&ac->work, 0); +	schedule_work(&ac->work);  	/* wait for transaction to start and unblock */  	if (wait_for_unblock) @@ -1374,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, int err)  {  	struct btrfs_transaction *cur_trans = trans->transaction; +	DEFINE_WAIT(wait);  	WARN_ON(trans->use_count > 1); @@ -1382,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,  	spin_lock(&root->fs_info->trans_lock);  	list_del_init(&cur_trans->list);  	if (cur_trans == root->fs_info->running_transaction) { +		root->fs_info->trans_no_join = 1; +		spin_unlock(&root->fs_info->trans_lock); +		wait_event(cur_trans->writer_wait, +			   atomic_read(&cur_trans->num_writers) == 1); + +		spin_lock(&root->fs_info->trans_lock);  		root->fs_info->running_transaction = NULL; -		root->fs_info->trans_no_join = 0;  	}  	spin_unlock(&root->fs_info->trans_lock); @@ -1417,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,  	}  	if (flush_on_commit || snap_pending) { -		btrfs_start_delalloc_inodes(root, 1); +		ret = btrfs_start_delalloc_inodes(root, 1); +		if (ret) +			return ret;  		btrfs_wait_ordered_extents(root, 1);  	} @@ -1439,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,  	 * it here and no for sure that nothing new will be added  	 * to the list  	 */ -	btrfs_run_ordered_operations(root, 1); +	ret = btrfs_run_ordered_operations(trans, root, 1); -	return 0; +	return ret;  }  /* @@ -1462,26 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	int should_grow = 0;  	unsigned long now = get_seconds(); -	ret = btrfs_run_ordered_operations(root, 0); +	ret = btrfs_run_ordered_operations(trans, root, 0);  	if (ret) {  		btrfs_abort_transaction(trans, root, ret); -		goto cleanup_transaction; +		btrfs_end_transaction(trans, root); +		return ret;  	} -	if (cur_trans->aborted) { +	/* Stop the commit early if ->aborted is set */ +	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {  		ret = cur_trans->aborted; -		goto cleanup_transaction; +		btrfs_end_transaction(trans, root); +		return ret;  	}  	/* make a pass through all the delayed refs we have so far  	 * any runnings procs may add more while we are here  	 */  	ret = btrfs_run_delayed_refs(trans, root, 0); -	if (ret) -		goto cleanup_transaction; +	if (ret) { +		btrfs_end_transaction(trans, root); +		return ret; +	}  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; +	if (trans->qgroup_reserved) { +		btrfs_qgroup_free(root, trans->qgroup_reserved); +		trans->qgroup_reserved = 0; +	}  	cur_trans = trans->transaction; @@ -1495,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  		btrfs_create_pending_block_groups(trans, root);  	ret = btrfs_run_delayed_refs(trans, root, 0); -	if (ret) -		goto cleanup_transaction; +	if (ret) { +		btrfs_end_transaction(trans, root); +		return ret; +	}  	spin_lock(&cur_trans->commit_lock);  	if (cur_trans->in_commit) { @@ 
-1574,6 +1660,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	wait_event(cur_trans->writer_wait,  		   atomic_read(&cur_trans->num_writers) == 1); +	/* ->aborted might be set after the previous check, so check it */ +	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { +		ret = cur_trans->aborted; +		goto cleanup_transaction; +	}  	/*  	 * the reloc mutex makes sure that we stop  	 * the balancing code from coming in and moving @@ -1657,6 +1748,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  		goto cleanup_transaction;  	} +	/* +	 * The tasks which save the space cache and inode cache may also +	 * update ->aborted, check it. +	 */ +	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { +		ret = cur_trans->aborted; +		mutex_unlock(&root->fs_info->tree_log_mutex); +		mutex_unlock(&root->fs_info->reloc_mutex); +		goto cleanup_transaction; +	} +  	btrfs_prepare_extent_commit(trans, root);  	cur_trans = root->fs_info->running_transaction; @@ -1744,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  cleanup_transaction:  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; +	if (trans->qgroup_reserved) { +		btrfs_qgroup_free(root, trans->qgroup_reserved); +		trans->qgroup_reserved = 0; +	}  	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");  //	WARN_ON(1);  	if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c28..3c8e0d25c8e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction {  	wait_queue_head_t writer_wait;  	wait_queue_head_t commit_wait;  	struct list_head pending_snapshots; +	struct list_head ordered_operations;  	struct btrfs_delayed_ref_root delayed_refs;  	int aborted;  }; @@ -68,6 +69,7 @@ struct btrfs_trans_handle {  	struct btrfs_block_rsv *orig_rsv;  	short aborted;  	short adding_csums; +	bool allocating_chunk;  	enum btrfs_trans_type type;  	/*  	 * this root is only needed to validate that the root passed to @@ -82,11 +84,13 @@ struct btrfs_trans_handle {  struct btrfs_pending_snapshot {  	struct dentry *dentry; +	struct inode *dir;  	struct btrfs_root *root;  	struct btrfs_root *snap;  	struct btrfs_qgroup_inherit *inherit;  	/* block reservation for the operation */  	struct btrfs_block_rsv block_rsv; +	u64 qgroup_reserved;  	/* extra metadata reseration for relocation */  	int error;  	bool readonly; @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( +					struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);  int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);  int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root);  int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_defrag_root(struct btrfs_root *root);  int btrfs_clean_old_snapshots(struct btrfs_root *root);  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 
3b580ee8ab1..94e05c1f118 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,13 +23,14 @@  #include "transaction.h"  #include "locking.h" -/* defrag all the leaves in a given btree.  If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to   * better reflect disk order   */  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, int cache_only) +			struct btrfs_root *root)  {  	struct btrfs_path *path = NULL;  	struct btrfs_key key; @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  	u64 last_ret = 0;  	u64 min_trans = 0; -	if (cache_only) -		goto out; -  	if (root->fs_info->extent_root == root) {  		/*  		 * there's recursion here right now in the tree locking, @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  	}  	path->keep_locks = 1; -	if (cache_only) -		min_trans = root->defrag_trans_start; -	ret = btrfs_search_forward(root, &key, NULL, path, -				   cache_only, min_trans); +	ret = btrfs_search_forward(root, &key, NULL, path, min_trans);  	if (ret < 0)  		goto out;  	if (ret > 0) { @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  		goto out;  	}  	path->slots[1] = btrfs_header_nritems(path->nodes[1]); -	next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, +	next_key_ret = btrfs_find_next_key(root, path, &key, 1,  					   min_trans);  	ret = btrfs_realloc_node(trans, root,  				 path->nodes[1], 0, -				 cache_only, &last_ret, +				 &last_ret,  				 &root->defrag_progress);  	if (ret) {  		WARN_ON(ret == -EAGAIN); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d..c7ef569eb22 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,  			      struct walk_control *wc, u64 gen)  {  	if (wc->pin) -		btrfs_pin_extent_for_log_replay(wc->trans, -						log->fs_info->extent_root, +		btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,  						eb->start, eb->len);  	if (btrfs_buffer_uptodate(eb, gen, 0)) { @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  				      struct btrfs_key *key)  {  	int found_type; -	u64 mask = root->sectorsize - 1;  	u64 extent_end;  	u64 start = key->offset;  	u64 saved_nbytes; @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  		extent_end = start + btrfs_file_extent_num_bytes(eb, item);  	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		size = btrfs_file_extent_inline_len(eb, item); -		extent_end = (start + size + mask) & ~mask; +		extent_end = ALIGN(start + size, root->sectorsize);  	} else {  		ret = 0;  		goto out; @@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	unsigned long log_transid = 0;  	mutex_lock(&root->log_mutex); +	log_transid = root->log_transid;  	index1 = root->log_transid % 2;  	if (atomic_read(&root->log_commit[index1])) {  		wait_log_commit(trans, root, root->log_transid); @@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	/* bail out if we need to do a full commit */  	if (root->fs_info->last_trans_log_full_commit == trans->transid) {  		ret = -EAGAIN; +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&root->log_mutex);  		goto out;  	} -	log_transid = root->log_transid;  	if 
(log_transid % 2 == 0)  		mark = EXTENT_DIRTY;  	else @@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);  	if (ret) {  		btrfs_abort_transaction(trans, root, ret); +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&root->log_mutex);  		goto out;  	} @@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  		}  		root->fs_info->last_trans_log_full_commit = trans->transid;  		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&log_root_tree->log_mutex);  		ret = -EAGAIN;  		goto out; @@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);  		wait_log_commit(trans, log_root_tree,  				log_root_tree->log_transid); +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&log_root_tree->log_mutex);  		ret = 0;  		goto out; @@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	 */  	if (root->fs_info->last_trans_log_full_commit == trans->transid) {  		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&log_root_tree->log_mutex);  		ret = -EAGAIN;  		goto out_wake_log_root; @@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  				EXTENT_DIRTY | EXTENT_NEW);  	if (ret) {  		btrfs_abort_transaction(trans, root, ret); +		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&log_root_tree->log_mutex);  		goto out_wake_log_root;  	}  	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); +	btrfs_wait_logged_extents(log, log_transid);  	btrfs_set_super_log_root(root->fs_info->super_for_commit,  				log_root_tree->node->start); @@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,  		.process_func = process_one_buffer  	}; -	ret = walk_log_tree(trans, log, &wc); -	BUG_ON(ret); +	if (trans) { +		ret = walk_log_tree(trans, log, &wc); +		BUG_ON(ret); +	}  	while (1) {  		ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,  				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);  	} +	/* +	 * We may have short-circuited the log tree with the full commit logic +	 * and left ordered extents on our list, so clear these out to keep us +	 * from leaking inodes and memory. 
+	 */ +	btrfs_free_logged_extents(log, 0); +	btrfs_free_logged_extents(log, 1); +  	free_extent_buffer(log->node);  	kfree(log);  } @@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,  	path->keep_locks = 1;  	ret = btrfs_search_forward(root, &min_key, &max_key, -				   path, 0, trans->transid); +				   path, trans->transid);  	/*  	 * we didn't find anything from this transaction, see if there @@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,  	struct btrfs_root *log = root->log_root;  	struct btrfs_file_extent_item *fi;  	struct extent_buffer *leaf; +	struct btrfs_ordered_extent *ordered;  	struct list_head ordered_sums;  	struct btrfs_map_token token;  	struct btrfs_key key; -	u64 csum_offset = em->mod_start - em->start; -	u64 csum_len = em->mod_len; +	u64 mod_start = em->mod_start; +	u64 mod_len = em->mod_len; +	u64 csum_offset; +	u64 csum_len;  	u64 extent_offset = em->start - em->orig_start;  	u64 block_len;  	int ret; +	int index = log->log_transid % 2;  	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; +insert:  	INIT_LIST_HEAD(&ordered_sums);  	btrfs_init_map_token(&token);  	key.objectid = btrfs_ino(inode); @@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,  	leaf = path->nodes[0];  	fi = btrfs_item_ptr(leaf, path->slots[0],  			    struct btrfs_file_extent_item); + +	/* +	 * If we are overwriting an inline extent with a real one then we need +	 * to just delete the inline extent as it may not be large enough to +	 * have the entire file_extent_item. +	 */ +	if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == +	    BTRFS_FILE_EXTENT_INLINE) { +		ret = btrfs_del_item(trans, log, path); +		btrfs_release_path(path); +		if (ret) { +			path->really_keep_locks = 0; +			return ret; +		} +		goto insert; +	} +  	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,  					       &token);  	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3357,6 +3394,97 @@ static int log_one_extent(struct btrfs_trans_handle *trans,  	if (skip_csum)  		return 0; +	if (em->compress_type) { +		csum_offset = 0; +		csum_len = block_len; +	} + +	/* +	 * First check and see if our csums are on our outstanding ordered +	 * extents. +	 */ +again: +	spin_lock_irq(&log->log_extents_lock[index]); +	list_for_each_entry(ordered, &log->logged_list[index], log_list) { +		struct btrfs_ordered_sum *sum; + +		if (!mod_len) +			break; + +		if (ordered->inode != inode) +			continue; + +		if (ordered->file_offset + ordered->len <= mod_start || +		    mod_start + mod_len <= ordered->file_offset) +			continue; + +		/* +		 * We are going to copy all the csums on this ordered extent, so +		 * go ahead and adjust mod_start and mod_len in case this +		 * ordered extent has already been logged. +		 */ +		if (ordered->file_offset > mod_start) { +			if (ordered->file_offset + ordered->len >= +			    mod_start + mod_len) +				mod_len = ordered->file_offset - mod_start; +			/* +			 * If we have this case +			 * +			 * |--------- logged extent ---------| +			 *       |----- ordered extent ----| +			 * +			 * Just don't mess with mod_start and mod_len, we'll +			 * just end up logging more csums than we need and it +			 * will be ok. 
+			 */ +		} else { +			if (ordered->file_offset + ordered->len < +			    mod_start + mod_len) { +				mod_len = (mod_start + mod_len) - +					(ordered->file_offset + ordered->len); +				mod_start = ordered->file_offset + +					ordered->len; +			} else { +				mod_len = 0; +			} +		} + +		/* +		 * To keep us from looping for the above case of an ordered +		 * extent that falls inside of the logged extent. +		 */ +		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, +				     &ordered->flags)) +			continue; +		atomic_inc(&ordered->refs); +		spin_unlock_irq(&log->log_extents_lock[index]); +		/* +		 * we've dropped the lock, we must either break or +		 * start over after this. +		 */ + +		wait_event(ordered->wait, ordered->csum_bytes_left == 0); + +		list_for_each_entry(sum, &ordered->list, list) { +			ret = btrfs_csum_file_blocks(trans, log, sum); +			if (ret) { +				btrfs_put_ordered_extent(ordered); +				goto unlocked; +			} +		} +		btrfs_put_ordered_extent(ordered); +		goto again; + +	} +	spin_unlock_irq(&log->log_extents_lock[index]); +unlocked: + +	if (!mod_len || ret) +		return ret; + +	csum_offset = mod_start - em->start; +	csum_len = mod_len; +  	/* block start is already adjusted for the file extent offset. */  	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,  				       em->block_start + csum_offset, @@ -3388,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;  	u64 test_gen;  	int ret = 0; +	int num = 0;  	INIT_LIST_HEAD(&extents); @@ -3396,27 +3525,42 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {  		list_del_init(&em->list); + +		/* +		 * Just an arbitrary number, this can be really CPU intensive +		 * once we start getting a lot of extents, and really once we +		 * have a bunch of extents we just want to commit since it will +		 * be faster. +		 */ +		if (++num > 32768) { +			list_del_init(&tree->modified_extents); +			ret = -EFBIG; +			goto process; +		} +  		if (em->generation <= test_gen)  			continue;  		/* Need a ref to keep it from getting evicted from cache */  		atomic_inc(&em->refs);  		set_bit(EXTENT_FLAG_LOGGING, &em->flags);  		list_add_tail(&em->list, &extents); +		num++;  	}  	list_sort(NULL, &extents, extent_cmp); +process:  	while (!list_empty(&extents)) {  		em = list_entry(extents.next, struct extent_map, list);  		list_del_init(&em->list); -		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);  		/*  		 * If we had an error we just need to delete everybody from our  		 * private list.  		 */  		if (ret) { +			clear_em_logging(tree, em);  			free_extent_map(em);  			continue;  		} @@ -3424,8 +3568,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  		write_unlock(&tree->lock);  		ret = log_one_extent(trans, inode, root, em, path); -		free_extent_map(em);  		write_lock(&tree->lock); +		clear_em_logging(tree, em); +		free_extent_map(em);  	}  	WARN_ON(!list_empty(&extents));  	write_unlock(&tree->lock); @@ -3507,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	mutex_lock(&BTRFS_I(inode)->log_mutex); +	btrfs_get_logged_extents(log, inode); +  	/*  	 * a brute force approach to making sure we get the most uptodate  	 * copies of everything. 
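
The mod_start/mod_len adjustments above are plain interval subtraction: once an ordered extent's csums have been copied, the still-to-log range shrinks by whatever part that extent covered. A compact userspace sketch of the three cases (head overlap, tail overlap, full cover; an ordered extent strictly inside the range is deliberately left alone, at the cost of logging a few extra csums):

#include <stdio.h>
#include <stdint.h>

static void trim_range(uint64_t *mod_start, uint64_t *mod_len,
		       uint64_t off, uint64_t len)
{
	uint64_t mod_end = *mod_start + *mod_len;

	if (off > *mod_start) {
		if (off + len >= mod_end)
			*mod_len = off - *mod_start;	/* tail covered */
		/* else: fully inside, leave the range untouched */
	} else if (off + len < mod_end) {
		*mod_start = off + len;			/* head covered */
		*mod_len = mod_end - (off + len);
	} else {
		*mod_len = 0;				/* fully covered */
	}
}

int main(void)
{
	uint64_t s = 0, l = 16384;

	trim_range(&s, &l, 0, 4096);	/* head overlap */
	printf("%llu +%llu\n", (unsigned long long)s,
	       (unsigned long long)l);	/* prints: 4096 +12288 */
	return 0;
}
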
@@ -3552,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	while (1) {  		ins_nr = 0;  		ret = btrfs_search_forward(root, &min_key, &max_key, -					   path, 0, trans->transid); +					   path, trans->transid);  		if (ret != 0)  			break;  again: @@ -3650,6 +3797,8 @@ log_extents:  	BTRFS_I(inode)->logged_trans = trans->transid;  	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;  out_unlock: +	if (err) +		btrfs_free_logged_extents(log, log->log_transid);  	mutex_unlock(&BTRFS_I(inode)->log_mutex);  	btrfs_free_path(path); @@ -3816,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,  end_trans:  	dput(old_parent);  	if (ret < 0) { -		WARN_ON(ret != -ENOSPC);  		root->fs_info->last_trans_log_full_commit = trans->transid;  		ret = 1;  	} diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db..ddc61cad008 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,7 +5,7 @@   */  #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h>  #include "ulist.h"  /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa7401..35bb2d4ed29 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,6 +25,8 @@  #include <linux/capability.h>  #include <linux/ratelimit.h>  #include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <asm/div64.h>  #include "compat.h"  #include "ctree.h"  #include "extent_map.h" @@ -32,6 +34,7 @@  #include "transaction.h"  #include "print-tree.h"  #include "volumes.h" +#include "raid56.h"  #include "async-thread.h"  #include "check-integrity.h"  #include "rcu-string.h" @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  		new_device->writeable = 0;  		new_device->in_fs_metadata = 0;  		new_device->can_discard = 0; +		spin_lock_init(&new_device->io_lock);  		list_replace_rcu(&device->dev_list, &new_device->dev_list);  		call_rcu(&device->rcu, free_device); @@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  	return ret;  } +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */  int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  			  struct btrfs_fs_devices **fs_devices_ret)  {  	struct btrfs_super_block *disk_super;  	struct block_device *bdev; -	struct buffer_head *bh; -	int ret; +	struct page *page; +	void *p; +	int ret = -EINVAL;  	u64 devid;  	u64 transid;  	u64 total_devices; +	u64 bytenr; +	pgoff_t index; +	/* +	 * we would like to check all the supers, but that would make +	 * a btrfs mount succeed after a mkfs from a different FS. 
+	 * So, we need to add a special mount option to scan for +	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead +	 */ +	bytenr = btrfs_sb_offset(0);  	flags |= FMODE_EXCL;  	mutex_lock(&uuid_mutex); -	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); -	if (ret) + +	bdev = blkdev_get_by_path(path, flags, holder); + +	if (IS_ERR(bdev)) { +		ret = PTR_ERR(bdev);  		goto error; -	disk_super = (struct btrfs_super_block *)bh->b_data; +	} + +	/* make sure our super fits in the device */ +	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) +		goto error_bdev_put; + +	/* make sure our super fits in the page */ +	if (sizeof(*disk_super) > PAGE_CACHE_SIZE) +		goto error_bdev_put; + +	/* make sure our super doesn't straddle pages on disk */ +	index = bytenr >> PAGE_CACHE_SHIFT; +	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) +		goto error_bdev_put; + +	/* pull in the page with our super */ +	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, +				   index, GFP_NOFS); + +	if (IS_ERR_OR_NULL(page)) +		goto error_bdev_put; + +	p = kmap(page); + +	/* align our pointer to the offset of the super block */ +	disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + +	if (btrfs_super_bytenr(disk_super) != bytenr || +	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) +		goto error_unmap; +  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	transid = btrfs_super_generation(disk_super);  	total_devices = btrfs_super_num_devices(disk_super); +  	if (disk_super->label[0]) {  		if (disk_super->label[BTRFS_LABEL_SIZE - 1])  			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; @@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  	} else {  		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);  	} +  	printk(KERN_CONT "devid %llu transid %llu %s\n",  	       (unsigned long long)devid, (unsigned long long)transid, path); +  	ret = device_list_add(path, disk_super, devid, fs_devices_ret);  	if (!ret && fs_devices_ret)  		(*fs_devices_ret)->total_devices = total_devices; -	brelse(bh); + +error_unmap: +	kunmap(page); +	page_cache_release(page); + +error_bdev_put:  	blkdev_put(bdev, flags);  error:  	mutex_unlock(&uuid_mutex); @@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	u64 devid;  	u64 num_devices;  	u8 *dev_uuid; +	unsigned seq;  	int ret = 0;  	bool clear_super = false;  	mutex_lock(&uuid_mutex); -	all_avail = root->fs_info->avail_data_alloc_bits | -		root->fs_info->avail_system_alloc_bits | -		root->fs_info->avail_metadata_alloc_bits; +	do { +		seq = read_seqbegin(&root->fs_info->profiles_lock); + +		all_avail = root->fs_info->avail_data_alloc_bits | +			    root->fs_info->avail_system_alloc_bits | +			    root->fs_info->avail_metadata_alloc_bits; +	} while (read_seqretry(&root->fs_info->profiles_lock, seq));  	num_devices = root->fs_info->fs_devices->num_devices;  	btrfs_dev_replace_lock(&root->fs_info->dev_replace); @@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		goto out;  	} +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && +	    root->fs_info->fs_devices->rw_devices <= 2) { +		printk(KERN_ERR "btrfs: unable to go below two " +		       "devices on raid5\n"); +		ret = -EINVAL; +		goto out; +	} +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && +	    root->fs_info->fs_devices->rw_devices <= 3) { +		printk(KERN_ERR "btrfs: unable to go below three " +		       "devices on raid6\n"); +		ret = -EINVAL; +		goto out; +	} +  	if (strcmp(device_path, 
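
The three new guards in btrfs_scan_one_device() above all follow from reading the super block through the page cache instead of a buffer_head: the super must lie wholly inside one page that lies wholly inside the device. A sketch with assumed constants (4 KiB pages, the 64 KiB mirror-0 offset, a 4 KiB super block):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SZ 4096ULL

static int super_page_ok(uint64_t bytenr, uint64_t dev_size,
			 uint64_t super_size)
{
	uint64_t index = bytenr / PAGE_SZ;

	if (bytenr + PAGE_SZ >= dev_size)		/* fits in the device */
		return 0;
	if (super_size > PAGE_SZ)			/* fits in one page */
		return 0;
	if ((bytenr + super_size - 1) / PAGE_SZ != index) /* no straddling */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", super_page_ok(65536, 1ULL << 30, 4096)); /* 1: ok */
	printf("%d\n", super_page_ok(65536, 69632, 4096)); /* 0: too small */
	return 0;
}
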
"missing") == 0) {  		struct list_head *devices;  		struct btrfs_device *tmp; @@ -1431,7 +1511,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		}  	} else {  		ret = btrfs_get_bdev_and_sb(device_path, -					    FMODE_READ | FMODE_EXCL, +					    FMODE_WRITE | FMODE_EXCL,  					    root->fs_info->bdev_holder, 0,  					    &bdev, &bh);  		if (ret) @@ -1556,7 +1636,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	ret = 0;  	/* Notify udev that device has changed */ -	btrfs_kobject_uevent(bdev, KOBJ_CHANGE); +	if (bdev) +		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);  error_brelse:  	brelse(bh); @@ -2614,7 +2695,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,  	cache = btrfs_lookup_block_group(fs_info, chunk_offset);  	chunk_used = btrfs_block_group_used(&cache->item); -	user_thresh = div_factor_fine(cache->key.offset, bargs->usage); +	if (bargs->usage == 0) +		user_thresh = 1; +	else if (bargs->usage > 100) +		user_thresh = cache->key.offset; +	else +		user_thresh = div_factor_fine(cache->key.offset, +					      bargs->usage); +  	if (chunk_used < user_thresh)  		ret = 0; @@ -2656,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,  		return 0;  	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | -	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) -		factor = 2; -	else -		factor = 1; -	factor = num_stripes / factor; +	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { +		factor = num_stripes / 2; +	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { +		factor = num_stripes - 1; +	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { +		factor = num_stripes - 2; +	} else { +		factor = num_stripes; +	}  	for (i = 0; i < num_stripes; i++) {  		stripe = btrfs_stripe_nr(chunk, i); @@ -2959,6 +3051,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)  	unset_balance_control(fs_info);  	ret = del_balance_item(fs_info->tree_root);  	BUG_ON(ret); + +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);  }  void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -2975,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	int mixed = 0;  	int ret;  	u64 num_devices; +	unsigned seq;  	if (btrfs_fs_closing(fs_info) ||  	    atomic_read(&fs_info->balance_pause_req) || @@ -3017,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);  	else  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | -				BTRFS_BLOCK_GROUP_RAID10); +				BTRFS_BLOCK_GROUP_RAID10 | +				BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6);  	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&  	    (!alloc_profile_is_valid(bctl->data.target, 1) || @@ -3057,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	/* allow to reduce meta or sys integrity only if force set */  	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | -			BTRFS_BLOCK_GROUP_RAID10; -	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && -	     (fs_info->avail_system_alloc_bits & allowed) && -	     !(bctl->sys.target & allowed)) || -	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && -	     (fs_info->avail_metadata_alloc_bits & allowed) && -	     !(bctl->meta.target & allowed))) { -		if (bctl->flags & BTRFS_BALANCE_FORCE) { -			printk(KERN_INFO "btrfs: force reducing metadata " -			       "integrity\n"); -		} else { -			
printk(KERN_ERR "btrfs: balance will reduce metadata " -			       "integrity, use force if you want this\n"); -			ret = -EINVAL; -			goto out; +			BTRFS_BLOCK_GROUP_RAID10 | +			BTRFS_BLOCK_GROUP_RAID5 | +			BTRFS_BLOCK_GROUP_RAID6; +	do { +		seq = read_seqbegin(&fs_info->profiles_lock); + +		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && +		     (fs_info->avail_system_alloc_bits & allowed) && +		     !(bctl->sys.target & allowed)) || +		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && +		     (fs_info->avail_metadata_alloc_bits & allowed) && +		     !(bctl->meta.target & allowed))) { +			if (bctl->flags & BTRFS_BALANCE_FORCE) { +				printk(KERN_INFO "btrfs: force reducing metadata " +				       "integrity\n"); +			} else { +				printk(KERN_ERR "btrfs: balance will reduce metadata " +				       "integrity, use force if you want this\n"); +				ret = -EINVAL; +				goto out; +			}  		} -	} +	} while (read_seqretry(&fs_info->profiles_lock, seq));  	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {  		int num_tolerated_disk_barrier_failures; @@ -3117,29 +3220,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	mutex_lock(&fs_info->balance_mutex);  	atomic_dec(&fs_info->balance_running); -	if (bargs) { -		memset(bargs, 0, sizeof(*bargs)); -		update_ioctl_balance_args(fs_info, 0, bargs); -	} - -	if ((ret && ret != -ECANCELED && ret != -ENOSPC) || -	    balance_need_close(fs_info)) { -		__cancel_balance(fs_info); -	} -  	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {  		fs_info->num_tolerated_disk_barrier_failures =  			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);  	} +	if (bargs) { +		memset(bargs, 0, sizeof(*bargs)); +		update_ioctl_balance_args(fs_info, 0, bargs); +	} +  	wake_up(&fs_info->balance_wait_q);  	return ret;  out:  	if (bctl->flags & BTRFS_BALANCE_RESUME)  		__cancel_balance(fs_info); -	else +	else {  		kfree(bctl); +		atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +	}  	return ret;  } @@ -3156,7 +3256,6 @@ static int balance_kthread(void *data)  		ret = btrfs_balance(fs_info->balance_ctl, NULL);  	} -	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3278,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)  		return 0;  	} -	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));  	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");  	if (IS_ERR(tsk))  		return PTR_ERR(tsk); @@ -3233,6 +3331,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)  	btrfs_balance_sys(leaf, item, &disk_bargs);  	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); +	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); +  	mutex_lock(&fs_info->volume_mutex);  	mutex_lock(&fs_info->balance_mutex); @@ -3492,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)  }  struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { -	{ 2, 1, 0, 4, 2, 2 /* raid10 */ }, -	{ 1, 1, 2, 2, 2, 2 /* raid1 */ }, -	{ 1, 2, 1, 1, 1, 2 /* dup */ }, -	{ 1, 1, 0, 2, 1, 1 /* raid0 */ }, -	{ 1, 1, 0, 1, 1, 1 /* single */ }, +	[BTRFS_RAID_RAID10] = { +		.sub_stripes	= 2, +		.dev_stripes	= 1, +		.devs_max	= 0,	/* 0 == as many as possible */ +		.devs_min	= 4, +		.devs_increment	= 2, +		.ncopies	= 2, +	}, +	[BTRFS_RAID_RAID1] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 2, +		.devs_min	= 2, +		.devs_increment	= 2, +		.ncopies	= 2, +	}, +	[BTRFS_RAID_DUP] 
= { +		.sub_stripes	= 1, +		.dev_stripes	= 2, +		.devs_max	= 1, +		.devs_min	= 1, +		.devs_increment	= 1, +		.ncopies	= 2, +	}, +	[BTRFS_RAID_RAID0] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 0, +		.devs_min	= 2, +		.devs_increment	= 1, +		.ncopies	= 1, +	}, +	[BTRFS_RAID_SINGLE] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 1, +		.devs_min	= 1, +		.devs_increment	= 1, +		.ncopies	= 1, +	}, +	[BTRFS_RAID_RAID5] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 0, +		.devs_min	= 2, +		.devs_increment	= 1, +		.ncopies	= 2, +	}, +	[BTRFS_RAID_RAID6] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 0, +		.devs_min	= 3, +		.devs_increment	= 1, +		.ncopies	= 3, +	},  }; +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ +	/* TODO allow them to set a preferred stripe size */ +	return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ +	u64 features; + +	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) +		return; + +	features = btrfs_super_incompat_flags(info->super_copy); +	if (features & BTRFS_FEATURE_INCOMPAT_RAID56) +		return; + +	features |= BTRFS_FEATURE_INCOMPAT_RAID56; +	btrfs_set_super_incompat_flags(info->super_copy, features); +	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); +} +  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			       struct btrfs_root *extent_root,  			       struct map_lookup **map_ret, @@ -3514,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	struct btrfs_device_info *devices_info = NULL;  	u64 total_avail;  	int num_stripes;	/* total number of stripes to allocate */ +	int data_stripes;	/* number of stripes that count for +				   block group size */  	int sub_stripes;	/* sub_stripes info for map */  	int dev_stripes;	/* stripes per dev */  	int devs_max;		/* max devs to use */ @@ -3525,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	u64 max_chunk_size;  	u64 stripe_size;  	u64 num_bytes; +	u64 raid_stripe_len = BTRFS_STRIPE_LEN;  	int ndevs;  	int i;  	int j; @@ -3619,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)  			continue; +		if (ndevs == fs_devices->rw_devices) { +			WARN(1, "%s: found more than %llu devices\n", +			     __func__, fs_devices->rw_devices); +			break; +		}  		devices_info[ndevs].dev_offset = dev_offset;  		devices_info[ndevs].max_avail = max_avail;  		devices_info[ndevs].total_avail = total_avail;  		devices_info[ndevs].dev = device;  		++ndevs; -		WARN_ON(ndevs > fs_devices->rw_devices);  	}  	/* @@ -3650,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	stripe_size = devices_info[ndevs-1].max_avail;  	num_stripes = ndevs * dev_stripes; -	if (stripe_size * ndevs > max_chunk_size * ncopies) { -		stripe_size = max_chunk_size * ncopies; -		do_div(stripe_size, ndevs); +	/* +	 * this will have to be fixed for RAID1 and RAID10 over +	 * more drives +	 */ +	data_stripes = num_stripes / ncopies; + +	if (type & BTRFS_BLOCK_GROUP_RAID5) { +		raid_stripe_len = find_raid56_stripe_len(ndevs - 1, +				 btrfs_super_stripesize(info->super_copy)); +		data_stripes = num_stripes - 1; +	} +	if (type & BTRFS_BLOCK_GROUP_RAID6) { +		raid_stripe_len = find_raid56_stripe_len(ndevs - 2, +				 btrfs_super_stripesize(info->super_copy)); +		data_stripes = num_stripes - 2; +	} + +	/* +	 * Use the number of data stripes 
to figure out how big this chunk +	 * is really going to be in terms of logical address space, +	 * and compare that answer with the max chunk size +	 */ +	if (stripe_size * data_stripes > max_chunk_size) { +		u64 mask = (1ULL << 24) - 1; +		stripe_size = max_chunk_size; +		do_div(stripe_size, data_stripes); + +		/* bump the answer up to a 16MB boundary */ +		stripe_size = (stripe_size + mask) & ~mask; + +		/* but don't go higher than the limits we found +		 * while searching for free extents +		 */ +		if (stripe_size > devices_info[ndevs-1].max_avail) +			stripe_size = devices_info[ndevs-1].max_avail;  	}  	do_div(stripe_size, dev_stripes);  	/* align to BTRFS_STRIPE_LEN */ -	do_div(stripe_size, BTRFS_STRIPE_LEN); -	stripe_size *= BTRFS_STRIPE_LEN; +	do_div(stripe_size, raid_stripe_len); +	stripe_size *= raid_stripe_len;  	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);  	if (!map) { @@ -3677,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		}  	}  	map->sector_size = extent_root->sectorsize; -	map->stripe_len = BTRFS_STRIPE_LEN; -	map->io_align = BTRFS_STRIPE_LEN; -	map->io_width = BTRFS_STRIPE_LEN; +	map->stripe_len = raid_stripe_len; +	map->io_align = raid_stripe_len; +	map->io_width = raid_stripe_len;  	map->type = type;  	map->sub_stripes = sub_stripes;  	*map_ret = map; -	num_bytes = stripe_size * (num_stripes / ncopies); +	num_bytes = stripe_size * data_stripes;  	*stripe_size_out = stripe_size;  	*num_bytes_out = num_bytes; @@ -3706,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	write_lock(&em_tree->lock);  	ret = add_extent_mapping(em_tree, em);  	write_unlock(&em_tree->lock); -	free_extent_map(em); -	if (ret) -		goto error; - -	ret = btrfs_make_block_group(trans, extent_root, 0, type, -				     BTRFS_FIRST_CHUNK_TREE_OBJECTID, -				     start, num_bytes); -	if (ret) +	if (ret) { +		free_extent_map(em);  		goto error; +	}  	for (i = 0; i < map->num_stripes; ++i) {  		struct btrfs_device *device; @@ -3727,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  				info->chunk_root->root_key.objectid,  				BTRFS_FIRST_CHUNK_TREE_OBJECTID,  				start, dev_offset, stripe_size); -		if (ret) { -			btrfs_abort_transaction(trans, extent_root, ret); -			goto error; -		} +		if (ret) +			goto error_dev_extent; +	} + +	ret = btrfs_make_block_group(trans, extent_root, 0, type, +				     BTRFS_FIRST_CHUNK_TREE_OBJECTID, +				     start, num_bytes); +	if (ret) { +		i = map->num_stripes - 1; +		goto error_dev_extent;  	} +	free_extent_map(em); +	check_raid56_incompat_flag(extent_root->fs_info, type); +  	kfree(devices_info);  	return 0; +error_dev_extent: +	for (; i >= 0; i--) { +		struct btrfs_device *device; +		int err; + +		device = map->stripes[i].dev; +		err = btrfs_free_dev_extent(trans, device, start); +		if (err) { +			btrfs_abort_transaction(trans, extent_root, err); +			break; +		} +	} +	write_lock(&em_tree->lock); +	remove_extent_mapping(em_tree, em); +	write_unlock(&em_tree->lock); + +	/* One for our allocation */ +	free_extent_map(em); +	/* One for the tree reference */ +	free_extent_map(em);  error:  	kfree(map);  	kfree(devices_info); @@ -3875,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	if (ret)  		return ret; -	alloc_profile = BTRFS_BLOCK_GROUP_METADATA | -				fs_info->avail_metadata_alloc_bits; -	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - +	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);  	ret = 
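
Worked example of the sizing branch above, with assumed inputs (a 4-device RAID5, so data_stripes = 3, and a 10 GiB per-type chunk limit): the per-device stripe is the limit divided by the data stripes, bumped up to a 16 MiB boundary, which can nudge the logical chunk slightly past max_chunk_size; the separate clamp against the smallest free dev extent still applies.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_chunk_size = 10ULL << 30;	/* 10 GiB, assumed limit */
	int data_stripes = 4 - 1;		/* 4-device RAID5 */
	uint64_t mask = (1ULL << 24) - 1;	/* 16 MiB boundary */
	uint64_t stripe_size = max_chunk_size / data_stripes;

	stripe_size = (stripe_size + mask) & ~mask;
	printf("stripe_size = %llu MiB, chunk = %llu MiB\n",
	       (unsigned long long)(stripe_size >> 20),
	       (unsigned long long)((stripe_size * data_stripes) >> 20));
	/* prints: stripe_size = 3424 MiB, chunk = 10272 MiB */
	return 0;
}
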
__btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,  				  &stripe_size, chunk_offset, alloc_profile);  	if (ret) @@ -3886,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	sys_chunk_offset = chunk_offset + chunk_size; -	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | -				fs_info->avail_system_alloc_bits; -	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - +	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);  	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,  				  &sys_chunk_size, &sys_stripe_size,  				  sys_chunk_offset, alloc_profile); @@ -4002,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  		ret = map->num_stripes;  	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)  		ret = map->sub_stripes; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID5) +		ret = 2; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID6) +		ret = 3;  	else  		ret = 1;  	free_extent_map(em); @@ -4014,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  	return ret;  } +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, +				    struct btrfs_mapping_tree *map_tree, +				    u64 logical) +{ +	struct extent_map *em; +	struct map_lookup *map; +	struct extent_map_tree *em_tree = &map_tree->map_tree; +	unsigned long len = root->sectorsize; + +	read_lock(&em_tree->lock); +	em = lookup_extent_mapping(em_tree, logical, len); +	read_unlock(&em_tree->lock); +	BUG_ON(!em); + +	BUG_ON(em->start > logical || em->start + em->len < logical); +	map = (struct map_lookup *)em->bdev; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) { +		len = map->stripe_len * nr_data_stripes(map); +	} +	free_extent_map(em); +	return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +			   u64 logical, u64 len, int mirror_num) +{ +	struct extent_map *em; +	struct map_lookup *map; +	struct extent_map_tree *em_tree = &map_tree->map_tree; +	int ret = 0; + +	read_lock(&em_tree->lock); +	em = lookup_extent_mapping(em_tree, logical, len); +	read_unlock(&em_tree->lock); +	BUG_ON(!em); + +	BUG_ON(em->start > logical || em->start + em->len < logical); +	map = (struct map_lookup *)em->bdev; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) +		ret = 1; +	free_extent_map(em); +	return ret; +} +  static int find_live_mirror(struct btrfs_fs_info *fs_info,  			    struct map_lookup *map, int first, int num,  			    int optimal, int dev_replace_is_ongoing) @@ -4051,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,  	return optimal;  } +static inline int parity_smaller(u64 a, u64 b) +{ +	return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ +	struct btrfs_bio_stripe s; +	int i; +	u64 l; +	int again = 1; + +	while (again) { +		again = 0; +		for (i = 0; i < bbio->num_stripes - 1; i++) { +			if (parity_smaller(raid_map[i], raid_map[i+1])) { +				s = bbio->stripes[i]; +				l = raid_map[i]; +				bbio->stripes[i] = bbio->stripes[i+1]; +				raid_map[i] = raid_map[i+1]; +				bbio->stripes[i+1] = s; +				raid_map[i+1] = l; +				again = 1; +			} +		} +	} +} +  static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  			     u64 logical, u64 *length,  			     struct btrfs_bio **bbio_ret, -			     int mirror_num) +			     int mirror_num, u64 **raid_map_ret)  {  	struct extent_map *em;  	struct 
map_lookup *map; @@ -4066,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	u64 stripe_nr;  	u64 stripe_nr_orig;  	u64 stripe_nr_end; +	u64 stripe_len; +	u64 *raid_map = NULL;  	int stripe_index;  	int i;  	int ret = 0; @@ -4077,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	int num_alloc_stripes;  	int patch_the_first_stripe_for_dev_replace = 0;  	u64 physical_to_patch_in_first_stripe = 0; +	u64 raid56_full_stripe_start = (u64)-1;  	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, *length); @@ -4093,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	map = (struct map_lookup *)em->bdev;  	offset = logical - em->start; +	if (mirror_num > map->num_stripes) +		mirror_num = 0; + +	stripe_len = map->stripe_len;  	stripe_nr = offset;  	/*  	 * stripe_nr counts the total number of stripes we have to stride  	 * to get to this block  	 */ -	do_div(stripe_nr, map->stripe_len); +	do_div(stripe_nr, stripe_len); -	stripe_offset = stripe_nr * map->stripe_len; +	stripe_offset = stripe_nr * stripe_len;  	BUG_ON(offset < stripe_offset);  	/* stripe_offset is the offset of this block in its stripe*/  	stripe_offset = offset - stripe_offset; -	if (rw & REQ_DISCARD) +	/* if we're here for raid56, we need to know the stripe aligned start */ +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { +		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); +		raid56_full_stripe_start = offset; + +		/* allow a write of a full stripe, but make sure we don't +		 * allow straddling of stripes +		 */ +		do_div(raid56_full_stripe_start, full_stripe_len); +		raid56_full_stripe_start *= full_stripe_len; +	} + +	if (rw & REQ_DISCARD) { +		/* we don't discard raid56 yet */ +		if (map->type & +		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { +			ret = -EOPNOTSUPP; +			goto out; +		}  		*length = min_t(u64, em->len - offset, *length); -	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { -		/* we limit the length of each bio to what fits in a stripe */ -		*length = min_t(u64, em->len - offset, -				map->stripe_len - stripe_offset); +	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { +		u64 max_len; +		/* For writes to RAID[56], allow a full stripeset across all disks. +		   For other RAID types and for RAID[56] reads, just allow a single +		   stripe (on a single disk). 
*/ +		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && +		    (rw & REQ_WRITE)) { +			max_len = stripe_len * nr_data_stripes(map) - +				(offset - raid56_full_stripe_start); +		} else { +			/* we limit the length of each bio to what fits in a stripe */ +			max_len = stripe_len - stripe_offset; +		} +		*length = min_t(u64, em->len - offset, max_len);  	} else {  		*length = em->len - offset;  	} +	/* This is for when we're called from btrfs_merge_bio_hook() and all +	   it cares about is the length */  	if (!bbio_ret)  		goto out; @@ -4148,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		u64 physical_of_found = 0;  		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, -			     logical, &tmp_length, &tmp_bbio, 0); +			     logical, &tmp_length, &tmp_bbio, 0, NULL);  		if (ret) {  			WARN_ON(tmp_bbio != NULL);  			goto out; @@ -4209,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	num_stripes = 1;  	stripe_index = 0;  	stripe_nr_orig = stripe_nr; -	stripe_nr_end = (offset + *length + map->stripe_len - 1) & -			(~(map->stripe_len - 1)); +	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);  	do_div(stripe_nr_end, map->stripe_len);  	stripe_end_offset = stripe_nr_end * map->stripe_len -  			    (offset + *length); +  	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  		if (rw & REQ_DISCARD)  			num_stripes = min_t(u64, map->num_stripes, @@ -4264,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  					      dev_replace_is_ongoing);  			mirror_num = stripe_index - old_stripe_index + 1;  		} + +	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6)) { +		u64 tmp; + +		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) +		    && raid_map_ret) { +			int i, rot; + +			/* push stripe_nr back to the start of the full stripe */ +			stripe_nr = raid56_full_stripe_start; +			do_div(stripe_nr, stripe_len); + +			stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + +			/* RAID[56] write or recovery. Return all stripes */ +			num_stripes = map->num_stripes; +			max_errors = nr_parity_stripes(map); + +			raid_map = kmalloc(sizeof(u64) * num_stripes, +					   GFP_NOFS); +			if (!raid_map) { +				ret = -ENOMEM; +				goto out; +			} + +			/* Work out the disk rotation on this stripe-set */ +			tmp = stripe_nr; +			rot = do_div(tmp, num_stripes); + +			/* Fill in the logical address of each stripe */ +			tmp = stripe_nr * nr_data_stripes(map); +			for (i = 0; i < nr_data_stripes(map); i++) +				raid_map[(i+rot) % num_stripes] = +					em->start + (tmp + i) * map->stripe_len; + +			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; +			if (map->type & BTRFS_BLOCK_GROUP_RAID6) +				raid_map[(i+rot+1) % num_stripes] = +					RAID6_Q_STRIPE; + +			*length = map->stripe_len; +			stripe_index = 0; +			stripe_offset = 0; +		} else { +			/* +			 * Mirror #0 or #1 means the original data block. +			 * Mirror #2 is RAID5 parity block. +			 * Mirror #3 is RAID6 Q block. 
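
The raid_map fill above rotates the parity position by one device per stripe-set, which is what sort_parity_stripes() later undoes for the bbio ordering (parity always last). A small sketch for a 3-device RAID5, 2 data stripes plus P (RAID5_P_STRIPE is assumed here to be the (u64)-2 sentinel):

#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel value */

int main(void)
{
	int num_stripes = 3, nr_data = 2;
	uint64_t stripe_len = 65536, em_start = 0;
	uint64_t raid_map[3];

	for (uint64_t stripe_nr = 0; stripe_nr < 3; stripe_nr++) {
		int i, rot = stripe_nr % num_stripes;
		uint64_t tmp = stripe_nr * nr_data;

		for (i = 0; i < nr_data; i++)
			raid_map[(i + rot) % num_stripes] =
				em_start + (tmp + i) * stripe_len;
		raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;

		printf("stripe-set %d:", (int)stripe_nr);
		for (i = 0; i < num_stripes; i++)
			if (raid_map[i] == RAID5_P_STRIPE)
				printf("      P");
			else
				printf(" %6llu",
				       (unsigned long long)raid_map[i]);
		printf("\n");
	}
	/* the parity slot advances one disk per stripe-set: 2, 0, 1, ... */
	return 0;
}
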
+			 */ +			stripe_index = do_div(stripe_nr, nr_data_stripes(map)); +			if (mirror_num > 1) +				stripe_index = nr_data_stripes(map) + +						mirror_num - 2; + +			/* We distribute the parity blocks across stripes */ +			tmp = stripe_nr + stripe_index; +			stripe_index = do_div(tmp, map->num_stripes); +		}  	} else {  		/*  		 * after this do_div call, stripe_nr is the number of stripes @@ -4372,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {  		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |  				 BTRFS_BLOCK_GROUP_RAID10 | +				 BTRFS_BLOCK_GROUP_RAID5 |  				 BTRFS_BLOCK_GROUP_DUP)) {  			max_errors = 1; +		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { +			max_errors = 2;  		}  	} @@ -4474,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;  		bbio->mirror_num = map->num_stripes + 1;  	} +	if (raid_map) { +		sort_parity_stripes(bbio, raid_map); +		*raid_map_ret = raid_map; +	}  out:  	if (dev_replace_is_ongoing)  		btrfs_dev_replace_unlock(dev_replace); @@ -4486,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		      struct btrfs_bio **bbio_ret, int mirror_num)  {  	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, -				 mirror_num); +				 mirror_num, NULL);  }  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -4500,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	u64 bytenr;  	u64 length;  	u64 stripe_nr; +	u64 rmap_len;  	int i, j, nr = 0;  	read_lock(&em_tree->lock); @@ -4510,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	map = (struct map_lookup *)em->bdev;  	length = em->len; +	rmap_len = map->stripe_len; +  	if (map->type & BTRFS_BLOCK_GROUP_RAID10)  		do_div(length, map->num_stripes / map->sub_stripes);  	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)  		do_div(length, map->num_stripes); +	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			      BTRFS_BLOCK_GROUP_RAID6)) { +		do_div(length, nr_data_stripes(map)); +		rmap_len = map->stripe_len * nr_data_stripes(map); +	}  	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);  	BUG_ON(!buf); /* -ENOMEM */ @@ -4533,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  			do_div(stripe_nr, map->sub_stripes);  		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  			stripe_nr = stripe_nr * map->num_stripes + i; -		} -		bytenr = chunk_start + stripe_nr * map->stripe_len; +		} /* else if RAID[56], multiply by nr_data_stripes(). 
+		   * Alternatively, just use rmap_len below instead of +		   * map->stripe_len */ + +		bytenr = chunk_start + stripe_nr * rmap_len;  		WARN_ON(nr >= map->num_stripes);  		for (j = 0; j < nr; j++) {  			if (buf[j] == bytenr) @@ -4548,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	*logical = buf;  	*naddrs = nr; -	*stripe_len = map->stripe_len; +	*stripe_len = rmap_len;  	free_extent_map(em);  	return 0; @@ -4622,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)  		bio->bi_bdev = (struct block_device *)  					(unsigned long)bbio->mirror_num;  		/* only send an error to the higher layers if it is -		 * beyond the tolerance of the multi-bio +		 * beyond the tolerance of the btrfs bio  		 */  		if (atomic_read(&bbio->error) > bbio->max_errors) {  			err = -EIO; @@ -4656,13 +5079,18 @@ struct async_sched {   * This will add one bio to the pending list for a device and make sure   * the work struct is scheduled.   */ -static noinline void schedule_bio(struct btrfs_root *root, +noinline void btrfs_schedule_bio(struct btrfs_root *root,  				 struct btrfs_device *device,  				 int rw, struct bio *bio)  {  	int should_queue = 1;  	struct btrfs_pending_bios *pending_bios; +	if (device->missing || !device->bdev) { +		bio_endio(bio, -EIO); +		return; +	} +  	/* don't bother with additional async steps for reads, right now */  	if (!(rw & REQ_WRITE)) {  		bio_get(bio); @@ -4760,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,  #endif  	bio->bi_bdev = dev->bdev;  	if (async) -		schedule_bio(root, dev, rw, bio); +		btrfs_schedule_bio(root, dev, rw, bio);  	else  		btrfsic_submit_bio(rw, bio);  } @@ -4819,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	u64 logical = (u64)bio->bi_sector << 9;  	u64 length = 0;  	u64 map_length; +	u64 *raid_map = NULL;  	int ret;  	int dev_nr = 0;  	int total_devs = 1; @@ -4827,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	length = bio->bi_size;  	map_length = length; -	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, -			      mirror_num); -	if (ret) +	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, +			      mirror_num, &raid_map); +	if (ret) /* -ENOMEM */  		return ret;  	total_devs = bbio->num_stripes; +	bbio->orig_bio = first_bio; +	bbio->private = first_bio->bi_private; +	bbio->end_io = first_bio->bi_end_io; +	atomic_set(&bbio->stripes_pending, bbio->num_stripes); + +	if (raid_map) { +		/* In this case, map_length has been set to the length of +		   a single stripe; not the whole write */ +		if (rw & WRITE) { +			return raid56_parity_write(root, bio, bbio, +						   raid_map, map_length); +		} else { +			return raid56_parity_recover(root, bio, bbio, +						     raid_map, map_length, +						     mirror_num); +		} +	} +  	if (map_length < length) {  		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "  		       "len %llu\n", (unsigned long long)logical, @@ -4841,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		BUG();  	} -	bbio->orig_bio = first_bio; -	bbio->private = first_bio->bi_private; -	bbio->end_io = first_bio->bi_end_io; -	atomic_set(&bbio->stripes_pending, bbio->num_stripes); -  	while (dev_nr < total_devs) {  		dev = bbio->stripes[dev_nr].dev;  		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac75..062d8604d35 100644 --- 
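
Pulling together the btrfs_num_copies() and max_errors hunks above: num_copies counts the distinct ways a read can be served (mirror 1 is the raw data, mirror 2 reconstructs via P, mirror 3 via P and Q), while max_errors is how many device failures a write may tolerate. An illustrative recap:

#include <stdio.h>

struct profile { const char *name; int num_copies; int max_errors; };

int main(void)
{
	struct profile p[] = {
		{ "single", 1, 0 }, { "raid0",  1, 0 }, { "dup",    2, 1 },
		{ "raid1",  2, 1 }, { "raid10", 2, 1 },
		{ "raid5",  2, 1 }, { "raid6",  3, 2 },
	};

	for (unsigned int i = 0; i < sizeof(p) / sizeof(p[0]); i++)
		printf("%-7s copies=%d max_errors=%d\n",
		       p[i].name, p[i].num_copies, p[i].max_errors);
	return 0;
}
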
a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,8 +21,8 @@  #include <linux/bio.h>  #include <linux/sort.h> +#include <linux/btrfs.h>  #include "async-thread.h" -#include "ioctl.h"  #define BTRFS_STRIPE_LEN	(64 * 1024) @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,  void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,  					      struct btrfs_device *tgtdev);  int btrfs_scratch_superblock(struct btrfs_device *device); - +void btrfs_schedule_bio(struct btrfs_root *root, +			struct btrfs_device *device, +			int rw, struct bio *bio); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +			   u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, +				    struct btrfs_mapping_tree *map_tree, +				    u64 logical);  static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,  				      int index)  { diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb90..b4dcb34c963 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@  #include <linux/bitops.h>  #include <linux/mpage.h>  #include <linux/bit_spinlock.h> +#include <trace/events/block.h>  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)  }  EXPORT_SYMBOL(init_buffer); +inline void touch_buffer(struct buffer_head *bh) +{ +	trace_block_touch_buffer(bh); +	mark_page_accessed(bh->b_page); +} +EXPORT_SYMBOL(touch_buffer); +  static int sleep_on_buffer(void *word)  {  	io_schedule(); @@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh)  {  	WARN_ON_ONCE(!buffer_uptodate(bh)); +	trace_block_dirty_buffer(bh); +  	/*  	 * Very *carefully* optimize the it-is-already-dirty case.  	 * @@ -2332,7 +2342,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  			 get_block_t get_block)  {  	struct page *page = vmf->page; -	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	unsigned long end;  	loff_t size;  	int ret; @@ -2359,7 +2369,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  	if (unlikely(ret < 0))  		goto out_unlock;  	set_page_dirty(page); -	wait_on_page_writeback(page); +	wait_for_stable_page(page);  	return 0;  out_unlock:  	unlock_page(page); @@ -2371,7 +2381,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,  		   get_block_t get_block)  {  	int ret; -	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; +	struct super_block *sb = file_inode(vma->vm_file)->i_sb;  	sb_start_pagefault(sb); @@ -2935,6 +2945,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)  		void *kaddr = kmap_atomic(bh->b_page);  		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);  		kunmap_atomic(kaddr); +		flush_dcache_page(bh->b_page);  	}  } @@ -3226,7 +3237,7 @@ static struct kmem_cache *bh_cachep __read_mostly;   * Once the number of bh's in the machine exceeds this level, we start   * stripping them in writeback.   
*/ -static int max_buffer_heads; +static unsigned long max_buffer_heads;  int buffer_heads_over_limit; @@ -3342,7 +3353,7 @@ EXPORT_SYMBOL(bh_submit_read);  void __init buffer_init(void)  { -	int nrpages; +	unsigned long nrpages;  	bh_cachep = kmem_cache_create("buffer_head",  			sizeof(struct buffer_head), 0, diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 67bef6d0148..746ce532e13 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(  	_enter("{%s},%p,", cache->cache.identifier, cookie); -	lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); +	lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);  	if (!lookup_data)  		goto nomem_lookup_data;  	/* create a new object record and a temporary leaf image */ -	object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); +	object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);  	if (!object)  		goto nomem_object; @@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(  	 * - stick the length on the front and leave space on the back for the  	 *   encoder  	 */ -	buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); +	buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);  	if (!buffer)  		goto nomem_buffer; @@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)  		return;  	} -	auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); +	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);  	if (!auxdata) {  		_leave(" [nomem]");  		return; @@ -441,6 +441,54 @@ truncate_failed:  }  /* + * Invalidate an object + */ +static void cachefiles_invalidate_object(struct fscache_operation *op) +{ +	struct cachefiles_object *object; +	struct cachefiles_cache *cache; +	const struct cred *saved_cred; +	struct path path; +	uint64_t ni_size; +	int ret; + +	object = container_of(op->object, struct cachefiles_object, fscache); +	cache = container_of(object->fscache.cache, +			     struct cachefiles_cache, cache); + +	op->object->cookie->def->get_attr(op->object->cookie->netfs_data, +					  &ni_size); + +	_enter("{OBJ%x},[%llu]", +	       op->object->debug_id, (unsigned long long)ni_size); + +	if (object->backer) { +		ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + +		fscache_set_store_limit(&object->fscache, ni_size); + +		path.dentry = object->backer; +		path.mnt = cache->mnt; + +		cachefiles_begin_secure(cache, &saved_cred); +		ret = vfs_truncate(&path, 0); +		if (ret == 0) +			ret = vfs_truncate(&path, ni_size); +		cachefiles_end_secure(cache, saved_cred); + +		if (ret != 0) { +			fscache_set_store_limit(&object->fscache, 0); +			if (ret == -EIO) +				cachefiles_io_error_obj(object, +							"Invalidate failed"); +		} +	} + +	fscache_op_complete(op, true); +	_leave(""); +} + +/*   * dissociate a cache from all the pages it was backing   */  static void cachefiles_dissociate_pages(struct fscache_cache *cache) @@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {  	.lookup_complete	= cachefiles_lookup_complete,  	.grab_object		= cachefiles_grab_object,  	.update_object		= cachefiles_update_object, +	.invalidate_object	= cachefiles_invalidate_object,  	.drop_object		= cachefiles_drop_object,  	.put_object		= cachefiles_put_object,  	.sync_cache		= cachefiles_sync_cache, diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index bd6bc1bde2d..49382519907 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -23,6 +23,8 @@ extern unsigned cachefiles_debug; 
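
The fs/buffer.c hunk above widens max_buffer_heads from int to unsigned long because the value is derived from the machine's page count, and the multiplication overflows 32 bits long before memory sizes do. A back-of-the-envelope sketch with assumed figures (4 KiB pages, a roughly 104-byte buffer_head, 4 TiB of RAM, and the roughly 10% cap buffer_init() applies):

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

int main(void)
{
	uint64_t totalram = 4ULL << 40;			/* 4 TiB, assumed */
	uint64_t nrpages = (totalram / 4096) * 10 / 100;
	uint64_t per_page = 4096 / 104;			/* ~39 bh per page */
	uint64_t max_bh = nrpages * per_page;

	printf("max_buffer_heads = %llu (INT_MAX = %d)\n",
	       (unsigned long long)max_bh, INT_MAX);
	/* 4187593098 > INT_MAX: an int would have wrapped negative */
	return 0;
}
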
 #define CACHEFILES_DEBUG_KLEAVE	2  #define CACHEFILES_DEBUG_KDEBUG	4 +#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +  /*   * node records   */ diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index 81b8b2b3a67..33b58c60f2d 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)  	_debug("max: %d", max); -	key = kmalloc(max, GFP_KERNEL); +	key = kmalloc(max, cachefiles_gfp);  	if (!key)  		return NULL; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index b0b5f7cdfff..8c01c5fcdf7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,  	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",  	       prefix, fscache_object_states[object->fscache.state],  	       object->fscache.flags, work_busy(&object->fscache.work), -	       object->fscache.events, -	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); +	       object->fscache.events, object->fscache.event_mask);  	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",  	       prefix, object->fscache.n_ops, object->fscache.n_in_progress,  	       object->fscache.n_exclusive); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c994691d944..48099225970 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,  	struct page *backpage = monitor->back_page, *backpage2;  	int ret; -	kenter("{ino=%lx},{%lx,%lx}", +	_enter("{ino=%lx},{%lx,%lx}",  	       object->backer->d_inode->i_ino,  	       backpage->index, backpage->flags);  	/* skip if the page was truncated away completely */  	if (backpage->mapping != bmapping) { -		kleave(" = -ENODATA [mapping]"); +		_leave(" = -ENODATA [mapping]");  		return -ENODATA;  	}  	backpage2 = find_get_page(bmapping, backpage->index);  	if (!backpage2) { -		kleave(" = -ENODATA [gone]"); +		_leave(" = -ENODATA [gone]");  		return -ENODATA;  	}  	if (backpage != backpage2) {  		put_page(backpage2); -		kleave(" = -ENODATA [different]"); +		_leave(" = -ENODATA [different]");  		return -ENODATA;  	} @@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,  		if (PageUptodate(backpage))  			goto unlock_discard; -		kdebug("reissue read"); +		_debug("reissue read");  		ret = bmapping->a_ops->readpage(NULL, backpage);  		if (ret < 0)  			goto unlock_discard; @@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,  	}  	/* it'll reappear on the todo list */ -	kleave(" = -EINPROGRESS"); +	_leave(" = -EINPROGRESS");  	return -EINPROGRESS;  unlock_discard: @@ -137,7 +137,7 @@ unlock_discard:  	spin_lock_irq(&object->work_lock);  	list_del(&monitor->op_link);  	spin_unlock_irq(&object->work_lock); -	kleave(" = %d", ret); +	_leave(" = %d", ret);  	return ret;  } @@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)  		_debug("- copy {%lu}", monitor->back_page->index);  	recheck: -		if (PageUptodate(monitor->back_page)) { +		if (test_bit(FSCACHE_COOKIE_INVALIDATING, +			     &object->fscache.cookie->flags)) { +			error = -ESTALE; +		} else if (PageUptodate(monitor->back_page)) {  			copy_highpage(monitor->netfs_page, monitor->back_page); - -			pagevec_add(&pagevec, monitor->netfs_page); -			fscache_mark_pages_cached(monitor->op, &pagevec); +			fscache_mark_page_cached(monitor->op, +						 
monitor->netfs_page);  			error = 0;  		} else if (!PageError(monitor->back_page)) {  			/* the page has probably been truncated */ @@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)  		fscache_end_io(op, monitor->netfs_page, error);  		page_cache_release(monitor->netfs_page); +		fscache_retrieval_complete(op, 1);  		fscache_put_retrieval(op);  		kfree(monitor); @@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,  	_debug("read back %p{%lu,%d}",  	       netpage, netpage->index, page_count(netpage)); -	monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); +	monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);  	if (!monitor)  		goto nomem; @@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,  			goto backing_page_already_present;  		if (!newpage) { -			newpage = page_cache_alloc_cold(bmapping); +			newpage = __page_cache_alloc(cachefiles_gfp | +						     __GFP_COLD);  			if (!newpage)  				goto nomem_monitor;  		}  		ret = add_to_page_cache(newpage, bmapping, -					netpage->index, GFP_KERNEL); +					netpage->index, cachefiles_gfp);  		if (ret == 0)  			goto installed_new_backing_page;  		if (ret != -EEXIST) @@ -335,11 +339,11 @@ backing_page_already_present:  backing_page_already_uptodate:  	_debug("- uptodate"); -	pagevec_add(pagevec, netpage); -	fscache_mark_pages_cached(op, pagevec); +	fscache_mark_page_cached(op, netpage);  	copy_highpage(netpage, backpage);  	fscache_end_io(op, netpage, 0); +	fscache_retrieval_complete(op, 1);  success:  	_debug("success"); @@ -357,10 +361,13 @@ out:  read_error:  	_debug("read error %d", ret); -	if (ret == -ENOMEM) +	if (ret == -ENOMEM) { +		fscache_retrieval_complete(op, 1);  		goto out; +	}  io_error:  	cachefiles_io_error_obj(object, "Page read error on backing file"); +	fscache_retrieval_complete(op, 1);  	ret = -ENOBUFS;  	goto out; @@ -370,6 +377,7 @@ nomem_monitor:  	fscache_put_retrieval(monitor->op);  	kfree(monitor);  nomem: +	fscache_retrieval_complete(op, 1);  	_leave(" = -ENOMEM");  	return -ENOMEM;  } @@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,  	_enter("{%p},{%lx},,,", object, page->index);  	if (!object->backer) -		return -ENOBUFS; +		goto enobufs;  	inode = object->backer->d_inode;  	ASSERT(S_ISREG(inode->i_mode)); @@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,  	/* calculate the shift required to use bmap */  	if (inode->i_sb->s_blocksize > PAGE_SIZE) -		return -ENOBUFS; +		goto enobufs;  	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,  						       &pagevec);  	} else if (cachefiles_has_space(cache, 0, 1) == 0) {  		/* there's space in the cache we can use */ -		pagevec_add(&pagevec, page); -		fscache_mark_pages_cached(op, &pagevec); +		fscache_mark_page_cached(op, page); +		fscache_retrieval_complete(op, 1);  		ret = -ENODATA;  	} else { -		ret = -ENOBUFS; +		goto enobufs;  	}  	_leave(" = %d", ret);  	return ret; + +enobufs: +	fscache_retrieval_complete(op, 1); +	_leave(" = -ENOBUFS"); +	return -ENOBUFS;  }  /* @@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,   */  static int cachefiles_read_backing_file(struct cachefiles_object *object,  					struct fscache_retrieval *op, -					struct list_head *list, -					struct pagevec *mark_pvec) +					struct list_head *list)  {  	struct cachefiles_one_read *monitor = 
NULL;  	struct address_space *bmapping = object->backer->d_inode->i_mapping; @@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,  		       netpage, netpage->index, page_count(netpage));  		if (!monitor) { -			monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); +			monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);  			if (!monitor)  				goto nomem; @@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,  				goto backing_page_already_present;  			if (!newpage) { -				newpage = page_cache_alloc_cold(bmapping); +				newpage = __page_cache_alloc(cachefiles_gfp | +							     __GFP_COLD);  				if (!newpage)  					goto nomem;  			}  			ret = add_to_page_cache(newpage, bmapping, -						netpage->index, GFP_KERNEL); +						netpage->index, cachefiles_gfp);  			if (ret == 0)  				goto installed_new_backing_page;  			if (ret != -EEXIST) @@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,  		_debug("- monitor add");  		ret = add_to_page_cache(netpage, op->mapping, netpage->index, -					GFP_KERNEL); +					cachefiles_gfp);  		if (ret < 0) {  			if (ret == -EEXIST) {  				page_cache_release(netpage); +				fscache_retrieval_complete(op, 1);  				continue;  			}  			goto nomem; @@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,  		_debug("- uptodate");  		ret = add_to_page_cache(netpage, op->mapping, netpage->index, -					GFP_KERNEL); +					cachefiles_gfp);  		if (ret < 0) {  			if (ret == -EEXIST) {  				page_cache_release(netpage); +				fscache_retrieval_complete(op, 1);  				continue;  			}  			goto nomem; @@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,  		page_cache_release(backpage);  		backpage = NULL; -		if (!pagevec_add(mark_pvec, netpage)) -			fscache_mark_pages_cached(op, mark_pvec); +		fscache_mark_page_cached(op, netpage);  		page_cache_get(netpage);  		if (!pagevec_add(&lru_pvec, netpage))  			__pagevec_lru_add_file(&lru_pvec); +		/* the netpage is unlocked and marked up to date here */  		fscache_end_io(op, netpage, 0);  		page_cache_release(netpage);  		netpage = NULL; +		fscache_retrieval_complete(op, 1);  		continue;  	} @@ -661,6 +677,7 @@ out:  	list_for_each_entry_safe(netpage, _n, list, lru) {  		list_del(&netpage->lru);  		page_cache_release(netpage); +		fscache_retrieval_complete(op, 1);  	}  	_leave(" = %d", ret); @@ -669,15 +686,17 @@ out:  nomem:  	_debug("nomem");  	ret = -ENOMEM; -	goto out; +	goto record_page_complete;  read_error:  	_debug("read error %d", ret);  	if (ret == -ENOMEM) -		goto out; +		goto record_page_complete;  io_error:  	cachefiles_io_error_obj(object, "Page read error on backing file");  	ret = -ENOBUFS; +record_page_complete: +	fscache_retrieval_complete(op, 1);  	goto out;  } @@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,  	       *nr_pages);  	if (!object->backer) -		return -ENOBUFS; +		goto all_enobufs;  	space = 1;  	if (cachefiles_has_space(cache, 0, *nr_pages) < 0) @@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,  	/* calculate the shift required to use bmap */  	if (inode->i_sb->s_blocksize > PAGE_SIZE) -		return -ENOBUFS; +		goto all_enobufs;  	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,  			nrbackpages++;  		} else if (space && pagevec_add(&pagevec, page) == 0) {  			
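			/* Two idioms recur through all of these rdwr.c hunks.
			 * First, every page covered by a retrieval op must now
			 * be reported exactly once via
			 * fscache_retrieval_complete(), whether it was read,
			 * merely marked cached, or failed with -ENODATA,
			 * -ENOBUFS or -ENOMEM (hence the new enobufs and
			 * record_page_complete labels), presumably so an
			 * outstanding-page count on the op can reach zero and
			 * release whoever is waiting on it.  Second, the
			 * GFP_KERNEL allocations move to cachefiles_gfp
			 * (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC):
			 * such allocations may sleep but will neither retry
			 * hard nor dip into the emergency reserves, the
			 * cautious choice for memory allocated on behalf of
			 * I/O that may itself be servicing reclaim. */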
fscache_mark_pages_cached(op, &pagevec); +			fscache_retrieval_complete(op, 1);  			ret = -ENODATA; +		} else { +			fscache_retrieval_complete(op, 1);  		}  	} @@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,  	/* submit the apparently valid pages to the backing fs to be read from  	 * disk */  	if (nrbackpages > 0) { -		ret2 = cachefiles_read_backing_file(object, op, &backpages, -						    &pagevec); +		ret2 = cachefiles_read_backing_file(object, op, &backpages);  		if (ret2 == -ENOMEM || ret2 == -EINTR)  			ret = ret2;  	} -	if (pagevec_count(&pagevec) > 0) -		fscache_mark_pages_cached(op, &pagevec); -  	_leave(" = %d [nr=%u%s]",  	       ret, *nr_pages, list_empty(pages) ? " empty" : "");  	return ret; + +all_enobufs: +	fscache_retrieval_complete(op, *nr_pages); +	return -ENOBUFS;  }  /* @@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,  {  	struct cachefiles_object *object;  	struct cachefiles_cache *cache; -	struct pagevec pagevec;  	int ret;  	object = container_of(op->op.object, @@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,  	_enter("%p,{%lx},", object, page->index);  	ret = cachefiles_has_space(cache, 0, 1); -	if (ret == 0) { -		pagevec_init(&pagevec, 0); -		pagevec_add(&pagevec, page); -		fscache_mark_pages_cached(op, &pagevec); -	} else { +	if (ret == 0) +		fscache_mark_page_cached(op, page); +	else  		ret = -ENOBUFS; -	} +	fscache_retrieval_complete(op, 1);  	_leave(" = %d", ret);  	return ret;  } @@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,  		ret = -ENOBUFS;  	} +	fscache_retrieval_complete(op, *nr_pages);  	_leave(" = %d", ret);  	return ret;  } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index e18b183b47e..73b46288b54 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,  	ASSERT(dentry);  	ASSERT(dentry->d_inode); -	auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); +	auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);  	if (!auxbuf) {  		_leave(" = -ENOMEM");  		return -ENOMEM; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134ea6eb..49bc78243db 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,6 +1,6 @@  config CEPH_FS -        tristate "Ceph distributed file system (EXPERIMENTAL)" -	depends on INET && EXPERIMENTAL +	tristate "Ceph distributed file system" +	depends on INET  	select CEPH_LIB  	select LIBCRC32C  	select CRYPTO_AES diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6690269f5dd..a60ea977af6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -195,7 +195,7 @@ static int ceph_releasepage(struct page *page, gfp_t g)   */  static int readpage_nounlock(struct file *filp, struct page *page)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =   		&ceph_inode_to_client(inode)->client->osdc; @@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)  static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)  {  	struct inode *inode = req->r_inode; -	struct ceph_osd_reply_head *replyhead; -	int rc, bytes; +	int rc = req->r_result; +	int bytes = le32_to_cpu(msg->hdr.data_len);  	int i; -	/* parse reply */ -	replyhead = msg->front.iov_base; -	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); -	rc = 
le32_to_cpu(replyhead->result); -	bytes = le32_to_cpu(msg->hdr.data_len); -  	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);  	/* unlock all pages, zeroing any data we didn't read */ @@ -267,6 +261,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)  	kfree(req->r_pages);  } +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ +	int i; + +	for (i = 0; i < num_pages; i++) +		unlock_page(pages[i]); +} +  /*   * start an async read(ahead) operation.  return nr_pages we submitted   * a read for on success, or negative error code. @@ -307,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)  				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,  				    NULL, 0,  				    ci->i_truncate_seq, ci->i_truncate_size, -				    NULL, false, 1, 0); +				    NULL, false, 0);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -347,6 +349,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)  	return nr_pages;  out_pages: +	ceph_unlock_page_vector(pages, nr_pages);  	ceph_release_page_vector(pages, nr_pages);  out:  	ceph_osdc_put_request(req); @@ -361,7 +364,7 @@ out:  static int ceph_readpages(struct file *file, struct address_space *mapping,  			  struct list_head *page_list, unsigned nr_pages)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	int rc = 0;  	int max = 0; @@ -483,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  				   &ci->i_layout, snapc,  				   page_off, len,  				   ci->i_truncate_seq, ci->i_truncate_size, -				   &inode->i_mtime, -				   &page, 1, 0, 0, true); +				   &inode->i_mtime, &page, 1);  	if (err < 0) {  		dout("writepage setting page/mapping error %d %p\n", err, page);  		SetPageError(page); @@ -545,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,  			      struct ceph_msg *msg)  {  	struct inode *inode = req->r_inode; -	struct ceph_osd_reply_head *replyhead; -	struct ceph_osd_op *op;  	struct ceph_inode_info *ci = ceph_inode(inode);  	unsigned wrote;  	struct page *page;  	int i;  	struct ceph_snap_context *snapc = req->r_snapc;  	struct address_space *mapping = inode->i_mapping; -	__s32 rc = -EIO; -	u64 bytes = 0; +	int rc = req->r_result; +	u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	long writeback_stat;  	unsigned issued = ceph_caps_issued(ci); -	/* parse reply */ -	replyhead = msg->front.iov_base; -	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); -	op = (void *)(replyhead + 1); -	rc = le32_to_cpu(replyhead->result); -	bytes = le64_to_cpu(op->extent.length); -  	if (rc >= 0) {  		/*  		 * Assume we wrote the pages we originally sent.  
The @@ -732,8 +725,6 @@ retry:  		struct page *page;  		int want;  		u64 offset, len; -		struct ceph_osd_request_head *reqhead; -		struct ceph_osd_op *op;  		long writeback_stat;  		next = 0; @@ -829,7 +820,7 @@ get_more_pages:  					    snapc, do_sync,  					    ci->i_truncate_seq,  					    ci->i_truncate_size, -					    &inode->i_mtime, true, 1, 0); +					    &inode->i_mtime, true, 0);  				if (IS_ERR(req)) {  					rc = PTR_ERR(req); @@ -897,10 +888,8 @@ get_more_pages:  		/* revise final length, page count */  		req->r_num_pages = locked_pages; -		reqhead = req->r_request->front.iov_base; -		op = (void *)(reqhead + 1); -		op->extent.length = cpu_to_le64(len); -		op->payload_len = cpu_to_le32(len); +		req->r_request_ops[0].extent.length = cpu_to_le64(len); +		req->r_request_ops[0].payload_len = cpu_to_le32(len);  		req->r_request->hdr.data_len = cpu_to_le32(len);  		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); @@ -968,7 +957,7 @@ static int ceph_update_writeable_page(struct file *file,  			    loff_t pos, unsigned len,  			    struct page *page)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	loff_t page_off = pos & PAGE_CACHE_MASK; @@ -1077,24 +1066,52 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,  			    loff_t pos, unsigned len, unsigned flags,  			    struct page **pagep, void **fsdata)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file); +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_file_info *fi = file->private_data;  	struct page *page;  	pgoff_t index = pos >> PAGE_CACHE_SHIFT; -	int r; +	int r, want, got = 0; + +	if (fi->fmode & CEPH_FILE_MODE_LAZY) +		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; +	else +		want = CEPH_CAP_FILE_BUFFER; + +	dout("write_begin %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", +	     inode, ceph_vinop(inode), pos, len, inode->i_size); +	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); +	if (r < 0) +		return r; +	dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n", +	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); +	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { +		ceph_put_cap_refs(ci, got); +		return -EAGAIN; +	}  	do {  		/* get a page */  		page = grab_cache_page_write_begin(mapping, index, 0); -		if (!page) -			return -ENOMEM; -		*pagep = page; +		if (!page) { +			r = -ENOMEM; +			break; +		}  		dout("write_begin file %p inode %p page %p %d~%d\n", file,  		     inode, page, (int)pos, (int)len);  		r = ceph_update_writeable_page(file, pos, len, page); +		if (r) +			page_cache_release(page);  	} while (r == -EAGAIN); +	if (r) { +		ceph_put_cap_refs(ci, got); +	} else { +		*pagep = page; +		*(int *)fsdata = got; +	}  	return r;  } @@ -1107,11 +1124,13 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,  			  loff_t pos, unsigned len, unsigned copied,  			  struct page *page, void *fsdata)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file); +	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_mds_client *mdsc = fsc->mdsc;  	unsigned from = pos & (PAGE_CACHE_SIZE - 1);  	int check_cap = 0; +	int got = (unsigned long)fsdata;  	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,  	     inode, page, (int)pos, (int)copied, (int)len); @@ -1134,6 +1153,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,  	up_read(&mdsc->snap_rwsem);  	page_cache_release(page); +	if (copied > 0) { +		int dirty; +		spin_lock(&ci->i_ceph_lock); +		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); +		spin_unlock(&ci->i_ceph_lock); +		if (dirty) +			__mark_inode_dirty(inode, dirty); +	} + +	dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n", +	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); +	ceph_put_cap_refs(ci, got); +  	if (check_cap)  		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); @@ -1176,7 +1208,7 @@ const struct address_space_operations ceph_aops = {   */  static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  { -	struct inode *inode = vma->vm_file->f_dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct page *page = vmf->page;  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	loff_t off = page_offset(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3251e9cc640..78e2f575247 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,  	if (!ctx) {  		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);  		if (cap) { +			spin_lock(&mdsc->caps_list_lock);  			mdsc->caps_use_count++;  			mdsc->caps_total_count++; +			spin_unlock(&mdsc->caps_list_lock);  		}  		return cap;  	} @@ -609,8 +611,16 @@ retry:  	if (flags & CEPH_CAP_FLAG_AUTH)  		ci->i_auth_cap = cap; -	else if (ci->i_auth_cap == cap) +	else if (ci->i_auth_cap == cap) {  		ci->i_auth_cap = NULL; +		spin_lock(&mdsc->cap_dirty_lock); +		if (!list_empty(&ci->i_dirty_item)) { +			dout(" moving %p to cap_dirty_migrating\n", inode); +			list_move(&ci->i_dirty_item, +				  &mdsc->cap_dirty_migrating); +		} +		spin_unlock(&mdsc->cap_dirty_lock); +	}  	dout("add_cap inode %p (%llx.%llx) cap %p %s 
now %s seq %d mds%d\n",  	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -928,7 +938,7 @@ static int send_cap_msg(struct ceph_mds_session *session,  			u64 size, u64 max_size,  			struct timespec *mtime, struct timespec *atime,  			u64 time_warp_seq, -			uid_t uid, gid_t gid, umode_t mode, +			kuid_t uid, kgid_t gid, umode_t mode,  			u64 xattr_version,  			struct ceph_buffer *xattrs_buf,  			u64 follows) @@ -972,8 +982,8 @@ static int send_cap_msg(struct ceph_mds_session *session,  		ceph_encode_timespec(&fc->atime, atime);  	fc->time_warp_seq = cpu_to_le32(time_warp_seq); -	fc->uid = cpu_to_le32(uid); -	fc->gid = cpu_to_le32(gid); +	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); +	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));  	fc->mode = cpu_to_le32(mode);  	fc->xattr_version = cpu_to_le64(xattr_version); @@ -1079,8 +1089,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,  	struct timespec mtime, atime;  	int wake = 0;  	umode_t mode; -	uid_t uid; -	gid_t gid; +	kuid_t uid; +	kgid_t gid;  	struct ceph_mds_session *session;  	u64 xattr_version = 0;  	struct ceph_buffer *xattr_blob = NULL; @@ -1349,11 +1359,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)  		if (!ci->i_head_snapc)  			ci->i_head_snapc = ceph_get_snap_context(  				ci->i_snap_realm->cached_context); -		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, -			ci->i_head_snapc); +		dout(" inode %p now dirty snapc %p auth cap %p\n", +		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);  		BUG_ON(!list_empty(&ci->i_dirty_item));  		spin_lock(&mdsc->cap_dirty_lock); -		list_add(&ci->i_dirty_item, &mdsc->cap_dirty); +		if (ci->i_auth_cap) +			list_add(&ci->i_dirty_item, &mdsc->cap_dirty); +		else +			list_add(&ci->i_dirty_item, +				 &mdsc->cap_dirty_migrating);  		spin_unlock(&mdsc->cap_dirty_lock);  		if (ci->i_flushing_caps == 0) {  			ihold(inode); @@ -1454,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,  	struct ceph_mds_client *mdsc = fsc->mdsc;  	struct inode *inode = &ci->vfs_inode;  	struct ceph_cap *cap; -	int file_wanted, used; +	int file_wanted, used, cap_used;  	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */  	int issued, implemented, want, retain, revoking, flushing = 0;  	int mds = -1;   /* keep track of how far we've gone through i_caps list @@ -1557,9 +1571,14 @@ retry_locked:  		/* NOTE: no side-effects allowed, until we take s_mutex */ +		cap_used = used; +		if (ci->i_auth_cap && cap != ci->i_auth_cap) +			cap_used &= ~ci->i_auth_cap->issued; +  		revoking = cap->implemented & ~cap->issued; -		dout(" mds%d cap %p issued %s implemented %s revoking %s\n", +		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",  		     cap->mds, cap, ceph_cap_string(cap->issued), +		     ceph_cap_string(cap_used),  		     ceph_cap_string(cap->implemented),  		     ceph_cap_string(revoking)); @@ -1587,7 +1606,7 @@ retry_locked:  		}  		/* completed revocation? going down and there are no caps? 
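   The cap_used computed a few lines up is what makes this test right for
   multi-MDS setups: for a cap other than the auth cap, any bits also
   issued by the auth MDS are masked out of "used" first, so a replica's
   revocation is acknowledged even while those bits remain in use under
   the auth cap.  A minimal bitmask model of the check, where the names
   mirror the diff and ack_revocation() stands in for the goto ack:

	unsigned int cap_used = used;
	if (auth_cap && cap != auth_cap)
		cap_used &= ~auth_issued;
	if (revoking && (revoking & cap_used) == 0)
		ack_revocation();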
*/ -		if (revoking && (revoking & used) == 0) { +		if (revoking && (revoking & cap_used) == 0) {  			dout("completed revocation of %s\n",  			     ceph_cap_string(cap->implemented & ~cap->issued));  			goto ack; @@ -1664,8 +1683,8 @@ ack:  		sent++;  		/* __send_cap drops i_ceph_lock */ -		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, -				      retain, flushing, NULL); +		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, +				      want, retain, flushing, NULL);  		goto retry; /* retake i_ceph_lock and restart our cap scan. */  	} @@ -2353,10 +2372,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {  		inode->i_mode = le32_to_cpu(grant->mode); -		inode->i_uid = le32_to_cpu(grant->uid); -		inode->i_gid = le32_to_cpu(grant->gid); +		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); +		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));  		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, -		     inode->i_uid, inode->i_gid); +		     from_kuid(&init_user_ns, inode->i_uid), +		     from_kgid(&init_user_ns, inode->i_gid));  	}  	if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -2388,7 +2408,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  			    &atime);  	/* max size increase? */ -	if (max_size != ci->i_max_size) { +	if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {  		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);  		ci->i_max_size = max_size;  		if (max_size >= ci->i_wanted_max_size) { @@ -2410,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  		dout("mds wanted %s -> %s\n",  		     ceph_cap_string(le32_to_cpu(grant->wanted)),  		     ceph_cap_string(wanted)); -		grant->wanted = cpu_to_le32(wanted); +		/* imported cap may not have correct mds_wanted */ +		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) +			check_caps = 1;  	}  	cap->seq = seq; @@ -2745,6 +2767,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,  	/* make sure we re-request max_size, if necessary */  	spin_lock(&ci->i_ceph_lock); +	ci->i_wanted_max_size = 0;  /* reset */  	ci->i_requested_max_size = 0;  	spin_unlock(&ci->i_ceph_lock);  } @@ -2813,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,  	     (unsigned)seq); +	if (op == CEPH_CAP_OP_IMPORT) +		ceph_add_cap_releases(mdsc, session); +  	/* lookup ino */  	inode = ceph_find_inode(sb, vino);  	ci = ceph_inode(inode); @@ -2840,8 +2866,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	case CEPH_CAP_OP_IMPORT:  		handle_cap_import(mdsc, inode, h, session,  				  snaptrace, snaptrace_len); -		ceph_check_caps(ceph_inode(inode), 0, session); -		goto done_unlocked;  	}  	/* the rest require a cap */ @@ -2858,6 +2882,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	switch (op) {  	case CEPH_CAP_OP_REVOKE:  	case CEPH_CAP_OP_GRANT: +	case CEPH_CAP_OP_IMPORT:  		handle_cap_grant(inode, h, session, cap, msg->middle);  		goto done_unlocked; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8c1aabe93b6..6d797f46d77 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -238,7 +238,7 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,  static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	struct ceph_file_info *fi = filp->private_data; -	struct inode *inode = 
filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1138,7 +1138,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,  			     loff_t *ppos)  {  	struct ceph_file_info *cf = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	int left;  	const int bufsize = 1024; @@ -1188,7 +1188,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,  static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,  			  int datasync)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct list_head *head = &ci->i_unsafe_dirops;  	struct ceph_mds_request *req; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index ca3ab3f9ca7..16796be53ca 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -81,7 +81,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,  		if (parent_inode) {  			/* nfsd wants connectable */  			*max_len = connected_handle_length; -			type = 255; +			type = FILEID_INVALID;  		} else {  			dout("encode_fh %p\n", dentry);  			fh->ino = ceph_ino(inode); @@ -90,7 +90,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,  		}  	} else {  		*max_len = handle_length; -		type = 255; +		type = FILEID_INVALID;  	}  	if (dentry)  		dput(dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d4dfdcf76d7..bf338d9b67e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,  	err = ceph_mdsc_do_request(mdsc,  				   (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL,  				   req); +	if (err) +		goto out_err; +  	err = ceph_handle_snapdir(req, dentry, err);  	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)  		err = ceph_handle_notrace_create(dir, dentry); @@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,  		err = finish_no_open(file, dn);  	} else {  		dout("atomic_open finish_open on dn %p\n", dn); +		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { +			*opened |= FILE_CREATED; +		}  		err = finish_open(file, dentry, ceph_open, opened);  	} @@ -393,7 +399,7 @@ more:  static ssize_t ceph_sync_read(struct file *file, char __user *data,  			      unsigned len, loff_t *poff, int *checkeof)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct page **pages;  	u64 off = *poff;  	int num_pages, ret; @@ -466,7 +472,7 @@ static void sync_write_commit(struct ceph_osd_request *req,  static ssize_t ceph_sync_write(struct file *file, const char __user *data,  			       size_t left, loff_t *offset)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_osd_request *req; @@ -483,7 +489,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,  	int ret;  	struct timespec mtime = CURRENT_TIME; -	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) +	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)  		return -EROFS;  	dout("sync_write on file %p %lld~%u %s\n", file, *offset, @@ -535,7 +541,7 @@ more:  				    ci->i_snap_realm->cached_context,  				    do_sync,  				    ci->i_truncate_seq, ci->i_truncate_size, -				    &mtime, false, 2, page_align); +				    &mtime, false, page_align);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -637,7 +643,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,  	struct ceph_file_info *fi = filp->private_data;  	loff_t *ppos = &iocb->ki_pos;  	size_t len = iov->iov_len; -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ceph_inode_info *ci = ceph_inode(inode);  	void __user *base = iov->iov_base;  	ssize_t ret; @@ -707,68 +713,58 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,  {  	struct file *file = iocb->ki_filp;  	struct ceph_file_info *fi = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =  		&ceph_sb_to_client(inode->i_sb)->client->osdc;  	loff_t endoff = pos + iov->iov_len; -	int want, got = 0; -	int ret, err; +	int got = 0; +	int ret, err, written;  	if (ceph_snap(inode) != CEPH_NOSNAP)  		return -EROFS;  retry_snap: +	written = 0;  	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))  		return -ENOSPC;  	__ceph_do_pending_vmtruncate(inode); -	dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, -	     inode->i_size); -	if (fi->fmode & CEPH_FILE_MODE_LAZY) -		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; -	else -		want = CEPH_CAP_FILE_BUFFER; -	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); -	if (ret < 0) -		goto out_put; - -	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, -	     ceph_cap_string(got)); - -	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || -	    (iocb->ki_filp->f_flags & O_DIRECT) || -	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) || -	    (fi->flags & CEPH_F_SYNC)) { -		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, -			&iocb->ki_pos); -	} else { -		/* -		 * buffered write; drop Fw early to avoid slow -		 * revocation if we get stuck on balance_dirty_pages -		 */ -		int dirty; - -		spin_lock(&ci->i_ceph_lock); -		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); -		spin_unlock(&ci->i_ceph_lock); -		ceph_put_cap_refs(ci, got); +	/* +	 * try to do a buffered write.  if we don't have sufficient +	 * caps, we'll get -EAGAIN from generic_file_aio_write, or a +	 * short write if we only get caps for some pages. +	 */ +	if (!(iocb->ki_filp->f_flags & O_DIRECT) && +	    !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && +	    !(fi->flags & CEPH_F_SYNC)) {  		ret = generic_file_aio_write(iocb, iov, nr_segs, pos); +		if (ret >= 0) +			written = ret; +  		if ((ret >= 0 || ret == -EIOCBQUEUED) &&  		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)  		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { -			err = vfs_fsync_range(file, pos, pos + ret - 1, 1); +			err = vfs_fsync_range(file, pos, pos + written - 1, 1);  			if (err < 0)  				ret = err;  		} +		if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) +			goto out; +	} -		if (dirty) -			__mark_inode_dirty(inode, dirty); +	dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", +	     inode, ceph_vinop(inode), pos + written, +	     (unsigned)iov->iov_len - written, inode->i_size); +	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); +	if (ret < 0)  		goto out; -	} +	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n", +	     inode, ceph_vinop(inode), pos + written, +	     (unsigned)iov->iov_len - written, ceph_cap_string(got)); +	ret = ceph_sync_write(file, iov->iov_base + written, +			      iov->iov_len - written, &iocb->ki_pos);  	if (ret >= 0) {  		int dirty;  		spin_lock(&ci->i_ceph_lock); @@ -777,13 +773,10 @@ retry_snap:  		if (dirty)  			__mark_inode_dirty(inode, dirty);  	} - -out_put:  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, -	     ceph_cap_string(got)); +	     inode, ceph_vinop(inode), pos + written, +	     (unsigned)iov->iov_len - written, ceph_cap_string(got));  	ceph_put_cap_refs(ci, got); -  out:  	if (ret == -EOLDSNAPC) {  		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ba95eea201b..851814d951c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -612,10 +612,11 @@ static int fill_inode(struct inode *inode,  	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {  		inode->i_mode = le32_to_cpu(info->mode); -		inode->i_uid = le32_to_cpu(info->uid); -		inode->i_gid = le32_to_cpu(info->gid); +		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); +		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));  		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, -		     inode->i_uid, inode->i_gid); +		     from_kuid(&init_user_ns, inode->i_uid), +		     from_kgid(&init_user_ns, inode->i_gid));  	}  	if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -1130,8 +1131,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  					    req->r_request_started);  		dout(" final dn %p\n", dn);  		i++; -	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || -		   req->r_op == CEPH_MDS_OP_MKSNAP) { +	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || +		   req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {  		struct dentry *dn = req->r_dentry;  		/* fill out a snapdir LOOKUPSNAP dentry */ @@ -1195,6 +1196,39 @@ done:  /*   * Prepopulate our cache with readdir results, leases, etc.   
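 * The new readdir_prepopulate_inodes_only() below covers the r_aborted
 * case: once a request has been aborted, the dentries in the reply can
 * no longer be trusted to match the dcache, but the inode metadata and
 * cap state carried in the trace are still valid, so (on one reading of
 * the patch) they are filled in anyway to keep client and MDS cap
 * bookkeeping consistent.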
*/ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, +					   struct ceph_mds_session *session) +{ +	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; +	int i, err = 0; + +	for (i = 0; i < rinfo->dir_nr; i++) { +		struct ceph_vino vino; +		struct inode *in; +		int rc; + +		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); +		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +		in = ceph_get_inode(req->r_dentry->d_sb, vino); +		if (IS_ERR(in)) { +			err = PTR_ERR(in); +			dout("new_inode badness got %d\n", err); +			continue; +		} +		rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, +				req->r_request_started, -1, +				&req->r_caps_reservation); +		if (rc < 0) { +			pr_err("fill_inode badness on %p got %d\n", in, rc); +			err = rc; +			continue; +		} +	} + +	return err; +} +  int ceph_readdir_prepopulate(struct ceph_mds_request *req,  			     struct ceph_mds_session *session)  { @@ -1209,6 +1243,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,  	u64 frag = le32_to_cpu(rhead->args.readdir.frag);  	struct ceph_dentry_info *di; +	if (req->r_aborted) +		return readdir_prepopulate_inodes_only(req, session); +  	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {  		snapdir = ceph_get_snapdir(parent->d_inode);  		parent = d_find_alias(snapdir); @@ -1466,7 +1503,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	u64 to; -	int wrbuffer_refs, wake = 0; +	int wrbuffer_refs, finish = 0;  retry:  	spin_lock(&ci->i_ceph_lock); @@ -1498,15 +1535,18 @@ retry:  	truncate_inode_pages(inode->i_mapping, to);  	spin_lock(&ci->i_ceph_lock); -	ci->i_truncate_pending--; -	if (ci->i_truncate_pending == 0) -		wake = 1; +	if (to == ci->i_truncate_size) { +		ci->i_truncate_pending = 0; +		finish = 1; +	}  	spin_unlock(&ci->i_ceph_lock); +	if (!finish) +		goto retry;  	if (wrbuffer_refs == 0)  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); -	if (wake) -		wake_up_all(&ci->i_cap_wq); + +	wake_up_all(&ci->i_cap_wq);  } @@ -1562,26 +1602,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  	if (ia_valid & ATTR_UID) {  		dout("setattr %p uid %d -> %d\n", inode, -		     inode->i_uid, attr->ia_uid); +		     from_kuid(&init_user_ns, inode->i_uid), +		     from_kuid(&init_user_ns, attr->ia_uid));  		if (issued & CEPH_CAP_AUTH_EXCL) {  			inode->i_uid = attr->ia_uid;  			dirtied |= CEPH_CAP_AUTH_EXCL;  		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || -			   attr->ia_uid != inode->i_uid) { -			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); +			   !uid_eq(attr->ia_uid, inode->i_uid)) { +			req->r_args.setattr.uid = cpu_to_le32( +				from_kuid(&init_user_ns, attr->ia_uid));  			mask |= CEPH_SETATTR_UID;  			release |= CEPH_CAP_AUTH_SHARED;  		}  	}  	if (ia_valid & ATTR_GID) {  		dout("setattr %p gid %d -> %d\n", inode, -		     inode->i_gid, attr->ia_gid); +		     from_kgid(&init_user_ns, inode->i_gid), +		     from_kgid(&init_user_ns, attr->ia_gid));  		if (issued & CEPH_CAP_AUTH_EXCL) {  			inode->i_gid = attr->ia_gid;  			dirtied |= CEPH_CAP_AUTH_EXCL;  		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || -			   attr->ia_gid != inode->i_gid) { -			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); +			   !gid_eq(attr->ia_gid, inode->i_gid)) { +			req->r_args.setattr.gid = cpu_to_le32( +				from_kgid(&init_user_ns, attr->ia_gid));  			mask |= CEPH_SETATTR_GID;  			release |= CEPH_CAP_AUTH_SHARED;  		} diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 
36549a46e31..4a989345b37 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -16,11 +16,11 @@   */  static long ceph_ioctl_get_layout(struct file *file, void __user *arg)  { -	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));  	struct ceph_ioctl_layout l;  	int err; -	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); +	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);  	if (!err) {  		l.stripe_unit = ceph_file_layout_su(ci->i_layout);  		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); @@ -63,12 +63,12 @@ static long __validate_layout(struct ceph_mds_client *mdsc,  static long ceph_ioctl_set_layout(struct file *file, void __user *arg)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct inode *parent_inode;  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;  	struct ceph_mds_request *req;  	struct ceph_ioctl_layout l; -	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));  	struct ceph_ioctl_layout nl;  	int err; @@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)  		return -EFAULT;  	/* validate changed params against current layout */ -	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); +	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);  	if (err)  		return err; @@ -136,7 +136,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)   */  static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_mds_request *req;  	struct ceph_ioctl_layout l;  	int err; @@ -179,13 +179,12 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)  static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  {  	struct ceph_ioctl_dataloc dl; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =  		&ceph_sb_to_client(inode->i_sb)->client->osdc;  	u64 len = 1, olen;  	u64 tmp; -	struct ceph_object_layout ol;  	struct ceph_pg pgid;  	int r; @@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  		return -EFAULT;  	down_read(&osdc->map_sem); -	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, +	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,  					  &dl.object_no, &dl.object_offset,  					  &olen);  	if (r < 0) @@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",  		 ceph_ino(inode), dl.object_no); -	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, +	ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,  				osdc->osdmap); -	pgid = ol.ol_pgid;  	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);  	if (dl.osd >= 0) {  		struct ceph_entity_addr *a = @@ -234,7 +232,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  static long ceph_ioctl_lazyio(struct file *file)  {  	struct ceph_file_info *fi = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  
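	/* The conversion recurring throughout these ceph hunks: open-coded
	 * file->f_dentry->d_inode chains become file_inode(file).  In kernels
	 * that provide the helper it simply returns the inode the file was
	 * opened on (cached as file->f_inode in later trees), so the rewrite
	 * is behaviour-preserving; it only stops callers from reaching
	 * through the dentry. */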
	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 80576d05d68..202dd3d68be 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -13,7 +13,7 @@  static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,  			     int cmd, u8 wait, struct file_lock *fl)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_mds_client *mdsc =  		ceph_sb_to_client(inode->i_sb)->mdsc;  	struct ceph_mds_request *req; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1bcf712655d..442880d099c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -233,6 +233,30 @@ bad:  }  /* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, +				  struct ceph_mds_reply_info_parsed *info, +				  int features) +{ +	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { +		if (*p == end) { +			info->has_create_ino = false; +		} else { +			info->has_create_ino = true; +			info->ino = ceph_decode_64(p); +		} +	} + +	if (unlikely(*p != end)) +		goto bad; +	return 0; + +bad: +	return -EIO; +} + +/*   * parse extra results   */  static int parse_reply_info_extra(void **p, void *end, @@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,  {  	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)  		return parse_reply_info_filelock(p, end, info, features); -	else +	else if (info->head->op == CEPH_MDS_OP_READDIR)  		return parse_reply_info_dir(p, end, info, features); +	else if (info->head->op == CEPH_MDS_OP_CREATE) +		return parse_reply_info_create(p, end, info, features); +	else +		return -EIO;  }  /* @@ -1590,7 +1618,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,  	} else if (rpath || rino) {  		*ino = rino;  		*ppath = rpath; -		*pathlen = strlen(rpath); +		*pathlen = rpath ? 
strlen(rpath) : 0;  		dout(" path %.*s\n", *pathlen, rpath);  	} @@ -1658,8 +1686,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,  	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);  	head->op = cpu_to_le32(req->r_op); -	head->caller_uid = cpu_to_le32(req->r_uid); -	head->caller_gid = cpu_to_le32(req->r_gid); +	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); +	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));  	head->args = req->r_args;  	ceph_encode_filepath(&p, end, ino1, path1); @@ -1876,9 +1904,14 @@ finish:  static void __wake_requests(struct ceph_mds_client *mdsc,  			    struct list_head *head)  { -	struct ceph_mds_request *req, *nreq; +	struct ceph_mds_request *req; +	LIST_HEAD(tmp_list); + +	list_splice_init(head, &tmp_list); -	list_for_each_entry_safe(req, nreq, head, r_wait) { +	while (!list_empty(&tmp_list)) { +		req = list_entry(tmp_list.next, +				 struct ceph_mds_request, r_wait);  		list_del_init(&req->r_wait);  		__do_request(mdsc, req);  	} @@ -2165,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	mutex_lock(&req->r_fill_mutex);  	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);  	if (err == 0) { -		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && +		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || +				    req->r_op == CEPH_MDS_OP_LSSNAP) &&  		    rinfo->dir_nr)  			ceph_readdir_prepopulate(req, req->r_session);  		ceph_unreserve_caps(mdsc, &req->r_caps_reservation); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index dd26846dd71..c2a19fbbe51 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {  			struct ceph_mds_reply_info_in *dir_in;  			u8                            dir_complete, dir_end;  		}; + +		/* for create results */ +		struct { +			bool has_create_ino; +			u64 ino; +		};  	};  	/* encoded blob describing snapshot contexts for certain @@ -184,8 +190,8 @@ struct ceph_mds_request {  	union ceph_mds_request_args r_args;  	int r_fmode;        /* file mode, if expecting cap */ -	uid_t r_uid; -	gid_t r_gid; +	kuid_t r_uid; +	kgid_t r_gid;  	/* for choosing which mds to send this request to */  	int r_direct_mode; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44e8a3..0d3c9240c61 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)  		return ERR_PTR(-ENOMEM);  	ceph_decode_16_safe(p, end, version, bad); +	if (version > 3) { +		pr_warning("got mdsmap version %d > 3, failing", version); +		goto bad; +	}  	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);  	m->m_epoch = ceph_decode_32(p); @@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)  	/* pg_pools */  	ceph_decode_32_safe(p, end, n, bad);  	m->m_num_data_pg_pools = n; -	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); +	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);  	if (!m->m_data_pg_pools)  		goto badmem; -	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); +	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);  	for (i = 0; i < n; i++) -		m->m_data_pg_pools[i] = ceph_decode_32(p); -	m->m_cas_pg_pool = ceph_decode_32(p); +		m->m_data_pg_pools[i] = ceph_decode_64(p); +	m->m_cas_pg_pool = ceph_decode_64(p);  	/* ok, we don't care about the rest. 
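	   The decode changes just above are one unit: the data-pool ids grew
	   from 32 to 64 bits, so the kcalloc element size, the
	   ceph_decode_need() bounds check of sizeof(u64)*(n+1) records (n
	   data pools plus the cas pool) and the per-element ceph_decode_64()
	   all had to move together, and the new version > 3 check bails out
	   early on map encodings this decoder does not understand rather
	   than misparse them.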
*/  dout("mdsmap_decode success epoch %u\n", m->m_epoch); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d7c80..89fa4a940a0 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)  	case CEPH_MDS_STATE_BOOT:       return "up:boot";  	case CEPH_MDS_STATE_STANDBY:    return "up:standby";  	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay"; +	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";  	case CEPH_MDS_STATE_CREATING:   return "up:creating";  	case CEPH_MDS_STATE_STARTING:   return "up:starting";  		/* up and in */ @@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)  	case CEPH_MDS_OP_LOOKUP:  return "lookup";  	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";  	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent"; +	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";  	case CEPH_MDS_OP_GETATTR:  return "getattr";  	case CEPH_MDS_OP_SETXATTR: return "setxattr";  	case CEPH_MDS_OP_SETATTR: return "setattr";  	case CEPH_MDS_OP_RMXATTR: return "rmxattr"; +	case CEPH_MDS_OP_SETLAYOUT: return "setlayout"; +	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";  	case CEPH_MDS_OP_READDIR: return "readdir";  	case CEPH_MDS_OP_MKNOD: return "mknod";  	case CEPH_MDS_OP_LINK: return "link"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2eb43f21132..9fe17c6c287 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)  	/*  	 * express utilization in terms of large blocks to avoid  	 * overflow on 32-bit machines. +	 * +	 * NOTE: for the time being, we make bsize == frsize to humor +	 * not-yet-ancient versions of glibc that are broken. +	 * Someday, we will probably want to report a real block +	 * size...  whatever that may mean for a network file system!
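	 *
	 * Worked example of the overflow being dodged: the monitors report
	 * usage in KiB, and the shift by CEPH_BLOCK_SHIFT - 10 = 12 turns
	 * KiB counts into counts of 4 MiB blocks.  A 1 PiB cluster is
	 * 2^40 KiB, which would overflow the 32-bit fields of the legacy
	 * statfs ABI, but 2^40 >> 12 = 2^28 blocks fits easily; userspace
	 * recovers the byte figure as f_blocks * f_frsize (equal to
	 * f_bsize after this change).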
*/  	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; +	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;  	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); @@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)  	buf->f_files = le64_to_cpu(st.num_objects);  	buf->f_ffree = -1;  	buf->f_namelen = NAME_MAX; -	buf->f_frsize = PAGE_CACHE_SIZE;  	/* leave fsid little-endian, regardless of host endianness */  	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -403,8 +408,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)  		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);  	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)  		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); -	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) -		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);  	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)  		seq_printf(m, ",osdkeepalivetimeout=%d",  			   opt->osd_keepalive_timeout); @@ -849,7 +852,7 @@ static int ceph_register_bdi(struct super_block *sb,  		fsc->backing_dev_info.ra_pages =  			default_backing_dev_info.ra_pages; -	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", +	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",  			   atomic_long_inc_return(&bdi_seq));  	if (!err)  		sb->s_bdi = &fsc->backing_dev_info; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 66ebe720e40..c7b309723dc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,7 +21,7 @@  /* large granularity for statfs utilization stats to facilitate   * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT   20  /* 1 MB */ +#define CEPH_BLOCK_SHIFT   22  /* 4 MB */  #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)  #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */ @@ -138,8 +138,8 @@ struct ceph_cap_snap {  	struct ceph_snap_context *context;  	umode_t mode; -	uid_t uid; -	gid_t gid; +	kuid_t uid; +	kgid_t gid;  	struct ceph_buffer *xattr_blob;  	u64 xattr_version; @@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);  /* file.c */  extern const struct file_operations ceph_file_fops;  extern const struct address_space_operations ceph_aops; -extern int ceph_copy_to_page_vector(struct page **pages, -				    const char *data, -				    loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, -				    char *data, -				    loff_t off, size_t len); -extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +  extern int ceph_open(struct inode *inode, struct file *file);  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,  			    struct file *file, unsigned flags, umode_t mode, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2c2ae5be990..9b6b2b6dd16 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -29,9 +29,94 @@ struct ceph_vxattr {  	size_t name_size;	/* strlen(name) + 1 (for '\0') */  	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,  			      size_t size); -	bool readonly; +	bool readonly, hidden; +	bool (*exists_cb)(struct ceph_inode_info *ci);  }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ +	size_t s; +	char *p = (char *)&ci->i_layout; + +	for (s = 0; s < sizeof(ci->i_layout); s++, p++) +		if (*p) +			return true; +	return false; +} + +static size_t 
ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +					size_t size) +{ +	int ret; +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_osd_client *osdc = &fsc->client->osdc; +	s64 pool = ceph_file_layout_pg_pool(ci->i_layout); +	const char *pool_name; + +	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); +	down_read(&osdc->map_sem); +	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); +	if (pool_name) +		ret = snprintf(val, size, +		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", +		(unsigned long long)ceph_file_layout_su(ci->i_layout), +		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), +	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout), +		pool_name); +	else +		ret = snprintf(val, size, +		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", +		(unsigned long long)ceph_file_layout_su(ci->i_layout), +		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), +	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout), +		(unsigned long long)pool); + +	up_read(&osdc->map_sem); +	return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, +					       char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +			(unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, +						char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, +					       char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, +					char *val, size_t size) +{ +	int ret; +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_osd_client *osdc = &fsc->client->osdc; +	s64 pool = ceph_file_layout_pg_pool(ci->i_layout); +	const char *pool_name; + +	down_read(&osdc->map_sem); +	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); +	if (pool_name) +		ret = snprintf(val, size, "%s", pool_name); +	else +		ret = snprintf(val, size, "%lld", (unsigned long long)pool); +	up_read(&osdc->map_sem); +	return ret; +} +  /* directories */  static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, @@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,  			(long)ci->i_rctime.tv_nsec);  } +  #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2)	\ +	XATTR_CEPH_PREFIX #_type "." #_name "." 
#_name2 -#define XATTR_NAME_CEPH(_type, _name) \ -		{ \ -			.name = CEPH_XATTR_NAME(_type, _name), \ -			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ -			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ -			.readonly = true, \ -		} +#define XATTR_NAME_CEPH(_type, _name)					\ +	{								\ +		.name = CEPH_XATTR_NAME(_type, _name),			\ +		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ +		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ +		.readonly = true,				\ +		.hidden = false,				\ +		.exists_cb = NULL,			\ +	} +#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\ +	{								\ +		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\ +		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ +		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ +		.readonly = false,				\ +		.hidden = true,			\ +		.exists_cb = ceph_vxattrcb_layout_exists,	\ +	}  static struct ceph_vxattr ceph_dir_vxattrs[] = { +	{ +		.name = "ceph.dir.layout", +		.name_size = sizeof("ceph.dir.layout"), +		.getxattr_cb = ceph_vxattrcb_layout, +		.readonly = false, +		.hidden = false, +		.exists_cb = ceph_vxattrcb_layout_exists, +	}, +	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), +	XATTR_LAYOUT_FIELD(dir, layout, stripe_count), +	XATTR_LAYOUT_FIELD(dir, layout, object_size), +	XATTR_LAYOUT_FIELD(dir, layout, pool),  	XATTR_NAME_CEPH(dir, entries),  	XATTR_NAME_CEPH(dir, files),  	XATTR_NAME_CEPH(dir, subdirs), @@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {  	XATTR_NAME_CEPH(dir, rsubdirs),  	XATTR_NAME_CEPH(dir, rbytes),  	XATTR_NAME_CEPH(dir, rctime), -	{ 0 }	/* Required table terminator */ +	{ .name = NULL, 0 }	/* Required table terminator */  };  static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */  /* files */ -static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, -				   size_t size) -{ -	int ret; - -	ret = snprintf(val, size, -		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", -		(unsigned long long)ceph_file_layout_su(ci->i_layout), -		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), -		(unsigned long long)ceph_file_layout_object_size(ci->i_layout)); -	return ret; -} -  static struct ceph_vxattr ceph_file_vxattrs[] = { -	XATTR_NAME_CEPH(file, layout), -	/* The following extended attribute name is deprecated */  	{ -		.name = XATTR_CEPH_PREFIX "layout", -		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"), -		.getxattr_cb = ceph_vxattrcb_file_layout, -		.readonly = true, +		.name = "ceph.file.layout", +		.name_size = sizeof("ceph.file.layout"), +		.getxattr_cb = ceph_vxattrcb_layout, +		.readonly = false, +		.hidden = false, +		.exists_cb = ceph_vxattrcb_layout_exists,  	}, -	{ 0 }	/* Required table terminator */ +	XATTR_LAYOUT_FIELD(file, layout, stripe_unit), +	XATTR_LAYOUT_FIELD(file, layout, stripe_count), +	XATTR_LAYOUT_FIELD(file, layout, object_size), +	XATTR_LAYOUT_FIELD(file, layout, pool), +	{ .name = NULL, 0 }	/* Required table terminator */  };  static size_t ceph_file_vxattrs_name_size;	/* total size of all names */ @@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)  	size_t size = 0;  	for (vxattr = vxattrs; vxattr->name; vxattr++) -		size += vxattr->name_size; +		if (!vxattr->hidden) +			size += vxattr->name_size;  	return size;  } @@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  	if (!ceph_is_valid_xattr(name))  		return -ENODATA; -	/* let's see if a virtual xattr was requested */ -	
vxattr = ceph_match_vxattr(inode, name); -  	spin_lock(&ci->i_ceph_lock);  	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,  	     ci->i_xattrs.version, ci->i_xattrs.index_version); +	/* let's see if a virtual xattr was requested */ +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { +		err = vxattr->getxattr_cb(ci, value, size); +		goto out; +	} +  	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&  	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {  		goto get_xattr; @@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  	spin_lock(&ci->i_ceph_lock); -	if (vxattr && vxattr->readonly) { -		err = vxattr->getxattr_cb(ci, value, size); -		goto out; -	} -  	err = __build_xattrs(inode);  	if (err < 0)  		goto out; @@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  get_xattr:  	err = -ENODATA;  /* == ENOATTR */  	xattr = __get_xattr(ci, name); -	if (!xattr) { -		if (vxattr) -			err = vxattr->getxattr_cb(ci, value, size); +	if (!xattr)  		goto out; -	}  	err = -ERANGE;  	if (size && size < xattr->val_len) @@ -664,23 +763,30 @@ list_xattr:  	vir_namelen = ceph_vxattrs_name_size(vxattrs);  	/* adding 1 byte per each variable due to the null termination */ -	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; +	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;  	err = -ERANGE; -	if (size && namelen > size) +	if (size && vir_namelen + namelen > size)  		goto out; -	err = namelen; +	err = namelen + vir_namelen;  	if (size == 0)  		goto out;  	names = __copy_xattr_names(ci, names);  	/* virtual xattr names, too */ -	if (vxattrs) +	err = namelen; +	if (vxattrs) {  		for (i = 0; vxattrs[i].name; i++) { -			len = sprintf(names, "%s", vxattrs[i].name); -			names += len + 1; +			if (!vxattrs[i].hidden && +			    !(vxattrs[i].exists_cb && +			      !vxattrs[i].exists_cb(ci))) { +				len = sprintf(names, "%s", vxattrs[i].name); +				names += len + 1; +				err += len + 1; +			}  		} +	}  out:  	spin_unlock(&ci->i_ceph_lock); @@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  	if (vxattr && vxattr->readonly)  		return -EOPNOTSUPP; +	/* pass any unhandled ceph.* xattrs through to the MDS */ +	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) +		goto do_sync_unlocked; +  	/* preallocate memory for xattr name, value, index node */  	err = -ENOMEM;  	newname = kmemdup(name, name_len + 1, GFP_NOFS); @@ -838,6 +948,7 @@ retry:  do_sync:  	spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked:  	err = ceph_sync_setxattr(dentry, name, value, size, flags);  out:  	kfree(newname); @@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)  	if (vxattr && vxattr->readonly)  		return -EOPNOTSUPP; +	/* pass any unhandled ceph.* xattrs through to the MDS */ +	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) +		goto do_sync_unlocked; +  	err = -ENOMEM;  	spin_lock(&ci->i_ceph_lock);  retry: @@ -931,6 +1046,7 @@ retry:  	return err;  do_sync:  	spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked:  	err = ceph_send_removexattr(dentry, name);  out:  	return err; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 21ff76c22a1..2906ee27640 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL  	    points. If unsure, say N.  
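
The ceph xattr rework above amounts to three coordinated changes: ceph.*-prefixed names the client does not recognize are now passed straight through to the MDS (the do_sync_unlocked paths), virtual xattrs gain hidden and exists_cb fields, and getxattr/listxattr consult exists_cb before treating a virtual attribute as present. A minimal userspace model of that dispatch, with invented table entries and a stub callback standing in for the real layout test, compiles on its own:

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	#define XATTR_CEPH_PREFIX     "ceph."
	#define XATTR_CEPH_PREFIX_LEN (sizeof(XATTR_CEPH_PREFIX) - 1)

	struct vxattr {
		const char *name;
		bool hidden;              /* listxattr never shows these */
		bool (*exists_cb)(void);  /* NULL means "always exists" */
	};

	/* Stub: pretend no file layout has been set on this inode. */
	static bool layout_exists(void) { return false; }

	static const struct vxattr table[] = {
		{ "ceph.file.layout",             false, layout_exists },
		{ "ceph.file.layout.stripe_unit", true,  layout_exists },
		{ "ceph.dir.entries",             false, NULL },
		{ NULL, false, NULL }   /* required table terminator */
	};

	/* Mirrors the getxattr decision order: virtual xattr first
	 * (honouring exists_cb), then any other ceph.* name is the
	 * server's problem. */
	static const char *classify(const char *name)
	{
		const struct vxattr *vx;

		for (vx = table; vx->name; vx++)
			if (strcmp(vx->name, name) == 0)
				return (!vx->exists_cb || vx->exists_cb())
					? "virtual xattr" : "ENODATA";
		if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) == 0)
			return "pass through to MDS";
		return "ordinary xattr";
	}

	int main(void)
	{
		const char *names[] = { "ceph.file.layout", "ceph.dir.entries",
					"ceph.quota.max_bytes", "user.comment" };
		size_t i;

		for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
			printf("%-22s -> %s\n", names[i], classify(names[i]));
		return 0;
	}

A real listxattr would additionally skip entries with hidden set, which is why the per-field ceph.*.layout.* names stay out of the listing while remaining readable by exact name.
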
config CIFS_NFSD_EXPORT -	  bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" -	  depends on CIFS && EXPERIMENTAL && BROKEN +	  bool "Allow nfsd to export CIFS file system" +	  depends on CIFS && BROKEN  	  help  	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)  config CIFS_SMB2 -	bool "SMB2 network file system support (EXPERIMENTAL)" -	depends on CIFS && EXPERIMENTAL && INET +	bool "SMB2 network file system support" +	depends on CIFS && INET  	select NLS  	select KEYS  	select FSCACHE diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 86e92ef2abc..69ae3d3c3b3 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -37,7 +37,6 @@ void dump_smb(void *, int);  #define CIFS_TIMER	0x04  extern int cifsFYI; -extern int cifsERROR;  /*   *	debug ON @@ -64,10 +63,7 @@ do {									\  /* error event message: e.g., i/o error */  #define cifserror(fmt, ...)						\ -do {									\ -	if (cifsERROR)							\ -		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\ -} while (0) +	printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);		\  #define cERROR(set, fmt, ...)						\  do {									\ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bf..210fce2df30 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out:  compose_mount_options_err:  	kfree(mountdata);  	mountdata = ERR_PTR(rc); +	kfree(*devname); +	*devname = NULL;  	goto compose_mount_options_out;  } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index c865bfdfe81..37e4a72a7d1 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,10 +55,10 @@ struct cifs_sb_info {  	unsigned int wsize;  	unsigned long actimeo; /* attribute cache timeout (jiffies) */  	atomic_t active; -	uid_t	mnt_uid; -	gid_t	mnt_gid; -	uid_t	mnt_backupuid; -	gid_t	mnt_backupgid; +	kuid_t	mnt_uid; +	kgid_t	mnt_gid; +	kuid_t	mnt_backupuid; +	kgid_t	mnt_backupgid;  	umode_t	mnt_file_mode;  	umode_t	mnt_dir_mode;  	unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 086f381d648..10e77476129 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -149,10 +149,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)  		goto out;  	dp = description + strlen(description); -	sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); +	sprintf(dp, ";uid=0x%x", +		from_kuid_munged(&init_user_ns, sesInfo->linux_uid));  	dp = description + strlen(description); -	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); +	sprintf(dp, ";creduid=0x%x", +		from_kuid_munged(&init_user_ns, sesInfo->cred_uid));  	if (sesInfo->user_name) {  		dp = description + strlen(description); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5cbd00e7406..f1e3f25fe00 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -266,8 +266,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,  	struct key *sidkey;  	char *sidstr;  	const struct cred *saved_cred; -	uid_t fuid = cifs_sb->mnt_uid; -	gid_t fgid = cifs_sb->mnt_gid; +	kuid_t fuid = cifs_sb->mnt_uid; +	kgid_t fgid = cifs_sb->mnt_gid;  	/*  	 * If we have too many subauthorities, then something is really wrong. @@ -297,6 +297,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,  	 * probably a safe assumption but might be better to check based on  	 * sidtype.  	 
*/ +	BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));  	if (sidkey->datalen != sizeof(uid_t)) {  		rc = -EIO;  		cFYI(1, "%s: Downcall contained malformed key " @@ -305,10 +306,21 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,  		goto out_key_put;  	} -	if (sidtype == SIDOWNER) -		memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); -	else -		memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); +	if (sidtype == SIDOWNER) { +		kuid_t uid; +		uid_t id; +		memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); +		uid = make_kuid(&init_user_ns, id); +		if (uid_valid(uid)) +			fuid = uid; +	} else { +		kgid_t gid; +		gid_t id; +		memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); +		gid = make_kgid(&init_user_ns, id); +		if (gid_valid(gid)) +			fgid = gid; +	}  out_key_put:  	key_put(sidkey); @@ -346,7 +358,8 @@ init_cifs_idmap(void)  	if (!cred)  		return -ENOMEM; -	keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, +	keyring = keyring_alloc(".cifs_idmap", +				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,  				(KEY_POS_ALL & ~KEY_POS_SETATTR) |  				KEY_USR_VIEW | KEY_USR_READ,  				KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -774,7 +787,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,  /* Convert permission bits from mode to equivalent CIFS ACL */  static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, -	__u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) +	__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)  {  	int rc = 0;  	__u32 dacloffset; @@ -806,17 +819,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,  		*aclflag = CIFS_ACL_DACL;  	} else {  		memcpy(pnntsd, pntsd, secdesclen); -		if (uid != NO_CHANGE_32) { /* chown */ +		if (uid_valid(uid)) { /* chown */ +			uid_t id;  			owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +  					le32_to_cpu(pnntsd->osidoffset));  			nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),  								GFP_KERNEL);  			if (!nowner_sid_ptr)  				return -ENOMEM; -			rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); +			id = from_kuid(&init_user_ns, uid); +			rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);  			if (rc) {  				cFYI(1, "%s: Mapping error %d for owner id %d", -						__func__, rc, uid); +						__func__, rc, id);  				kfree(nowner_sid_ptr);  				return rc;  			} @@ -824,17 +839,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,  			kfree(nowner_sid_ptr);  			*aclflag = CIFS_ACL_OWNER;  		} -		if (gid != NO_CHANGE_32) { /* chgrp */ +		if (gid_valid(gid)) { /* chgrp */ +			gid_t id;  			group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +  					le32_to_cpu(pnntsd->gsidoffset));  			ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),  								GFP_KERNEL);  			if (!ngroup_sid_ptr)  				return -ENOMEM; -			rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); +			id = from_kgid(&init_user_ns, gid); +			rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);  			if (rc) {  				cFYI(1, "%s: Mapping error %d for group id %d", -						__func__, rc, gid); +						__func__, rc, id);  				kfree(ngroup_sid_ptr);  				return rc;  			} @@ -1002,7 +1019,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,  /* Convert mode bits to an ACL so we can update the ACL on the server */  int  id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, -			uid_t uid, gid_t gid) +			kuid_t uid, kgid_t gid)  {  	int rc = 0;  	int aclflag = CIFS_ACL_DACL; /* default flag to set */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 
ce9f3c5421b..1a052c0eee8 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -54,7 +54,6 @@  #endif  int cifsFYI = 0; -int cifsERROR = 1;  int traceSMB = 0;  bool enable_oplocks = true;  unsigned int linuxExtEnabled = 1; @@ -229,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb)  	cifs_set_oplock_level(cifs_inode, 0);  	cifs_inode->delete_pending = false;  	cifs_inode->invalid_mapping = false; -	cifs_inode->leave_pages_clean = false;  	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */  	cifs_inode->server_eof = 0;  	cifs_inode->uniqueid = 0; @@ -377,13 +375,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)  				   (int)(srcaddr->sa_family));  	} -	seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); +	seq_printf(s, ",uid=%u", +		   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)  		seq_printf(s, ",forceuid");  	else  		seq_printf(s, ",noforceuid"); -	seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); +	seq_printf(s, ",gid=%u", +		   from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)  		seq_printf(s, ",forcegid");  	else @@ -438,9 +438,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)  		seq_printf(s, ",noperm");  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) -		seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); +		seq_printf(s, ",backupuid=%u", +			   from_kuid_munged(&init_user_ns, +					    cifs_sb->mnt_backupuid));  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) -		seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); +		seq_printf(s, ",backupgid=%u", +			   from_kgid_munged(&init_user_ns, +					    cifs_sb->mnt_backupgid));  	seq_printf(s, ",rsize=%u", cifs_sb->rsize);  	seq_printf(s, ",wsize=%u", cifs_sb->wsize); @@ -560,6 +564,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)  			dentry = ERR_PTR(-ENOENT);  			break;  		} +		if (!S_ISDIR(dir->i_mode)) { +			dput(dentry); +			dentry = ERR_PTR(-ENOTDIR); +			break; +		}  		/* skip separators */  		while (*s == sep) @@ -679,7 +688,7 @@ out_nls:  static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  				   unsigned long nr_segs, loff_t pos)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	ssize_t written;  	int rc; @@ -703,7 +712,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)  	 */  	if (whence != SEEK_SET && whence != SEEK_CUR) {  		int rc; -		struct inode *inode = file->f_path.dentry->d_inode; +		struct inode *inode = file_inode(file);  		/*  		 * We need to be sure that all dirty pages are written and the @@ -735,7 +744,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)  {  	/* note that this is called by vfs setlease with lock_flocks held  	   to protect *lease from going away */ -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct cifsFileInfo *cfile = file->private_data;  	if (!(S_ISREG(inode->i_mode))) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index aea1eec6491..4f07f6fbe49 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -386,6 +386,7 @@ struct smb_version_values {  	unsigned int	cap_unix;  	unsigned int	cap_nt_find;  	unsigned int	cap_large_files; +	unsigned int	oplock_read;  };  #define HEADER_SIZE(server) (server->vals->header_size) @@ -399,11 +400,11 
@@ struct smb_vol {  	char *iocharset;  /* local code page for mapping to and from Unicode */  	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */  	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ -	uid_t cred_uid; -	uid_t linux_uid; -	gid_t linux_gid; -	uid_t backupuid; -	gid_t backupgid; +	kuid_t cred_uid; +	kuid_t linux_uid; +	kgid_t linux_gid; +	kuid_t backupuid; +	kgid_t backupgid;  	umode_t file_mode;  	umode_t dir_mode;  	unsigned secFlg; @@ -702,8 +703,8 @@ struct cifs_ses {  	char *serverNOS;	/* name of network operating system of server */  	char *serverDomain;	/* security realm of server */  	__u64 Suid;		/* remote smb uid  */ -	uid_t linux_uid;        /* overriding owner of files on the mount */ -	uid_t cred_uid;		/* owner of credentials */ +	kuid_t linux_uid;	/* overriding owner of files on the mount */ +	kuid_t cred_uid;	/* owner of credentials */  	unsigned int capabilities;  	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for  				TCP names - will ipv6 and sctp addresses fit? */ @@ -837,7 +838,7 @@ struct cifs_tcon {   */  struct tcon_link {  	struct rb_node		tl_rbnode; -	uid_t			tl_uid; +	kuid_t			tl_uid;  	unsigned long		tl_flags;  #define TCON_LINK_MASTER	0  #define TCON_LINK_PENDING	1 @@ -930,7 +931,7 @@ struct cifsFileInfo {  	struct list_head tlist;	/* pointer to next fid owned by tcon */  	struct list_head flist;	/* next fid (file instance) for this inode */  	struct cifs_fid_locks *llist;	/* brlocks held by this fid */ -	unsigned int uid;	/* allows finding which FileInfo structure */ +	kuid_t uid;		/* allows finding which FileInfo structure */  	__u32 pid;		/* process id who opened file */  	struct cifs_fid fid;	/* file id from remote */  	/* BB add lock scope info here if needed */ ; @@ -1030,7 +1031,6 @@ struct cifsInodeInfo {  	bool clientCanCacheAll;		/* read and writebehind oplock */  	bool delete_pending;		/* DELETE_ON_CLOSE is set */  	bool invalid_mapping;		/* pagecache is invalid */ -	bool leave_pages_clean;	/* protected by i_mutex, not set pages dirty */  	unsigned long time;		/* jiffies of last update of inode */  	u64  server_eof;		/* current file size on server -- protected by i_lock */  	u64  uniqueid;			/* server inode number */ @@ -1245,8 +1245,8 @@ struct cifs_fattr {  	u64		cf_eof;  	u64		cf_bytes;  	u64		cf_createtime; -	uid_t		cf_uid; -	gid_t		cf_gid; +	kuid_t		cf_uid; +	kgid_t		cf_gid;  	umode_t		cf_mode;  	dev_t		cf_rdev;  	unsigned int	cf_nlink; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b9d59a948a2..e996ff6b26d 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -277,7 +277,6 @@  #define CIFS_NO_HANDLE        0xFFFF  #define NO_CHANGE_64          0xFFFFFFFFFFFFFFFFULL -#define NO_CHANGE_32          0xFFFFFFFFUL  /* IPC$ in ASCII */  #define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1988c1baa22..f450f0683dd 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -46,7 +46,8 @@ extern void _free_xid(unsigned int);  ({								\  	unsigned int __xid = _get_xid();				\  	cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d",	\ -	     __func__, __xid, current_fsuid());			\ +	     __func__, __xid,					\ +	     from_kuid(&init_user_ns, current_fsuid()));	\  	__xid;							\  }) @@ -161,7 +162,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,  			      struct cifs_fattr *fattr, struct inode *inode,  			      const char *path, const __u16 *pfid);  extern int id_mode_to_cifs_acl(struct inode *inode, 
const char *path, __u64, -					uid_t, gid_t); +					kuid_t, kgid_t);  extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,  					const char *, u32 *);  extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, @@ -304,8 +305,8 @@ struct cifs_unix_set_info_args {  	__u64	atime;  	__u64	mtime;  	__u64	mode; -	__u64	uid; -	__u64	gid; +	kuid_t	uid; +	kgid_t	gid;  	dev_t	device;  }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 76d0d299885..7353bc5d73d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1909,8 +1909,11 @@ cifs_writev_requeue(struct cifs_writedata *wdata)  	} while (rc == -EAGAIN);  	for (i = 0; i < wdata->nr_pages; i++) { -		if (rc != 0) +		if (rc != 0) {  			SetPageError(wdata->pages[i]); +			end_page_writeback(wdata->pages[i]); +			page_cache_release(wdata->pages[i]); +		}  		unlock_page(wdata->pages[i]);  	} @@ -5819,8 +5822,14 @@ static void  cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,  			const struct cifs_unix_set_info_args *args)  { +	u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;  	u64 mode = args->mode; +	if (uid_valid(args->uid)) +		uid = from_kuid(&init_user_ns, args->uid); +	if (gid_valid(args->gid)) +		gid = from_kgid(&init_user_ns, args->gid); +  	/*  	 * Samba server ignores set of file size to zero due to bugs in some  	 * older clients, but we should be precise - we use SetFileSize to @@ -5833,8 +5842,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,  	data_offset->LastStatusChange = cpu_to_le64(args->ctime);  	data_offset->LastAccessTime = cpu_to_le64(args->atime);  	data_offset->LastModificationTime = cpu_to_le64(args->mtime); -	data_offset->Uid = cpu_to_le64(args->uid); -	data_offset->Gid = cpu_to_le64(args->gid); +	data_offset->Uid = cpu_to_le64(uid); +	data_offset->Gid = cpu_to_le64(gid);  	/* better to leave device as zero when it is  */  	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));  	data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7635b5db26a..54125e04fd0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -987,6 +987,41 @@ static int get_option_ul(substring_t args[], unsigned long *option)  	return rc;  } +static int get_option_uid(substring_t args[], kuid_t *result) +{ +	unsigned long value; +	kuid_t uid; +	int rc; + +	rc = get_option_ul(args, &value); +	if (rc) +		return rc; + +	uid = make_kuid(current_user_ns(), value); +	if (!uid_valid(uid)) +		return -EINVAL; + +	*result = uid; +	return 0; +} + +static int get_option_gid(substring_t args[], kgid_t *result) +{ +	unsigned long value; +	kgid_t gid; +	int rc; + +	rc = get_option_ul(args, &value); +	if (rc) +		return rc; + +	gid = make_kgid(current_user_ns(), value); +	if (!gid_valid(gid)) +		return -EINVAL; + +	*result = gid; +	return 0; +}  static int cifs_parse_security_flavors(char *value,  				       struct smb_vol *vol) @@ -996,7 +1031,7 @@ static int cifs_parse_security_flavors(char *value,  	switch (match_token(value, cifs_secflavor_tokens, args)) {  	case Opt_sec_krb5: -		vol->secFlg |= CIFSSEC_MAY_KRB5; +		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;  		break;  	case Opt_sec_krb5i:  		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; @@ -1424,47 +1459,42 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,  		/* Numeric Values */  		case Opt_backupuid: -			if (get_option_ul(args, &option)) { +			if (get_option_uid(args, &vol->backupuid)) {  				cERROR(1, "%s: Invalid backupuid value",  					__func__);  		
		goto cifs_parse_mount_err;  			} -			vol->backupuid = option;  			vol->backupuid_specified = true;  			break;  		case Opt_backupgid: -			if (get_option_ul(args, &option)) { +			if (get_option_gid(args, &vol->backupgid)) {  				cERROR(1, "%s: Invalid backupgid value",  					__func__);  				goto cifs_parse_mount_err;  			} -			vol->backupgid = option;  			vol->backupgid_specified = true;  			break;  		case Opt_uid: -			if (get_option_ul(args, &option)) { +			if (get_option_uid(args, &vol->linux_uid)) {  				cERROR(1, "%s: Invalid uid value",  					__func__);  				goto cifs_parse_mount_err;  			} -			vol->linux_uid = option;  			uid_specified = true;  			break;  		case Opt_cruid: -			if (get_option_ul(args, &option)) { +			if (get_option_uid(args, &vol->cred_uid)) {  				cERROR(1, "%s: Invalid cruid value",  					__func__);  				goto cifs_parse_mount_err;  			} -			vol->cred_uid = option;  			break;  		case Opt_gid: -			if (get_option_ul(args, &option)) { +			if (get_option_gid(args, &vol->linux_gid)) {  				cERROR(1, "%s: Invalid gid value",  						__func__);  				goto cifs_parse_mount_err;  			} -			vol->linux_gid = option;  			gid_specified = true;  			break;  		case Opt_file_mode: @@ -1624,14 +1654,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,  		case Opt_unc:  			string = vol->UNC;  			vol->UNC = match_strdup(args); -			if (vol->UNC == NULL) { -				kfree(string); +			if (vol->UNC == NULL)  				goto out_nomem; -			}  			convert_delimiter(vol->UNC, '\\');  			if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { -				kfree(string);  				printk(KERN_ERR "CIFS: UNC Path does not "  						"begin with // or \\\\\n");  				goto cifs_parse_mount_err; @@ -1687,10 +1714,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,  			string = vol->prepath;  			vol->prepath = match_strdup(args); -			if (vol->prepath == NULL) { -				kfree(string); +			if (vol->prepath == NULL)  				goto out_nomem; -			}  			/* Compare old prefixpath= option to new one */  			if (!string || strcmp(string, vol->prepath))  				printk(KERN_WARNING "CIFS: the value of the " @@ -1922,7 +1947,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)  	}  	case AF_INET6: {  		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; -		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; +		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;  		return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);  	}  	default: @@ -2246,7 +2271,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)  {  	switch (ses->server->secType) {  	case Kerberos: -		if (vol->cred_uid != ses->cred_uid) +		if (!uid_eq(vol->cred_uid, ses->cred_uid))  			return 0;  		break;  	default: @@ -2718,7 +2743,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)  	if (new->rsize && new->rsize < old->rsize)  		return 0; -	if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) +	if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))  		return 0;  	if (old->mnt_file_mode != new->mnt_file_mode || @@ -3924,7 +3949,7 @@ cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)  }  static struct cifs_tcon * -cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)  {  	int rc;  	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); @@ -3994,7 +4019,7 @@ cifs_sb_tcon_pending_wait(void *unused)  /* find and return a tlink with given uid 
*/  static struct tcon_link * -tlink_rb_search(struct rb_root *root, uid_t uid) +tlink_rb_search(struct rb_root *root, kuid_t uid)  {  	struct rb_node *node = root->rb_node;  	struct tcon_link *tlink; @@ -4002,9 +4027,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid)  	while (node) {  		tlink = rb_entry(node, struct tcon_link, tl_rbnode); -		if (tlink->tl_uid > uid) +		if (uid_gt(tlink->tl_uid, uid))  			node = node->rb_left; -		else if (tlink->tl_uid < uid) +		else if (uid_lt(tlink->tl_uid, uid))  			node = node->rb_right;  		else  			return tlink; @@ -4023,7 +4048,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)  		tlink = rb_entry(*new, struct tcon_link, tl_rbnode);  		parent = *new; -		if (tlink->tl_uid > new_tlink->tl_uid) +		if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))  			new = &((*new)->rb_left);  		else  			new = &((*new)->rb_right); @@ -4053,7 +4078,7 @@ struct tcon_link *  cifs_sb_tlink(struct cifs_sb_info *cifs_sb)  {  	int ret; -	uid_t fsuid = current_fsuid(); +	kuid_t fsuid = current_fsuid();  	struct tcon_link *tlink, *newtlink;  	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 8719bbe0dcc..1cd01621744 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,  		*created |= FILE_CREATED;  		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { -			args.uid = (__u64) current_fsuid(); +			args.uid = current_fsuid();  			if (inode->i_mode & S_ISGID) -				args.gid = (__u64) inode->i_gid; +				args.gid = inode->i_gid;  			else -				args.gid = (__u64) current_fsgid(); +				args.gid = current_fsgid();  		} else { -			args.uid = NO_CHANGE_64; -			args.gid = NO_CHANGE_64; +			args.uid = INVALID_UID; /* no change */ +			args.gid = INVALID_GID; /* no change */  		}  		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,  				       current->tgid); @@ -588,11 +588,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,  			.device	= device_number,  		};  		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { -			args.uid = (__u64) current_fsuid(); -			args.gid = (__u64) current_fsgid(); +			args.uid = current_fsuid(); +			args.gid = current_fsgid();  		} else { -			args.uid = NO_CHANGE_64; -			args.gid = NO_CHANGE_64; +			args.uid = INVALID_UID; /* no change */ +			args.gid = INVALID_GID; /* no change */  		}  		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,  					    cifs_sb->local_nls, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a6677ba212..8c0d8557731 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -43,6 +43,7 @@  #include "cifs_fs_sb.h"  #include "fscache.h" +  static inline int cifs_convert_flags(unsigned int flags)  {  	if ((flags & O_ACCMODE) == O_RDONLY) @@ -72,10 +73,15 @@ static u32 cifs_posix_convert_flags(unsigned int flags)  	else if ((flags & O_ACCMODE) == O_RDWR)  		posix_flags = SMB_O_RDWR; -	if (flags & O_CREAT) +	if (flags & O_CREAT) {  		posix_flags |= SMB_O_CREAT; -	if (flags & O_EXCL) -		posix_flags |= SMB_O_EXCL; +		if (flags & O_EXCL) +			posix_flags |= SMB_O_EXCL; +	} else if (flags & O_EXCL) +		cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag" +			"but not O_CREAT on file open. 
Ignoring O_EXCL", +			current->comm, current->tgid); +  	if (flags & O_TRUNC)  		posix_flags |= SMB_O_TRUNC;  	/* be safe and imply O_SYNC for O_DSYNC */ @@ -238,6 +244,23 @@ out:  	return rc;  } +static bool +cifs_has_mand_locks(struct cifsInodeInfo *cinode) +{ +	struct cifs_fid_locks *cur; +	bool has_locks = false; + +	down_read(&cinode->lock_sem); +	list_for_each_entry(cur, &cinode->llist, llist) { +		if (!list_empty(&cur->locks)) { +			has_locks = true; +			break; +		} +	} +	up_read(&cinode->lock_sem); +	return has_locks; +} +  struct cifsFileInfo *  cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,  		  struct tcon_link *tlink, __u32 oplock) @@ -248,6 +271,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,  	struct cifsFileInfo *cfile;  	struct cifs_fid_locks *fdlocks;  	struct cifs_tcon *tcon = tlink_tcon(tlink); +	struct TCP_Server_Info *server = tcon->ses->server;  	cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);  	if (cfile == NULL) @@ -276,12 +300,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,  	INIT_WORK(&cfile->oplock_break, cifs_oplock_break);  	mutex_init(&cfile->fh_mutex); +	/* +	 * If the server returned a read oplock and we have mandatory brlocks, +	 * set oplock level to None. +	 */ +	if (oplock == server->vals->oplock_read && +						cifs_has_mand_locks(cinode)) { +		cFYI(1, "Reset oplock val from read to None due to mand locks"); +		oplock = 0; +	} +  	spin_lock(&cifs_file_list_lock); -	if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) +	if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)  		oplock = fid->pending_open->oplock;  	list_del(&fid->pending_open->olist); -	tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); +	server->ops->set_fid(cfile, fid, oplock);  	list_add(&cfile->tlist, &tcon->openFileList);  	/* if readable file instance put first in list*/ @@ -487,8 +521,8 @@ int cifs_open(struct inode *inode, struct file *file)  		 */  		struct cifs_unix_set_info_args args = {  			.mode	= inode->i_mode, -			.uid	= NO_CHANGE_64, -			.gid	= NO_CHANGE_64, +			.uid	= INVALID_UID, /* no change */ +			.gid	= INVALID_GID, /* no change */  			.ctime	= NO_CHANGE_64,  			.atime	= NO_CHANGE_64,  			.mtime	= NO_CHANGE_64, @@ -919,7 +953,7 @@ static int  cifs_posix_lock_test(struct file *file, struct file_lock *flock)  {  	int rc = 0; -	struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); +	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));  	unsigned char saved_type = flock->fl_type;  	if ((flock->fl_flags & FL_POSIX) == 0) @@ -946,7 +980,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)  static int  cifs_posix_lock_set(struct file *file, struct file_lock *flock)  { -	struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); +	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));  	int rc = 1;  	if ((flock->fl_flags & FL_POSIX) == 0) @@ -1422,6 +1456,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,  	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);  	struct TCP_Server_Info *server = tcon->ses->server; +	struct inode *inode = cfile->dentry->d_inode;  	if (posix_lck) {  		int posix_lock_type; @@ -1459,6 +1494,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,  		if (!rc)  			goto out; +		/* +		 * Windows 7 server can delay breaking lease from read to None +		 * if we set a byte-range lock on a file - break it explicitly +		 * 
before sending the lock to the server to be sure the next +		 * read won't conflict with non-overlapping locks due to +		 * page reading. +		 */ +		if (!CIFS_I(inode)->clientCanCacheAll && +					CIFS_I(inode)->clientCanCacheRead) { +			cifs_invalidate_mapping(inode); +			cFYI(1, "Set no oplock for inode=%p due to mand locks", +			     inode); +			CIFS_I(inode)->clientCanCacheRead = false; +		} +  		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,  					    type, 1, 0, wait_flag);  		if (rc) { @@ -1504,7 +1554,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)  	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);  	netfid = cfile->fid.netfid; -	cinode = CIFS_I(file->f_path.dentry->d_inode); +	cinode = CIFS_I(file_inode(file));  	if (cap_unix(tcon->ses) &&  	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && @@ -1649,7 +1699,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,  	   are always at the end of the list but since the first entry might  	   have a close pending, we go through the whole list */  	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { -		if (fsuid_only && open_file->uid != current_fsuid()) +		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))  			continue;  		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {  			if (!open_file->invalidHandle) { @@ -1702,7 +1752,7 @@ refind_writable:  	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {  		if (!any_available && open_file->pid != current->tgid)  			continue; -		if (fsuid_only && open_file->uid != current_fsuid()) +		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))  			continue;  		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {  			if (!open_file->invalidHandle) { @@ -2103,15 +2153,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,  	} else {  		rc = copied;  		pos += copied; -		/* -		 * When we use strict cache mode and cifs_strict_writev was run -		 * with level II oplock (indicated by leave_pages_clean field of -		 * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev -		 * sent the data to the server itself. -		 */ -		if (!CIFS_I(inode)->leave_pages_clean || -		    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) -			set_page_dirty(page); +		set_page_dirty(page);  	}  	if (rc > 0) { @@ -2135,7 +2177,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,  	struct cifs_tcon *tcon;  	struct TCP_Server_Info *server;  	struct cifsFileInfo *smbfile = file->private_data; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	rc = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -2210,7 +2252,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)   */  int cifs_flush(struct file *file, fl_owner_t id)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	int rc = 0;  	if (file->f_mode & FMODE_WRITE) @@ -2444,7 +2486,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,  	ssize_t written;  	struct inode *inode; -	inode = iocb->ki_filp->f_path.dentry->d_inode; +	inode = file_inode(iocb->ki_filp);  	/*  	 * BB - optimize the way when signing is disabled. 
We can drop this @@ -2462,8 +2504,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,  }  static ssize_t -cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, -		      unsigned long nr_segs, loff_t pos, bool cache_ex) +cifs_writev(struct kiocb *iocb, const struct iovec *iov, +	    unsigned long nr_segs, loff_t pos)  {  	struct file *file = iocb->ki_filp;  	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2485,12 +2527,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,  				     server->vals->exclusive_lock_type, NULL,  				     CIFS_WRITE_OP)) {  		mutex_lock(&inode->i_mutex); -		if (!cache_ex) -			cinode->leave_pages_clean = true;  		rc = __generic_file_aio_write(iocb, iov, nr_segs, -					      &iocb->ki_pos); -		if (!cache_ex) -			cinode->leave_pages_clean = false; +					       &iocb->ki_pos);  		mutex_unlock(&inode->i_mutex);  	} @@ -2511,66 +2549,38 @@ ssize_t  cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,  		   unsigned long nr_segs, loff_t pos)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	struct cifsInodeInfo *cinode = CIFS_I(inode);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifsFileInfo *cfile = (struct cifsFileInfo *)  						iocb->ki_filp->private_data;  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); -	ssize_t written, written2; -	/* -	 * We need to store clientCanCacheAll here to prevent race -	 * conditions - this value can be changed during an execution -	 * of generic_file_aio_write. For CIFS it can be changed from -	 * true to false only, but for SMB2 it can be changed both from -	 * true to false and vice versa. So, we can end up with a data -	 * stored in the cache, not marked dirty and not sent to the -	 * server if this value changes its state from false to true -	 * after cifs_write_end. -	 */ -	bool cache_ex = cinode->clientCanCacheAll; -	bool cache_read = cinode->clientCanCacheRead; -	int rc; -	loff_t saved_pos; +	ssize_t written; -	if (cache_ex) { +	if (cinode->clientCanCacheAll) {  		if (cap_unix(tcon->ses) && -		    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && -		    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( -						tcon->fsUnixInfo.Capability))) +		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) +		    && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))  			return generic_file_aio_write(iocb, iov, nr_segs, pos); -		return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); +		return cifs_writev(iocb, iov, nr_segs, pos);  	} -  	/* -	 * For files without exclusive oplock in strict cache mode we need to -	 * write the data to the server exactly from the pos to pos+len-1 rather -	 * than flush all affected pages because it may cause a error with -	 * mandatory locks on these pages but not on the region from pos to -	 * ppos+len-1. +	 * For non-oplocked files in strict cache mode we need to write the data +	 * to the server exactly from the pos to pos+len-1 rather than flush all +	 * affected pages because it may cause an error with mandatory locks on +	 * these pages but not on the region from pos to pos+len-1.  	 
*/  written = cifs_user_writev(iocb, iov, nr_segs, pos); -	if (!cache_read || written <= 0) -		return written; - -	saved_pos = iocb->ki_pos; -	iocb->ki_pos = pos; -	/* we have a read oplock - need to store a data in the page cache */ -	if (cap_unix(tcon->ses) && -	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && -	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( -					tcon->fsUnixInfo.Capability))) -		written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); -	else -		written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, -						 cache_ex); -	/* errors occured during writing - invalidate the page cache */ -	if (written2 < 0) { -		rc = cifs_invalidate_mapping(inode); -		if (rc) -			written = (ssize_t)rc; -		else -			iocb->ki_pos = saved_pos; +	if (written > 0 && cinode->clientCanCacheRead) { +		/* +		 * Windows 7 server can delay breaking level2 oplock if a write +		 * request comes - break it on the client to prevent reading +		 * stale data. +		 */ +		cifs_invalidate_mapping(inode); +		cFYI(1, "Set no oplock for inode=%p after a write operation", +		     inode); +		cinode->clientCanCacheRead = false;  	}  	return written; } @@ -2911,7 +2921,7 @@ ssize_t  cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,  		  unsigned long nr_segs, loff_t pos)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	struct cifsInodeInfo *cinode = CIFS_I(inode);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifsFileInfo *cfile = (struct cifsFileInfo *) @@ -3059,7 +3069,7 @@ static struct vm_operations_struct cifs_file_vm_ops = {  int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)  {  	int rc, xid; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	xid = get_xid(); @@ -3352,7 +3362,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,  	int rc;  	/* Is the page cached? 
*/ -	rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); +	rc = cifs_readpage_from_fscache(file_inode(file), page);  	if (rc == 0)  		goto read_complete; @@ -3367,8 +3377,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page,  	else  		cFYI(1, "Bytes read %d", rc); -	file->f_path.dentry->d_inode->i_atime = -		current_fs_time(file->f_path.dentry->d_inode->i_sb); +	file_inode(file)->i_atime = +		current_fs_time(file_inode(file)->i_sb);  	if (PAGE_CACHE_SIZE > rc)  		memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); @@ -3377,7 +3387,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,  	SetPageUptodate(page);  	/* send this page to the cache */ -	cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); +	cifs_readpage_to_fscache(file_inode(file), page);  	rc = 0; @@ -3577,6 +3587,13 @@ void cifs_oplock_break(struct work_struct *work)  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);  	int rc = 0; +	if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && +						cifs_has_mand_locks(cinode)) { +		cFYI(1, "Reset oplock to None for inode=%p due to mand locks", +		     inode); +		cinode->clientCanCacheRead = false; +	} +  	if (inode && S_ISREG(inode->i_mode)) {  		if (cinode->clientCanCacheRead)  			break_lease(inode, O_RDONLY); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ed6208ff85a..83f2606c76d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -244,15 +244,25 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,  		break;  	} -	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) -		fattr->cf_uid = cifs_sb->mnt_uid; -	else -		fattr->cf_uid = le64_to_cpu(info->Uid); - -	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) -		fattr->cf_gid = cifs_sb->mnt_gid; -	else -		fattr->cf_gid = le64_to_cpu(info->Gid); +	fattr->cf_uid = cifs_sb->mnt_uid; +	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) { +		u64 id = le64_to_cpu(info->Uid); +		if (id < ((uid_t)-1)) { +			kuid_t uid = make_kuid(&init_user_ns, id); +			if (uid_valid(uid)) +				fattr->cf_uid = uid; +		} +	} +	 +	fattr->cf_gid = cifs_sb->mnt_gid; +	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) { +		u64 id = le64_to_cpu(info->Gid); +		if (id < ((gid_t)-1)) { +			kgid_t gid = make_kgid(&init_user_ns, id); +			if (gid_valid(gid)) +				fattr->cf_gid = gid; +		} +	}  	fattr->cf_nlink = le64_to_cpu(info->Nlinks);  } @@ -289,7 +299,7 @@ cifs_get_file_info_unix(struct file *filp)  	unsigned int xid;  	FILE_UNIX_BASIC_INFO find_data;  	struct cifs_fattr fattr; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifsFileInfo *cfile = filp->private_data;  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -558,7 +568,7 @@ cifs_get_file_info(struct file *filp)  	unsigned int xid;  	FILE_ALL_INFO find_data;  	struct cifs_fattr fattr; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifsFileInfo *cfile = filp->private_data;  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -806,10 +816,9 @@ static bool  inode_has_hashed_dentries(struct inode *inode)  {  	struct dentry *dentry; -	struct hlist_node *p;  	spin_lock(&inode->i_lock); -	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {  		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {  			
spin_unlock(&inode->i_lock);  			return true; @@ -1245,14 +1254,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,  			.device	= 0,  		};  		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { -			args.uid = (__u64)current_fsuid(); +			args.uid = current_fsuid();  			if (parent->i_mode & S_ISGID) -				args.gid = (__u64)parent->i_gid; +				args.gid = parent->i_gid;  			else -				args.gid = (__u64)current_fsgid(); +				args.gid = current_fsgid();  		} else { -			args.uid = NO_CHANGE_64; -			args.gid = NO_CHANGE_64; +			args.uid = INVALID_UID; /* no change */ +			args.gid = INVALID_GID; /* no change */  		}  		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,  				       cifs_sb->local_nls, @@ -1678,7 +1687,7 @@ cifs_invalidate_mapping(struct inode *inode)  int cifs_revalidate_file_attr(struct file *filp)  {  	int rc = 0; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;  	if (!cifs_inode_needs_reval(inode)) @@ -1735,7 +1744,7 @@ out:  int cifs_revalidate_file(struct file *filp)  {  	int rc; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	rc = cifs_revalidate_file_attr(filp);  	if (rc) @@ -2013,12 +2022,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)  	if (attrs->ia_valid & ATTR_UID)  		args->uid = attrs->ia_uid;  	else -		args->uid = NO_CHANGE_64; +		args->uid = INVALID_UID; /* no change */  	if (attrs->ia_valid & ATTR_GID)  		args->gid = attrs->ia_gid;  	else -		args->gid = NO_CHANGE_64; +		args->gid = INVALID_GID; /* no change */  	if (attrs->ia_valid & ATTR_ATIME)  		args->atime = cifs_UnixTimeToNT(attrs->ia_atime); @@ -2086,8 +2095,8 @@ static int  cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)  {  	unsigned int xid; -	uid_t uid = NO_CHANGE_32; -	gid_t gid = NO_CHANGE_32; +	kuid_t uid = INVALID_UID; +	kgid_t gid = INVALID_GID;  	struct inode *inode = direntry->d_inode;  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,7 +2155,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)  #ifdef CONFIG_CIFS_ACL  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { -		if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { +		if (uid_valid(uid) || gid_valid(gid)) {  			rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,  							uid, gid);  			if (rc) { @@ -2170,7 +2179,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)  #ifdef CONFIG_CIFS_ACL  		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {  			rc = id_mode_to_cifs_acl(inode, full_path, mode, -						NO_CHANGE_32, NO_CHANGE_32); +						INVALID_UID, INVALID_GID);  			if (rc) {  				cFYI(1, "%s: Setting ACL failed with error: %d",  					__func__, rc); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index fd5009d56f9..6c9f1214cf0 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -30,7 +30,7 @@  long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)  { -	struct inode *inode = filep->f_dentry->d_inode; +	struct inode *inode = file_inode(filep);  	int rc = -ENOTTY; /* strange error - but the precedent */  	unsigned int xid;  	struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 51dc2fb6e85..9f6c4c45d21 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)  	}  	rc = 
crypto_shash_update(&sdescmd5->shash, link_str, link_len);  	if (rc) { -		cERROR(1, "%s: Could not update iwth link_str", __func__); +		cERROR(1, "%s: Could not update with link_str", __func__);  		goto symlink_hash_err;  	}  	rc = crypto_shash_final(&sdescmd5->shash, md5_hash); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3a00c0d0cea..1b15bf839f3 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,7 +569,7 @@ bool  backup_cred(struct cifs_sb_info *cifs_sb)  {  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { -		if (cifs_sb->mnt_backupuid == current_fsuid()) +		if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))  			return true;  	}  	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6002fdc920a..df40cc5fd13 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,23 +78,32 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,  	struct dentry *dentry, *alias;  	struct inode *inode;  	struct super_block *sb = parent->d_inode->i_sb; +	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);  	cFYI(1, "%s: for %s", __func__, name->name); -	if (parent->d_op && parent->d_op->d_hash) -		parent->d_op->d_hash(parent, parent->d_inode, name); -	else -		name->hash = full_name_hash(name->name, name->len); +	dentry = d_hash_and_lookup(parent, name); +	if (unlikely(IS_ERR(dentry))) +		return; -	dentry = d_lookup(parent, name);  	if (dentry) {  		int err;  		inode = dentry->d_inode; -		/* update inode in place if i_ino didn't change */ -		if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { -			cifs_fattr_to_inode(inode, fattr); -			goto out; +		if (inode) { +			/* +			 * If we're generating inode numbers, then we don't +			 * want to clobber the existing one with the one that +			 * the readdir code created. +			 */ +			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) +				fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; + +			/* update inode in place if i_ino didn't change */ +			if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { +				cifs_fattr_to_inode(inode, fattr); +				goto out; +			}  		}  		err = d_invalidate(dentry);  		dput(dentry); @@ -494,7 +503,7 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)     whether we can use the cached search results from the previous search */  static int is_dir_changed(struct file *file)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);  	if (cifsInfo->time == 0) @@ -767,7 +776,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)  	switch ((int) file->f_pos) {  	case 0:  		if (filldir(direntry, ".", 1, file->f_pos, -		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { +		     file_inode(file)->i_ino, DT_DIR) < 0) {  			cERROR(1, "Filldir for current dir failed");  			rc = -ENOMEM;  			break; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a5d234c8d5d..47bc5a87f94 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,  		mutex_unlock(&server->srv_mutex);  		return rc;  	} + +	/* +	 * The response to this call was already factored into the sequence +	 * number when the call went out, so we must adjust it back downward +	 * after signing here. 
+	 */ +	--server->sequence_number;  	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));  	mutex_unlock(&server->srv_mutex); @@ -952,4 +959,5 @@ struct smb_version_values smb1_values = {  	.cap_unix = CAP_UNIX,  	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,  	.cap_large_files = CAP_LARGE_FILES, +	.oplock_read = OPLOCK_READ,  }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d79de7bc443..c9c7aa7ed96 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -708,6 +708,7 @@ struct smb_version_values smb20_values = {  	.cap_unix = 0,  	.cap_nt_find = SMB2_NT_FIND,  	.cap_large_files = SMB2_LARGE_FILES, +	.oplock_read = SMB2_OPLOCK_LEVEL_II,  };  struct smb_version_values smb21_values = { @@ -725,6 +726,7 @@ struct smb_version_values smb21_values = {  	.cap_unix = 0,  	.cap_nt_find = SMB2_NT_FIND,  	.cap_large_files = SMB2_LARGE_FILES, +	.oplock_read = SMB2_OPLOCK_LEVEL_II,  };  struct smb_version_values smb30_values = { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 76d974c952f..1a528680ec5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,  	*sent = 0; -	if (ssocket == NULL) -		return -ENOTSOCK; /* BB eventually add reconnect code here */ -  	smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;  	smb_msg.msg_namelen = sizeof(struct sockaddr);  	smb_msg.msg_control = NULL; @@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)  	struct socket *ssocket = server->ssocket;  	int val = 1; +	if (ssocket == NULL) +		return -ENOTSOCK; +  	cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);  	dump_smb(iov[0].iov_base, iov[0].iov_len); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 958ae0e0ff8..1da168c61d3 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask)  	spin_lock(&cii->c_lock);  	cii->c_cached_epoch = atomic_read(&permission_epoch); -	if (cii->c_uid != current_fsuid()) { +	if (!uid_eq(cii->c_uid, current_fsuid())) {  		cii->c_uid = current_fsuid();                  cii->c_cached_perm = mask;          } else @@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask)  	spin_lock(&cii->c_lock);  	hit = (mask & cii->c_cached_perm) == mask && -	    cii->c_uid == current_fsuid() && +	    uid_eq(cii->c_uid, current_fsuid()) &&  	    cii->c_cached_epoch == atomic_read(&permission_epoch);  	spin_unlock(&cii->c_lock); diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index b24fdfd8a3f..c6407521321 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -25,7 +25,7 @@ struct coda_inode_info {  	u_short	           c_flags;     /* flags (see below) */  	unsigned int	   c_mapcount;  /* nr of times this inode is mapped */  	unsigned int	   c_cached_epoch; /* epoch for cached permissions */ -	vuid_t		   c_uid;	/* fsuid for cached permissions */ +	kuid_t		   c_uid;	/* fsuid for cached permissions */  	unsigned int       c_cached_perm; /* cached access permissions */  	spinlock_t	   c_lock;  	struct inode	   vfs_inode; diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 854ace71268..2849f41e72a 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)  	if (attr->va_mode != (u_short) -1)  	        inode->i_mode = attr->va_mode | inode_type;          if (attr->va_uid != -1)  -	        inode->i_uid = (uid_t) attr->va_uid; +	        
inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);          if (attr->va_gid != -1) -	        inode->i_gid = (gid_t) attr->va_gid; +	        inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);  	if (attr->va_nlink != -1)  		set_nlink(inode, attr->va_nlink);  	if (attr->va_size != -1) @@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)                  vattr->va_mode = iattr->ia_mode;  	}          if ( valid & ATTR_UID ) { -                vattr->va_uid = (vuid_t) iattr->ia_uid; +                vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);  	}          if ( valid & ATTR_GID ) { -                vattr->va_gid = (vgid_t) iattr->ia_gid; +                vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);  	}          if ( valid & ATTR_SIZE ) {                  vattr->va_size = iattr->ia_size; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 49fe52d2560..b7d3a05c062 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -397,7 +397,7 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)  		 * We can't use vfs_readdir because we have to keep the file  		 * position in sync between the coda_file and the host_file.  		 * and as such we need grab the inode mutex. */ -		struct inode *host_inode = host_file->f_path.dentry->d_inode; +		struct inode *host_inode = file_inode(host_file);  		mutex_lock(&host_inode->i_mutex);  		host_file->f_pos = coda_file->f_pos; diff --git a/fs/coda/file.c b/fs/coda/file.c index 8edd404e641..fa4c100bdc7 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -66,7 +66,7 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,  static ssize_t  coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos)  { -	struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode; +	struct inode *host_inode, *coda_inode = file_inode(coda_file);  	struct coda_file_info *cfi;  	struct file *host_file;  	ssize_t ret; @@ -78,7 +78,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo  	if (!host_file->f_op || !host_file->f_op->write)  		return -EINVAL; -	host_inode = host_file->f_path.dentry->d_inode; +	host_inode = file_inode(host_file);  	mutex_lock(&coda_inode->i_mutex);  	ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -106,8 +106,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)  	if (!host_file->f_op || !host_file->f_op->mmap)  		return -ENODEV; -	coda_inode = coda_file->f_path.dentry->d_inode; -	host_inode = host_file->f_path.dentry->d_inode; +	coda_inode = file_inode(coda_file); +	host_inode = file_inode(host_file);  	cii = ITOC(coda_inode);  	spin_lock(&cii->c_lock); @@ -178,7 +178,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)  	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),  			  coda_flags, coda_file->f_cred->fsuid); -	host_inode = cfi->cfi_container->f_path.dentry->d_inode; +	host_inode = file_inode(cfi->cfi_container);  	cii = ITOC(coda_inode);  	/* did we mmap this file? 
*/ @@ -202,7 +202,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)  int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)  {  	struct file *host_file; -	struct inode *coda_inode = coda_file->f_path.dentry->d_inode; +	struct inode *coda_inode = file_inode(coda_file);  	struct coda_file_info *cfi;  	int err; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index be2aa490948..dada9d0abed 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -20,6 +20,7 @@  #include <linux/file.h>  #include <linux/vfs.h>  #include <linux/slab.h> +#include <linux/pid_namespace.h>  #include <asm/uaccess.h> @@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)  		return NULL;  	memset(&ei->c_fid, 0, sizeof(struct CodaFid));  	ei->c_flags = 0; -	ei->c_uid = 0; +	ei->c_uid = GLOBAL_ROOT_UID;  	ei->c_cached_perm = 0;  	spin_lock_init(&ei->c_lock);  	return &ei->vfs_inode; @@ -129,7 +130,7 @@ static int get_device_index(struct coda_mount_data *data)  	f = fdget(data->fd);  	if (!f.file)  		goto Ebadf; -	inode = f.file->f_path.dentry->d_inode; +	inode = file_inode(f.file);  	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {  		fdput(f);  		goto Ebadf; @@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)  	int error;  	int idx; +	if (task_active_pid_ns(current) != &init_pid_ns) +		return -EINVAL; +  	idx = get_device_index((struct coda_mount_data *) data);  	/* Ignore errors in data, for backward compatibility */ diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index ee0981f1375..3f5de96bbb5 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -52,7 +52,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,  	struct path path;  	int error;  	struct PioctlData data; -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct inode *target_inode = NULL;  	struct coda_inode_info *cnp; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 761d5b31b18..ebc2bae6c28 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -37,6 +37,7 @@  #include <linux/list.h>  #include <linux/mutex.h>  #include <linux/device.h> +#include <linux/pid_namespace.h>  #include <asm/io.h>  #include <asm/poll.h>  #include <asm/uaccess.h> @@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file)  	struct venus_comm *vcp;  	int idx, err; +	if (task_active_pid_ns(current) != &init_pid_ns) +		return -EINVAL; + +	if (current_user_ns() != &init_user_ns) +		return -EINVAL; +  	idx = iminor(inode);  	if (idx < 0 || idx >= MAX_CODADEVS)  		return -ENODEV; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 0c68fd31fbf..3a731976dc5 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size)  		return ERR_PTR(-ENOMEM);          inp->ih.opcode = opcode; -	inp->ih.pid = current->pid; -	inp->ih.pgid = task_pgrp_nr(current); -	inp->ih.uid = current_fsuid(); +	inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns); +	inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns); +	inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());  	return (void*)inp;  } @@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,  }  int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, -		vuid_t uid) +		kuid_t uid)  {  	union inputArgs *inp;  	union outputArgs *outp; @@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,  	insize = 
SIZE(release);  	UPARG(CODA_CLOSE); -	inp->ih.uid = uid; +	inp->ih.uid = from_kuid(&init_user_ns, uid);          inp->coda_close.VFid = *fid;          inp->coda_close.flags = flags; diff --git a/fs/compat.c b/fs/compat.c index 015e1e1f87c..fe40fde2911 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1278,8 +1278,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,   * Exactly like fs/open.c:sys_open(), except that it doesn't set the   * O_LARGEFILE flag.   */ -asmlinkage long -compat_sys_open(const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)  {  	return do_sys_open(AT_FDCWD, filename, flags, mode);  } @@ -1288,8 +1287,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode)   * Exactly like fs/open.c:sys_openat(), except that it doesn't set the   * O_LARGEFILE flag.   */ -asmlinkage long -compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)  {  	return do_sys_open(dfd, filename, flags, mode);  } @@ -1739,55 +1737,13 @@ asmlinkage long compat_sys_signalfd(int ufd,  }  #endif /* CONFIG_SIGNALFD */ -#ifdef CONFIG_TIMERFD - -asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, -				   const struct compat_itimerspec __user *utmr, -				   struct compat_itimerspec __user *otmr) -{ -	int error; -	struct itimerspec t; -	struct itimerspec __user *ut; - -	if (get_compat_itimerspec(&t, utmr)) -		return -EFAULT; -	ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); -	if (copy_to_user(&ut[0], &t, sizeof(t))) -		return -EFAULT; -	error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); -	if (!error && otmr) -		error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || -			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - -	return error; -} - -asmlinkage long compat_sys_timerfd_gettime(int ufd, -				   struct compat_itimerspec __user *otmr) -{ -	int error; -	struct itimerspec t; -	struct itimerspec __user *ut; - -	ut = compat_alloc_user_space(sizeof(struct itimerspec)); -	error = sys_timerfd_gettime(ufd, ut); -	if (!error) -		error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || -			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - -	return error; -} - -#endif /* CONFIG_TIMERFD */ -  #ifdef CONFIG_FHANDLE  /*   * Exactly like fs/open.c:sys_open_by_handle_at(), except that it   * doesn't set the O_LARGEFILE flag.   
*/ -asmlinkage long -compat_sys_open_by_handle_at(int mountdirfd, -			     struct file_handle __user *handle, int flags) +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, +			     struct file_handle __user *, handle, int, flags)  {  	return do_handle_open(mountdirfd, handle, flags);  } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index e2f57a00702..3ced75f765c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1582,7 +1582,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,  	case FIBMAP:  	case FIGETBSZ:  	case FIONREAD: -		if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode)) +		if (S_ISREG(file_inode(f.file)->i_mode))  			break;  		/*FALL THROUGH*/ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 712b10f64c7..7aabc6ad4e9 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)  static int configfs_depend_prep(struct dentry *origin,  				struct config_item *target)  { -	struct configfs_dirent *child_sd, *sd = origin->d_fsdata; +	struct configfs_dirent *child_sd, *sd;  	int ret = 0; -	BUG_ON(!origin || !sd); +	BUG_ON(!origin || !origin->d_fsdata); +	sd = origin->d_fsdata;  	if (sd->s_element == target)  /* Boo-yah */  		goto out; @@ -1625,7 +1626,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)  			if (offset >= 0)  				break;  		default: -			mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); +			mutex_unlock(&file_inode(file)->i_mutex);  			return -EINVAL;  	}  	if (offset != file->f_pos) { diff --git a/fs/coredump.c b/fs/coredump.c index 177493272a6..c6479658d48 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -411,7 +411,7 @@ static void wait_for_dump_helpers(struct file *file)  {  	struct pipe_inode_info *pipe; -	pipe = file->f_path.dentry->d_inode->i_pipe; +	pipe = file_inode(file)->i_pipe;  	pipe_lock(pipe);  	pipe->readers++; @@ -501,7 +501,7 @@ void do_coredump(siginfo_t *siginfo)  	 * so we dump it as root in mode 2, and only into a controlled  	 * environment (pipe handler or fully qualified path).  	 */ -	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { +	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {  		/* Setuid core dump mode */  		flag = O_EXCL;		/* Stop rewrite attacks */  		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */ @@ -600,7 +600,7 @@ void do_coredump(siginfo_t *siginfo)  		if (IS_ERR(cprm.file))  			goto fail_unlock; -		inode = cprm.file->f_path.dentry->d_inode; +		inode = file_inode(cprm.file);  		if (inode->i_nlink > 1)  			goto close_fail;  		if (d_unhashed(cprm.file->f_path.dentry)) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index c6c3f91ecf0..3ceb9ec976e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -351,7 +351,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)   */  static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	char *buf;  	unsigned int offset; diff --git a/fs/dcache.c b/fs/dcache.c index 3a463d0c4fe..fbfae008ba4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)  EXPORT_SYMBOL(d_drop);  /* - * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag - * @dentry: dentry to drop - * - * This is called when we do a lookup on a placeholder dentry that needed to be - * looked up.  
The dentry should have been hashed in order for it to be found by - * the lookup code, but now needs to be unhashed while we do the actual lookup - * and clear the DCACHE_NEED_LOOKUP flag. - */ -void d_clear_need_lookup(struct dentry *dentry) -{ -	spin_lock(&dentry->d_lock); -	__d_drop(dentry); -	dentry->d_flags &= ~DCACHE_NEED_LOOKUP; -	spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_clear_need_lookup); - -/*   * Finish off a dentry we've decided to kill.   * dentry->d_lock must be held, returns with it unlocked.   * If ref is non-zero, then decrement the refcount too. @@ -565,13 +547,7 @@ repeat:   	if (d_unhashed(dentry))  		goto kill_it; -	/* -	 * If this dentry needs lookup, don't set the referenced flag so that it -	 * is more likely to be cleaned up by the dcache shrinker in case of -	 * memory pressure. -	 */ -	if (!d_need_lookup(dentry)) -		dentry->d_flags |= DCACHE_REFERENCED; +	dentry->d_flags |= DCACHE_REFERENCED;  	dentry_lru_add(dentry);  	dentry->d_count--; @@ -699,11 +675,10 @@ EXPORT_SYMBOL(dget_parent);  static struct dentry *__d_find_alias(struct inode *inode, int want_discon)  {  	struct dentry *alias, *discon_alias; -	struct hlist_node *p;  again:  	discon_alias = NULL; -	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {  		spin_lock(&alias->d_lock);   		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {  			if (IS_ROOT(alias) && @@ -754,10 +729,9 @@ EXPORT_SYMBOL(d_find_alias);  void d_prune_aliases(struct inode *inode)  {  	struct dentry *dentry; -	struct hlist_node *p;  restart:  	spin_lock(&inode->i_lock); -	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {  		spin_lock(&dentry->d_lock);  		if (!dentry->d_count) {  			__dget_dlock(dentry); @@ -1382,6 +1356,7 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)  	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|  				DCACHE_OP_COMPARE	|  				DCACHE_OP_REVALIDATE	| +				DCACHE_OP_WEAK_REVALIDATE	|  				DCACHE_OP_DELETE ));  	dentry->d_op = op;  	if (!op) @@ -1392,6 +1367,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)  		dentry->d_flags |= DCACHE_OP_COMPARE;  	if (op->d_revalidate)  		dentry->d_flags |= DCACHE_OP_REVALIDATE; +	if (op->d_weak_revalidate) +		dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;  	if (op->d_delete)  		dentry->d_flags |= DCACHE_OP_DELETE;  	if (op->d_prune) @@ -1464,14 +1441,13 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,  	int len = entry->d_name.len;  	const char *name = entry->d_name.name;  	unsigned int hash = entry->d_name.hash; -	struct hlist_node *p;  	if (!inode) {  		__d_instantiate(entry, NULL);  		return NULL;  	} -	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {  		/*  		 * Don't need alias->d_lock here, because aliases with  		 * d_parent == entry->d_parent are not subject to name or @@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);   */  struct dentry *d_obtain_alias(struct inode *inode)  { -	static const struct qstr anonstring = { .name = "" }; +	static const struct qstr anonstring = QSTR_INIT("/", 1);  	struct dentry *tmp;  	struct dentry *res; @@ -1696,7 +1672,6 @@ EXPORT_SYMBOL(d_splice_alias);  struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,  			struct qstr *name)  { -	int error;  	struct dentry *found;  	struct dentry *new; @@ -1705,10 +1680,12 @@ struct dentry 
*d_add_ci(struct dentry *dentry, struct inode *inode,  	 * if not go ahead and create it now.  	 */  	found = d_hash_and_lookup(dentry->d_parent, name); +	if (unlikely(IS_ERR(found))) +		goto err_out;  	if (!found) {  		new = d_alloc(dentry->d_parent, name);  		if (!new) { -			error = -ENOMEM; +			found = ERR_PTR(-ENOMEM);  			goto err_out;  		} @@ -1737,13 +1714,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,  	}  	/* -	 * We are going to instantiate this dentry, unhash it and clear the -	 * lookup flag so we can do that. -	 */ -	if (unlikely(d_need_lookup(found))) -		d_clear_need_lookup(found); - -	/*  	 * Negative dentry: instantiate it unless the inode is a directory and  	 * already has a dentry.  	 */ @@ -1756,7 +1726,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,  err_out:  	iput(inode); -	return ERR_PTR(error); +	return found;  }  EXPORT_SYMBOL(d_add_ci); @@ -1920,7 +1890,7 @@ seqretry:   * dentry is returned. The caller must use dput to free the entry when it has   * finished using it. %NULL is returned if the dentry does not exist.   */ -struct dentry *d_lookup(struct dentry *parent, struct qstr *name) +struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)  {  	struct dentry *dentry;  	unsigned seq; @@ -1950,7 +1920,7 @@ EXPORT_SYMBOL(d_lookup);   *   * __d_lookup callers must be commented.   */ -struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) +struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)  {  	unsigned int len = name->len;  	unsigned int hash = name->hash; @@ -2028,12 +1998,10 @@ next:   * @dir: Directory to search in   * @name: qstr of name we wish to find   * - * On hash failure or on lookup failure NULL is returned. + * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)   */  struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)  { -	struct dentry *dentry = NULL; -  	/*  	 * Check for a fs-specific hash function. Note that we must  	 * calculate the standard hash first, as the d_op->d_hash() @@ -2041,13 +2009,13 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)  	 */  	name->hash = full_name_hash(name->name, name->len);  	if (dir->d_flags & DCACHE_OP_HASH) { -		if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) -			goto out; +		int err = dir->d_op->d_hash(dir, dir->d_inode, name); +		if (unlikely(err < 0)) +			return ERR_PTR(err);  	} -	dentry = d_lookup(dir, name); -out: -	return dentry; +	return d_lookup(dir, name);  } +EXPORT_SYMBOL(d_hash_and_lookup);  /**   * d_validate - verify dentry provided from insecure source (deprecated) @@ -2425,7 +2393,7 @@ out_err:   */  static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)  { -	struct dentry *dparent, *aparent; +	struct dentry *dparent;  	dentry_lock_for_move(anon, dentry); @@ -2433,24 +2401,15 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)  	write_seqcount_begin(&anon->d_seq);  	dparent = dentry->d_parent; -	aparent = anon->d_parent;  	switch_names(dentry, anon);  	swap(dentry->d_name.hash, anon->d_name.hash); -	dentry->d_parent = (aparent == anon) ? dentry : aparent; -	list_del(&dentry->d_u.d_child); -	if (!IS_ROOT(dentry)) -		list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); -	else -		INIT_LIST_HEAD(&dentry->d_u.d_child); - -	anon->d_parent = (dparent == dentry) ? 
anon : dparent; +	dentry->d_parent = dentry; +	list_del_init(&dentry->d_u.d_child); +	anon->d_parent = dparent;  	list_del(&anon->d_u.d_child); -	if (!IS_ROOT(anon)) -		list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); -	else -		INIT_LIST_HEAD(&anon->d_u.d_child); +	list_add(&anon->d_u.d_child, &dparent->d_subdirs);  	write_seqcount_end(&dentry->d_seq);  	write_seqcount_end(&anon->d_seq); @@ -2753,37 +2712,6 @@ char *d_path(const struct path *path, char *buf, int buflen)  }  EXPORT_SYMBOL(d_path); -/** - * d_path_with_unreachable - return the path of a dentry - * @path: path to report - * @buf: buffer to return value in - * @buflen: buffer length - * - * The difference from d_path() is that this prepends "(unreachable)" - * to paths which are unreachable from the current process' root. - */ -char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) -{ -	char *res = buf + buflen; -	struct path root; -	int error; - -	if (path->dentry->d_op && path->dentry->d_op->d_dname) -		return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - -	get_fs_root(current->fs, &root); -	write_seqlock(&rename_lock); -	error = path_with_deleted(path, &root, &res, &buflen); -	if (error > 0) -		error = prepend_unreachable(&res, &buflen); -	write_sequnlock(&rename_lock); -	path_put(&root); -	if (error) -		res =  ERR_PTR(error); - -	return res; -} -  /*   * Helper function for dentry_operations.d_dname() members   */ @@ -3066,7 +2994,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)  	ino_t ino = 0;  	dentry = d_hash_and_lookup(dir, name); -	if (dentry) { +	if (!IS_ERR_OR_NULL(dentry)) {  		if (dentry->d_inode)  			ino = dentry->d_inode->i_ino;  		dput(dentry); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 153bb1e42e6..0c4f80b447f 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)  			opts->uid = uid;  			break;  		case Opt_gid: -			if (match_octal(&args[0], &option)) +			if (match_int(&args[0], &option))  				return -EINVAL;  			gid = make_kgid(current_user_ns(), option);  			if (!gid_valid(gid)) @@ -322,7 +322,6 @@ static struct dentry *__create_file(const char *name, umode_t mode,  	if (!parent)  		parent = debugfs_mount->mnt_root; -	dentry = NULL;  	mutex_lock(&parent->d_inode->i_mutex);  	dentry = lookup_one_len(name, parent, strlen(name));  	if (!IS_ERR(dentry)) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 472e6befc54..073d30b9d1a 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb)  	struct dentry *root = sb->s_root;  	struct pts_fs_info *fsi = DEVPTS_SB(sb);  	struct pts_mount_opts *opts = &fsi->mount_opts; +	kuid_t root_uid; +	kgid_t root_gid; + +	root_uid = make_kuid(current_user_ns(), 0); +	root_gid = make_kgid(current_user_ns(), 0); +	if (!uid_valid(root_uid) || !gid_valid(root_gid)) +		return -EINVAL;  	mutex_lock(&root->d_inode->i_mutex); @@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb)  	mode = S_IFCHR|opts->ptmxmode;  	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); +	inode->i_uid = root_uid; +	inode->i_gid = root_gid;  	d_add(dentry, inode); @@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,  	if (error)  		return ERR_PTR(error); +	/* Require newinstance for all user namespace mounts to ensure +	 * the mount options are not changed. 
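+	 * (that is, a mount request from a non-init user namespace always
+	 * gets a fresh instance instead of reusing an existing one whose
+	 * options it could otherwise change)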
+	 */ +	if ((current_user_ns() != &init_user_ns) && !opts.newinstance) +		return ERR_PTR(-EINVAL); +  	if (opts.newinstance)  		s = sget(fs_type, NULL, set_anon_super, flags, NULL);  	else @@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = {  	.name		= "devpts",  	.mount		= devpts_mount,  	.kill_sb	= devpts_kill_sb, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, +#endif  };  /* diff --git a/fs/direct-io.c b/fs/direct-io.c index cf5b44b10c6..f853263cf74 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -261,9 +261,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is  		dio->end_io(dio->iocb, offset, transferred,  			    dio->private, ret, is_async);  	} else { +		inode_dio_done(dio->inode);  		if (is_async)  			aio_complete(dio->iocb, ret, 0); -		inode_dio_done(dio->inode);  	}  	return ret; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a0387dd8b1f..7d58d5b112b 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -158,7 +158,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,  	unsigned int x;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	x = simple_strtoul(buf, NULL, 0); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 77c0f70f8fe..e7665c31f7b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -96,10 +96,13 @@ do { \  } +#define DLM_RTF_SHRINK		0x00000001 +  struct dlm_rsbtable {  	struct rb_root		keep;  	struct rb_root		toss;  	spinlock_t		lock; +	uint32_t		flags;  }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a579f30f237..1b1146670c4 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref)  	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);  	rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);  	r->res_toss_time = jiffies; +	ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;  	if (r->res_lvbptr) {  		dlm_free_lvb(r->res_lvbptr);  		r->res_lvbptr = NULL; @@ -1182,7 +1183,7 @@ static void detach_lkb(struct dlm_lkb *lkb)  static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)  {  	struct dlm_lkb *lkb; -	int rv, id; +	int rv;  	lkb = dlm_allocate_lkb(ls);  	if (!lkb) @@ -1198,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)  	mutex_init(&lkb->lkb_cb_mutex);  	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - retry: -	rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); -	if (!rv) -		return -ENOMEM; - +	idr_preload(GFP_NOFS);  	spin_lock(&ls->ls_lkbidr_spin); -	rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); -	if (!rv) -		lkb->lkb_id = id; +	rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); +	if (rv >= 0) +		lkb->lkb_id = rv;  	spin_unlock(&ls->ls_lkbidr_spin); - -	if (rv == -EAGAIN) -		goto retry; +	idr_preload_end();  	if (rv < 0) {  		log_error(ls, "create_lkb idr error %d", rv); @@ -1659,11 +1654,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b)  	char *name;  	int our_nodeid = dlm_our_nodeid();  	int remote_count = 0; +	int need_shrink = 0;  	int i, len, rv;  	memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);  	spin_lock(&ls->ls_rsbtbl[b].lock); + +	if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { +		spin_unlock(&ls->ls_rsbtbl[b].lock); +		return; +	} +  	for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {  		next = rb_next(n);  		r = rb_entry(n, struct dlm_rsb, res_hashnode); @@ -1679,6 +1681,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)  
			continue;  		} +		need_shrink = 1; +  		if (!time_after_eq(jiffies, r->res_toss_time +  				   dlm_config.ci_toss_secs * HZ)) {  			continue; @@ -1710,6 +1714,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b)  		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);  		dlm_free_rsb(r);  	} + +	if (need_shrink) +		ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; +	else +		ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;  	spin_unlock(&ls->ls_rsbtbl[b].lock);  	/* diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 2e99fb0c973..3ca79d3253b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -796,7 +796,6 @@ static int release_lockspace(struct dlm_ls *ls, int force)  	 */  	idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); -	idr_remove_all(&ls->ls_lkbidr);  	idr_destroy(&ls->ls_lkbidr);  	/* diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index dd87a31bcc2..4f5ad246582 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -177,12 +177,11 @@ static inline int nodeid_hash(int nodeid)  static struct connection *__find_con(int nodeid)  {  	int r; -	struct hlist_node *h;  	struct connection *con;  	r = nodeid_hash(nodeid); -	hlist_for_each_entry(con, h, &connection_hash[r], list) { +	hlist_for_each_entry(con, &connection_hash[r], list) {  		if (con->nodeid == nodeid)  			return con;  	} @@ -232,13 +231,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)  static void foreach_conn(void (*conn_func)(struct connection *c))  {  	int i; -	struct hlist_node *h, *n; +	struct hlist_node *n;  	struct connection *con;  	for (i = 0; i < CONN_HASH_SIZE; i++) { -		hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){ +		hlist_for_each_entry_safe(con, n, &connection_hash[i], list)  			conn_func(con); -		}  	}  } @@ -257,13 +255,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)  static struct connection *assoc2con(int assoc_id)  {  	int i; -	struct hlist_node *h;  	struct connection *con;  	mutex_lock(&connections_lock);  	for (i = 0 ; i < CONN_HASH_SIZE; i++) { -		hlist_for_each_entry(con, h, &connection_hash[i], list) { +		hlist_for_each_entry(con, &connection_hash[i], list) {  			if (con->sctp_assoc == assoc_id) {  				mutex_unlock(&connections_lock);  				return con; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index aedea28a86a..a6bc63f6e31 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -305,27 +305,26 @@ static int recover_idr_empty(struct dlm_ls *ls)  static int recover_idr_add(struct dlm_rsb *r)  {  	struct dlm_ls *ls = r->res_ls; -	int rv, id; - -	rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS); -	if (!rv) -		return -ENOMEM; +	int rv; +	idr_preload(GFP_NOFS);  	spin_lock(&ls->ls_recover_idr_lock);  	if (r->res_id) { -		spin_unlock(&ls->ls_recover_idr_lock); -		return -1; -	} -	rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id); -	if (rv) { -		spin_unlock(&ls->ls_recover_idr_lock); -		return rv; +		rv = -1; +		goto out_unlock;  	} -	r->res_id = id; +	rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT); +	if (rv < 0) +		goto out_unlock; + +	r->res_id = rv;  	ls->ls_recover_list_count++;  	dlm_hold_rsb(r); +	rv = 0; +out_unlock:  	spin_unlock(&ls->ls_recover_idr_lock); -	return 0; +	idr_preload_end(); +	return rv;  }  static void recover_idr_del(struct dlm_rsb *r) @@ -351,24 +350,21 @@ static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)  	return r;  } -static int recover_idr_clear_rsb(int id, void *p, void *data) +static void recover_idr_clear(struct dlm_ls *ls)  { -	struct dlm_ls *ls = data; -	
struct dlm_rsb *r = p; +	struct dlm_rsb *r; +	int id; -	r->res_id = 0; -	r->res_recover_locks_count = 0; -	ls->ls_recover_list_count--; +	spin_lock(&ls->ls_recover_idr_lock); -	dlm_put_rsb(r); -	return 0; -} +	idr_for_each_entry(&ls->ls_recover_idr, r, id) { +		idr_remove(&ls->ls_recover_idr, id); +		r->res_id = 0; +		r->res_recover_locks_count = 0; +		ls->ls_recover_list_count--; -static void recover_idr_clear(struct dlm_ls *ls) -{ -	spin_lock(&ls->ls_recover_idr_lock); -	idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls); -	idr_remove_all(&ls->ls_recover_idr); +		dlm_put_rsb(r); +	}  	if (ls->ls_recover_list_count != 0) {  		log_error(ls, "warning: recover_list_count %d", diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff49852b0c..911649a47dd 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf,  #endif  		return -EINVAL; -#ifdef CONFIG_COMPAT -	if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else +	/* +	 * can't compare against COMPAT/dlm_write_request32 because +	 * we don't yet know if is64bit is zero +	 */  	if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif  		return -EINVAL;  	kbuf = kzalloc(count + 1, GFP_NOFS); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index cc16562654d..e15ef38c24f 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@  config ECRYPT_FS -	tristate "eCrypt filesystem layer support (EXPERIMENTAL)" -	depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) +	tristate "eCrypt filesystem layer support" +	depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)  	select CRYPTO_ECB  	select CRYPTO_CBC  	select CRYPTO_MD5 diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ea993128155..a7b0c2dfb3d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1935,7 +1935,7 @@ static const unsigned char filename_rev_map[256] = {   * @src: Source location for the filename to encode   * @src_size: Size of the source in bytes   */ -void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, +static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,  				  unsigned char *src, size_t src_size)  {  	size_t num_blocks; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index cfb4b9fed52..7e2c6f5d798 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -509,6 +509,12 @@ ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)  	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;  } +static inline struct path * +ecryptfs_dentry_to_lower_path(struct dentry *dentry) +{ +	return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; +} +  static inline void  ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)  { diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index d45ba456812..53acc9d0c13 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -118,7 +118,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)  	lower_file = ecryptfs_file_to_lower(file);  	lower_file->f_pos = file->f_pos; -	inode = file->f_path.dentry->d_inode; +	inode = file_inode(file);  	memset(&buf, 0, sizeof(buf));  	buf.dirent = dirent;  	buf.dentry = file->f_path.dentry; @@ -133,7 +133,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)  		goto out;  	if (rc >= 0)  		
fsstack_copy_attr_atime(inode, -					lower_file->f_path.dentry->d_inode); +					file_inode(lower_file));  out:  	return rc;  } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cc7709e7c50..e0f07fb6d56 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1027,8 +1027,7 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,  	struct kstat lower_stat;  	int rc; -	rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), -			 ecryptfs_dentry_to_lower(dentry), &lower_stat); +	rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);  	if (!rc) {  		fsstack_copy_attr_all(dentry->d_inode,  				      ecryptfs_inode_to_lower(dentry->d_inode)); diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 809e67d05ca..f1ea610362c 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -102,12 +102,12 @@ int __init ecryptfs_init_kthread(void)  void ecryptfs_destroy_kthread(void)  { -	struct ecryptfs_open_req *req; +	struct ecryptfs_open_req *req, *tmp;  	mutex_lock(&ecryptfs_kthread_ctl.mux);  	ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; -	list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, -			    kthread_ctl_list) { +	list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list, +				 kthread_ctl_list) {  		list_del(&req->kthread_ctl_list);  		*req->lower_file = ERR_PTR(-EIO);  		complete(&req->done); diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 5fa2471796c..8d7a577ae49 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -115,10 +115,9 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)   */  int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)  { -	struct hlist_node *elem;  	int rc; -	hlist_for_each_entry(*daemon, elem, +	hlist_for_each_entry(*daemon,  			    &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],  			    euid_chain) {  		if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) { @@ -445,7 +444,6 @@ void ecryptfs_release_messaging(void)  		mutex_unlock(&ecryptfs_msg_ctx_lists_mux);  	}  	if (ecryptfs_daemon_hash) { -		struct hlist_node *elem;  		struct ecryptfs_daemon *daemon;  		int i; @@ -453,7 +451,7 @@ void ecryptfs_release_messaging(void)  		for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {  			int rc; -			hlist_for_each_entry(daemon, elem, +			hlist_for_each_entry(daemon,  					     &ecryptfs_daemon_hash[i],  					     euid_chain) {  				rc = ecryptfs_exorcise_daemon(daemon); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index bd1d57f98f7..564a1fa34b9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file,  			if (prev_page_end_size  			    >= i_size_read(page->mapping->host)) {  				zero_user(page, 0, PAGE_CACHE_SIZE); -			} else { +				SetPageUptodate(page); +			} else if (len < PAGE_CACHE_SIZE) {  				rc = ecryptfs_decrypt_page(page);  				if (rc) {  					printk(KERN_ERR "%s: Error decrypting " @@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file,  					ClearPageUptodate(page);  					goto out;  				} +				SetPageUptodate(page);  			} -			SetPageUptodate(page);  		}  	}  	/* If creating a page or more of holes, zero them out via truncate. 
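The two fs/ecryptfs/mmap.c hunks here encode a read-modify-write rule for the stacked page cache: a page is flagged uptodate only once the whole page holds valid plaintext, and the decrypt of existing contents is skipped when the incoming write covers the full page. A condensed sketch of that rule, using only names visible in the hunk above (not a verbatim copy of the function):

	if (prev_page_end_size >= i_size_read(page->mapping->host)) {
		/* page starts at or beyond i_size: plaintext is all zeroes */
		zero_user(page, 0, PAGE_CACHE_SIZE);
		SetPageUptodate(page);
	} else if (len < PAGE_CACHE_SIZE) {
		/* partial write into existing data: decrypt first */
		rc = ecryptfs_decrypt_page(page);
		if (rc)
			goto out;
		SetPageUptodate(page);
	}
	/* else: full-page overwrite; the page stays !PageUptodate and
	 * ecryptfs_write_end (next hunk) marks it uptodate only if the
	 * copy actually covered the whole page */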
@@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file,  		}  		goto out;  	} +	if (!PageUptodate(page)) { +		if (copied < PAGE_CACHE_SIZE) { +			rc = 0; +			goto out; +		} +		SetPageUptodate(page); +	}  	/* Fills in zeros if 'to' goes beyond inode size */  	rc = fill_zeros_to_end_of_page(page, to);  	if (rc) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index b2a34a192f4..6a160539cd2 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -40,16 +40,12 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,  			 loff_t offset, size_t size)  {  	struct file *lower_file; -	mm_segment_t fs_save;  	ssize_t rc;  	lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;  	if (!lower_file)  		return -EIO; -	fs_save = get_fs(); -	set_fs(get_ds()); -	rc = vfs_write(lower_file, data, size, &offset); -	set_fs(fs_save); +	rc = kernel_write(lower_file, data, size, offset);  	mark_inode_dirty_sync(ecryptfs_inode);  	return rc;  } diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 6ebfc1c207a..d020e3c30fe 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -1,6 +1,6 @@  config EFS_FS -	tristate "EFS file system support (read only) (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "EFS file system support (read only)" +	depends on BLOCK  	help  	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard  	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 7ee6f7e3a60..055a9e9ca74 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -20,7 +20,7 @@ const struct inode_operations efs_dir_inode_operations = {  };  static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct buffer_head *bh;  	struct efs_dir		*dirblock; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index be56b21435f..9fec1836057 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1313,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even  	 * otherwise we might miss an event that happens between the  	 * f_op->poll() call and the new event set registering.  	 */ -	epi->event.events = event->events; +	epi->event.events = event->events; /* need barrier below */  	pt._key = event->events;  	epi->event.data = event->data; /* protected by mtx */  	if (epi->event.events & EPOLLWAKEUP) { @@ -1324,6 +1324,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even  	}  	/* +	 * The following barrier has two effects: +	 * +	 * 1) Flush epi changes above to other CPUs.  This ensures +	 *    we do not miss events from ep_poll_callback if an +	 *    event occurs immediately after we call f_op->poll(). +	 *    We need this because we did not take ep->lock while +	 *    changing epi above (but ep_poll_callback does take +	 *    ep->lock). +	 * +	 * 2) We also need to ensure we do not miss _past_ events +	 *    when calling f_op->poll().  This barrier also +	 *    pairs with the barrier in wq_has_sleeper (see +	 *    comments for wq_has_sleeper). +	 * +	 * This barrier will now guarantee ep_poll_callback or f_op->poll +	 * (or both) will notice the readiness of an item. +	 */ +	smp_mb(); + +	/*  	 * Get current event bits. We can safely use the file* here because  	 * its usage count has been increased by the caller of this function.  	 
*/ diff --git a/fs/exec.c b/fs/exec.c index d8e1191cb11..a96a4885bbb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -123,7 +123,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)  		goto out;  	error = -EINVAL; -	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) +	if (!S_ISREG(file_inode(file)->i_mode))  		goto exit;  	error = -EACCES; @@ -355,7 +355,7 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)   * flags, permissions, and offset, so we use temporary values.  We'll update   * them later in setup_arg_pages().   */ -int bprm_mm_init(struct linux_binprm *bprm) +static int bprm_mm_init(struct linux_binprm *bprm)  {  	int err;  	struct mm_struct *mm = NULL; @@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max)  			if (IS_ERR(p))  				return -EFAULT; -			if (i++ >= max) +			if (i >= max)  				return -E2BIG; +			++i;  			if (fatal_signal_pending(current))  				return -ERESTARTNOHAND; @@ -763,7 +764,7 @@ struct file *open_exec(const char *name)  		goto out;  	err = -EACCES; -	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) +	if (!S_ISREG(file_inode(file)->i_mode))  		goto exit;  	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) @@ -1097,7 +1098,7 @@ EXPORT_SYMBOL(flush_old_exec);  void would_dump(struct linux_binprm *bprm, struct file *file)  { -	if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0) +	if (inode_permission(file_inode(file), MAY_READ) < 0)  		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;  }  EXPORT_SYMBOL(would_dump); @@ -1110,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm)  	current->sas_ss_sp = current->sas_ss_size = 0;  	if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) -		set_dumpable(current->mm, SUID_DUMPABLE_ENABLED); +		set_dumpable(current->mm, SUID_DUMP_USER);  	else  		set_dumpable(current->mm, suid_dumpable); @@ -1175,9 +1176,24 @@ void free_bprm(struct linux_binprm *bprm)  		mutex_unlock(&current->signal->cred_guard_mutex);  		abort_creds(bprm->cred);  	} +	/* If a binfmt changed the interp, free it. */ +	if (bprm->interp != bprm->filename) +		kfree(bprm->interp);  	kfree(bprm);  } +int bprm_change_interp(char *interp, struct linux_binprm *bprm) +{ +	/* If a binfmt changed the interp, free it first. 
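+	 * (bprm->interp starts out aliasing bprm->filename, so it is only
+	 * kfree'd after an earlier call has swapped in a kstrdup'd copy;
+	 * free_bprm above applies the same alias test)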
*/ +	if (bprm->interp != bprm->filename) +		kfree(bprm->interp); +	bprm->interp = kstrdup(interp, GFP_KERNEL); +	if (!bprm->interp) +		return -ENOMEM; +	return 0; +} +EXPORT_SYMBOL(bprm_change_interp); +  /*   * install the new credentials for this executable   */ @@ -1254,7 +1270,7 @@ static int check_unsafe_exec(struct linux_binprm *bprm)  int prepare_binprm(struct linux_binprm *bprm)  {  	umode_t mode; -	struct inode * inode = bprm->file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(bprm->file);  	int retval;  	mode = inode->i_mode; @@ -1623,17 +1639,17 @@ EXPORT_SYMBOL(set_binfmt);  void set_dumpable(struct mm_struct *mm, int value)  {  	switch (value) { -	case SUID_DUMPABLE_DISABLED: +	case SUID_DUMP_DISABLE:  		clear_bit(MMF_DUMPABLE, &mm->flags);  		smp_wmb();  		clear_bit(MMF_DUMP_SECURELY, &mm->flags);  		break; -	case SUID_DUMPABLE_ENABLED: +	case SUID_DUMP_USER:  		set_bit(MMF_DUMPABLE, &mm->flags);  		smp_wmb();  		clear_bit(MMF_DUMP_SECURELY, &mm->flags);  		break; -	case SUID_DUMPABLE_SAFE: +	case SUID_DUMP_ROOT:  		set_bit(MMF_DUMP_SECURELY, &mm->flags);  		smp_wmb();  		set_bit(MMF_DUMPABLE, &mm->flags); @@ -1646,7 +1662,7 @@ int __get_dumpable(unsigned long mm_flags)  	int ret;  	ret = mm_flags & MMF_DUMPABLE_MASK; -	return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret; +	return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;  }  int get_dumpable(struct mm_struct *mm) @@ -1654,7 +1670,6 @@ int get_dumpable(struct mm_struct *mm)  	return __get_dumpable(mm->flags);  } -#ifdef __ARCH_WANT_SYS_EXECVE  SYSCALL_DEFINE3(execve,  		const char __user *, filename,  		const char __user *const __user *, argv, @@ -1682,23 +1697,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,  	return error;  }  #endif -#endif - -#ifdef __ARCH_WANT_KERNEL_EXECVE -int kernel_execve(const char *filename, -		  const char *const argv[], -		  const char *const envp[]) -{ -	int ret = do_execve(filename, -			(const char __user *const __user *)argv, -			(const char __user *const __user *)envp); -	if (ret < 0) -		return ret; - -	/* -	 * We were successful.  We won't be returning to our caller, but -	 * instead to user space by manipulating the kernel stack. 
-	 */ -	ret_from_kernel_execve(current_pt_regs()); -} -#endif diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index c61e62ac231..46375896cfc 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -242,7 +242,7 @@ static int  exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	loff_t pos = filp->f_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	unsigned int offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT;  	unsigned long npages = dir_pages(inode); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 606bb074c50..262fc994098 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -44,14 +44,13 @@ find_acceptable_alias(struct dentry *result,  {  	struct dentry *dentry, *toput = NULL;  	struct inode *inode; -	struct hlist_node *p;  	if (acceptable(context, result))  		return result;  	inode = result->d_inode;  	spin_lock(&inode->i_lock); -	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {  		dget(dentry);  		spin_unlock(&inode->i_lock);  		if (toput) @@ -322,10 +321,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,  	if (parent && (len < 4)) {  		*max_len = 4; -		return 255; +		return FILEID_INVALID;  	} else if (len < 2) {  		*max_len = 2; -		return 255; +		return FILEID_INVALID;  	}  	len = 2; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 2616d0ea5c5..9f9992b3792 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)  	return bh;  } -static void release_blocks(struct super_block *sb, int count) -{ -	if (count) { -		struct ext2_sb_info *sbi = EXT2_SB(sb); - -		percpu_counter_add(&sbi->s_freeblocks_counter, count); -	} -} -  static void group_adjust_blocks(struct super_block *sb, int group_no,  	struct ext2_group_desc *desc, struct buffer_head *bh, int count)  { @@ -568,8 +559,11 @@ do_more:  	}  error_return:  	brelse(bitmap_bh); -	release_blocks(sb, freed); -	dquot_free_block_nodirty(inode, freed); +	if (freed) { +		percpu_counter_add(&sbi->s_freeblocks_counter, freed); +		dquot_free_block_nodirty(inode, freed); +		mark_inode_dirty(inode); +	}  }  /** @@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,  	*errp = -ENOSPC;  	sb = inode->i_sb; -	if (!sb) { -		printk("ext2_new_blocks: nonexistent device"); -		return 0; -	}  	/*  	 * Check quota for allocation of this block. 
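Both fs/ext2/balloc.c hunks lean on the same convention: dquot_free_block_nodirty() returns quota without marking the inode dirty, so each call site pairs it with one explicit mark_inode_dirty() instead of paying that cost per freed block. A sketch of the over-allocation case the next hunk rewrites, assuming num is the number of blocks actually obtained out of the *count requested:

	if (num < *count) {
		/* hand back the quota claimed for blocks we never received */
		dquot_free_block_nodirty(inode, *count - num);
		mark_inode_dirty(inode);
		*count = num;
	}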
@@ -1416,9 +1406,11 @@ allocated:  	*errp = 0;  	brelse(bitmap_bh); -	dquot_free_block_nodirty(inode, *count-num); -	mark_inode_dirty(inode); -	*count = num; +	if (num < *count) { +		dquot_free_block_nodirty(inode, *count-num); +		mark_inode_dirty(inode); +		*count = num; +	}  	return ret_block;  io_error: diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 0f4f5c92925..4237722bfd2 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -290,7 +290,7 @@ static int  ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)  {  	loff_t pos = filp->f_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	unsigned int offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6363ac66faf..c3881e56662 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -495,6 +495,10 @@ static int ext2_alloc_branch(struct inode *inode,  		 * parent to disk.  		 */  		bh = sb_getblk(inode->i_sb, new_blocks[n-1]); +		if (unlikely(!bh)) { +			err = -ENOMEM; +			goto failed; +		}  		branch[n].bh = bh;  		lock_buffer(bh);  		memset(bh->b_data, 0, blocksize); @@ -523,6 +527,14 @@ static int ext2_alloc_branch(struct inode *inode,  	}  	*blks = num;  	return err; + +failed: +	for (i = 1; i < n; i++) +		bforget(branch[i].bh); +	for (i = 0; i < indirect_blks; i++) +		ext2_free_blocks(inode, new_blocks[i], 1); +	ext2_free_blocks(inode, new_blocks[i], num); +	return err;  }  /** diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 2de655f5d62..5d46c09863f 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -19,7 +19,7 @@  long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ext2_inode_info *ei = EXT2_I(inode);  	unsigned int flags;  	unsigned short rsv_window_size; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fa04d023177..7f68c811402 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,  			bh = sb_bread(sb, tmp_bh.b_blocknr);  		else  			bh = sb_getblk(sb, tmp_bh.b_blocknr); -		if (!bh) { +		if (unlikely(!bh)) {  			err = -EIO;  			goto out;  		} diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index b6754dbbce3..2d7557db3ae 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,  			ea_idebug(inode, "creating block %d", block);  			new_bh = sb_getblk(sb, block); -			if (!new_bh) { +			if (unlikely(!new_bh)) {  				ext2_free_blocks(inode, block, 1);  				mark_inode_dirty(inode); -				error = -EIO; +				error = -ENOMEM;  				goto cleanup;  			}  			lock_buffer(new_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index dd91264ba94..87eccbbca25 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -99,7 +99,7 @@ static int ext3_readdir(struct file * filp,  	int i, stored;  	struct ext3_dir_entry_2 *de;  	int err; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	int ret = 0;  	int dir_has_error = 0; @@ -114,7 +114,7 @@ static int ext3_readdir(struct file * filp,  		 * We don't set the inode dirty flag since it's not  		 * critical that it get flushed back to the disk.  		 
*/ -		EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; +		EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;  	}  	stored = 0;  	offset = filp->f_pos & (sb->s_blocksize - 1); @@ -457,7 +457,7 @@ static int call_filldir(struct file * filp, void * dirent,  {  	struct dir_private_info *info = filp->private_data;  	loff_t	curr_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block * sb;  	int error; @@ -487,7 +487,7 @@ static int ext3_dx_readdir(struct file * filp,  			 void * dirent, filldir_t filldir)  {  	struct dir_private_info *info = filp->private_data; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct fname *fname;  	int	ret; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b176d425354..d512c4bc4ad 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,  		 * parent to disk.  		 */  		bh = sb_getblk(inode->i_sb, new_blocks[n-1]); +		if (unlikely(!bh)) { +			err = -ENOMEM; +			goto failed; +		}  		branch[n].bh = bh;  		lock_buffer(bh);  		BUFFER_TRACE(bh, "call get_create_access"); @@ -717,7 +721,7 @@ failed:  		BUFFER_TRACE(branch[i].bh, "call journal_forget");  		ext3_journal_forget(handle, branch[i].bh);  	} -	for (i = 0; i <indirect_blks; i++) +	for (i = 0; i < indirect_blks; i++)  		ext3_free_blocks(handle, inode, new_blocks[i], 1);  	ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,  	if (!err && buffer_mapped(&dummy)) {  		struct buffer_head *bh;  		bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -		if (!bh) { -			*errp = -EIO; +		if (unlikely(!bh)) { +			*errp = -ENOMEM;  			goto err;  		}  		if (buffer_new(&dummy)) { @@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode,  		return -EIO;  	bh = sb_getblk(inode->i_sb, block); -	if (!bh) { +	if (unlikely(!bh)) {  		ext3_error (inode->i_sb, "ext3_get_inode_loc",  				"unable to read inode block - "  				"inode=%lu, block="E3FSBLK,  				 inode->i_ino, block); -		return -EIO; +		return -ENOMEM;  	}  	if (!buffer_uptodate(bh)) {  		lock_buffer(bh); @@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode,  			bitmap_bh = sb_getblk(inode->i_sb,  					le32_to_cpu(desc->bg_inode_bitmap)); -			if (!bitmap_bh) +			if (unlikely(!bitmap_bh))  				goto make_io;  			/* diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 677a5c27dc6..4d96e9a6453 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -14,7 +14,7 @@  long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ext3_inode_info *ei = EXT3_I(inode);  	unsigned int flags;  	unsigned short rsv_window_size; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 890b8947c54..692de13e359 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,7 +36,6 @@  #define NAMEI_RA_CHUNKS  2  #define NAMEI_RA_BLOCKS  4  #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))  static struct buffer_head *ext3_append(handle_t *handle,  					struct inode *inode, @@ -624,7 +623,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,  	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,  		       start_minor_hash)); -	dir = 
dir_file->f_path.dentry->d_inode; +	dir = file_inode(dir_file);  	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {  		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;  		if (hinfo.hash_version <= DX_HASH_TEA) @@ -638,7 +637,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,  	}  	hinfo.hash = start_hash;  	hinfo.minor_hash = 0; -	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); +	frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);  	if (!frame)  		return err; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0f814f3450d..27105655502 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,  	int err;  	bh = sb_getblk(sb, blk); -	if (!bh) -		return ERR_PTR(-EIO); +	if (unlikely(!bh)) +		return ERR_PTR(-ENOMEM);  	if ((err = ext3_journal_get_write_access(handle, bh))) {  		brelse(bh);  		bh = ERR_PTR(err); @@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,  			goto exit_bh;  		gdb = sb_getblk(sb, block); -		if (!gdb) { -			err = -EIO; +		if (unlikely(!gdb)) { +			err = -ENOMEM;  			goto exit_bh;  		}  		if ((err = ext3_journal_get_write_access(handle, gdb))) { @@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb,  			break;  		bh = sb_getblk(sb, group * bpg + blk_off); -		if (!bh) { -			err = -EIO; +		if (unlikely(!bh)) { +			err = -ENOMEM;  			break;  		}  		ext3_debug("update metadata backup %#04lx\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e50223b329..5546ca225ff 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)  			"Not enough memory for storing quotafile name");  		return 0;  	} -	if (sbi->s_qf_names[qtype] && -		strcmp(sbi->s_qf_names[qtype], qname)) { -		ext3_msg(sb, KERN_ERR, -			"%s quota file already specified", QTYPE2NAME(qtype)); +	if (sbi->s_qf_names[qtype]) { +		int same = !strcmp(sbi->s_qf_names[qtype], qname); +  		kfree(qname); -		return 0; +		if (!same) { +			ext3_msg(sb, KERN_ERR, +				 "%s quota file already specified", +				 QTYPE2NAME(qtype)); +		} +		return same;  	} -	sbi->s_qf_names[qtype] = qname; -	if (strchr(sbi->s_qf_names[qtype], '/')) { +	if (strchr(qname, '/')) {  		ext3_msg(sb, KERN_ERR,  			"quotafile must be on filesystem root"); -		kfree(sbi->s_qf_names[qtype]); -		sbi->s_qf_names[qtype] = NULL; +		kfree(qname);  		return 0;  	} +	sbi->s_qf_names[qtype] = qname;  	set_opt(sbi->s_mount_opt, QUOTA);  	return 1;  } @@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {  			" when quota turned on");  		return 0;  	} -	/* -	 * The space will be released later when all options are confirmed -	 * to be correct -	 */ -	sbi->s_qf_names[qtype] = NULL; +	if (sbi->s_qf_names[qtype]) { +		kfree(sbi->s_qf_names[qtype]); +		sbi->s_qf_names[qtype] = NULL; +	}  	return 1;  }  #endif @@ -2065,6 +2067,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)  		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":  		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered":  		"writeback"); +	sb->s_flags |= MS_SNAP_STABLE;  	return 0; @@ -2605,7 +2608,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)  #ifdef CONFIG_QUOTA  	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;  	for (i = 0; i < MAXQUOTAS; i++) -		old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +		if (sbi->s_qf_names[i]) { +			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], +							 GFP_KERNEL); +			if (!old_opts.s_qf_names[i]) { +				int j; + +				for (j = 0; j < i; j++) +					kfree(old_opts.s_qf_names[j]); +				return -ENOMEM; +			} +		} else +			old_opts.s_qf_names[i] = NULL;  #endif  	/* @@ -2698,9 +2712,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)  #ifdef CONFIG_QUOTA  	/* Release old quota file names */  	for (i = 0; i < MAXQUOTAS; i++) -		if (old_opts.s_qf_names[i] && -		    old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -			kfree(old_opts.s_qf_names[i]); +		kfree(old_opts.s_qf_names[i]);  #endif  	if (enable_quota)  		dquot_resume(sb, -1); @@ -2714,9 +2726,7 @@ restore_opts:  #ifdef CONFIG_QUOTA  	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;  	for (i = 0; i < MAXQUOTAS; i++) { -		if (sbi->s_qf_names[i] && -		    old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -			kfree(sbi->s_qf_names[i]); +		kfree(sbi->s_qf_names[i]);  		sbi->s_qf_names[i] = old_opts.s_qf_names[i];  	}  #endif diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index d22ebb7a4f5..b1fc96383e0 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -813,10 +813,10 @@ inserted:  			ea_idebug(inode, "creating block %d", block);  			new_bh = sb_getblk(sb, block); -			if (!new_bh) { +			if (unlikely(!new_bh)) {  getblk_failed:  				ext3_free_blocks(handle, inode, block, 1); -				error = -EIO; +				error = -ENOMEM;  				goto cleanup;  			}  			lock_buffer(new_bh); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 0a475c88185..987358740cb 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23  config EXT4_FS_POSIX_ACL  	bool "Ext4 POSIX Access Control Lists" +	depends on EXT4_FS  	select FS_POSIX_ACL  	help  	  POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL  config EXT4_FS_SECURITY  	bool "Ext4 Security Labels" +	depends on EXT4_FS  	help  	  Security labels support alternative access control models  	  implemented by security modules like SELinux.  
This option diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index e6e0d988439..39a54a0e9fe 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode)  	if (error)  		return error;  retry: -	handle = ext4_journal_start(inode, -			EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); +	handle = ext4_journal_start(inode, EXT4_HT_XATTR, +				    ext4_jbd2_credits_xattr(inode));  	if (IS_ERR(handle)) {  		error = PTR_ERR(handle);  		ext4_std_error(inode->i_sb, error); @@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,  		acl = NULL;  retry: -	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); +	handle = ext4_journal_start(inode, EXT4_HT_XATTR, +				    ext4_jbd2_credits_xattr(inode));  	if (IS_ERR(handle)) {  		error = PTR_ERR(handle);  		goto release_and_out; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index cf1821784a1..92e68b33fff 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb,  }  /** - * ext4_read_block_bitmap() + * ext4_read_block_bitmap_nowait()   * @sb:			super block   * @block_group:	given block group   * @@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)  	struct buffer_head *bh;  	bh = ext4_read_block_bitmap_nowait(sb, block_group); +	if (!bh) +		return NULL;  	if (ext4_wait_block_bitmap(sb, block_group, bh)) {  		put_bh(bh);  		return NULL; @@ -482,11 +484,16 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,  	free_clusters  = percpu_counter_read_positive(fcc);  	dirty_clusters = percpu_counter_read_positive(dcc); -	root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + +	/* +	 * r_blocks_count should always be multiple of the cluster ratio so +	 * we are safe to do a plane bit shift only. +	 */ +	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;  	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <  					EXT4_FREECLUSTERS_WATERMARK) { -		free_clusters  = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); +		free_clusters  = percpu_counter_sum_positive(fcc);  		dirty_clusters = percpu_counter_sum_positive(dcc);  	}  	/* Check whether we have space after accounting for current @@ -628,7 +635,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)  	brelse(bitmap_bh);  	printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"  	       ", computed = %llu, %llu\n", -	       EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), +	       EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),  	       desc_count, bitmap_count);  	return bitmap_count;  #else diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 80a28b29727..d8cd1f0f466 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,7 @@ static int ext4_readdir(struct file *filp,  	int i, stored;  	struct ext4_dir_entry_2 *de;  	int err; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	int ret = 0;  	int dir_has_error = 0; @@ -133,7 +133,7 @@ static int ext4_readdir(struct file *filp,  		 * We don't set the inode dirty flag since it's not  		 * critical that it get flushed back to the disk.  		 
*/ -		ext4_clear_inode_flag(filp->f_path.dentry->d_inode, +		ext4_clear_inode_flag(file_inode(filp),  				      EXT4_INODE_INDEX);  	}  	stored = 0; @@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp,  					"at offset %llu",  					(unsigned long long)filp->f_pos);  			filp->f_pos += sb->s_blocksize - offset; +			brelse(bh);  			continue;  		}  		set_buffer_verified(bh); @@ -333,7 +334,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)   *   * For non-htree, ext4_llseek already chooses the proper max offset.   */ -loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)  {  	struct inode *inode = file->f_mapping->host;  	int dx_dir = is_dx_dir(inode); @@ -494,7 +495,7 @@ static int call_filldir(struct file *filp, void *dirent,  {  	struct dir_private_info *info = filp->private_data;  	loff_t	curr_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb;  	int error; @@ -526,7 +527,7 @@ static int ext4_dx_readdir(struct file *filp,  			 void *dirent, filldir_t filldir)  {  	struct dir_private_info *info = filp->private_data; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct fname *fname;  	int	ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8462eb3c33a..4a01ba31526 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -194,8 +194,7 @@ struct mpage_da_data {   */  #define	EXT4_IO_END_UNWRITTEN	0x0001  #define EXT4_IO_END_ERROR	0x0002 -#define EXT4_IO_END_QUEUED	0x0004 -#define EXT4_IO_END_DIRECT	0x0008 +#define EXT4_IO_END_DIRECT	0x0004  struct ext4_io_page {  	struct page	*p_page; @@ -215,10 +214,8 @@ typedef struct ext4_io_end {  	struct list_head	list;		/* per-file finished IO list */  	struct inode		*inode;		/* file being written to */  	unsigned int		flag;		/* unwritten or not */ -	struct page		*page;		/* for writepage() path */  	loff_t			offset;		/* offset in the file */  	ssize_t			size;		/* size of the extent */ -	struct work_struct	work;		/* data work queue */  	struct kiocb		*iocb;		/* iocb struct for AIO */  	int			result;		/* error value for AIO */  	int			num_io_pages;   /* for writepages() */ @@ -582,6 +579,8 @@ enum {  #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080  	/* Do not take i_data_sem locking in ext4_map_blocks */  #define EXT4_GET_BLOCKS_NO_LOCK			0x0100 +	/* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200  /*   * Flags used by ext4_free_blocks @@ -810,17 +809,6 @@ do {									       \  #endif /* defined(__KERNEL__) || defined(__linux__) */ -/* - * storage for cached extent - * If ec_len == 0, then the cache is invalid. - * If ec_start == 0, then the cache represents a gap (null mapping) - */ -struct ext4_ext_cache { -	ext4_fsblk_t	ec_start; -	ext4_lblk_t	ec_block; -	__u32		ec_len; /* must be 32bit to return holes */ -}; -  #include "extents_status.h"  /* @@ -887,7 +875,6 @@ struct ext4_inode_info {  	struct inode vfs_inode;  	struct jbd2_inode *jinode; -	struct ext4_ext_cache i_cached_extent;  	/*  	 * File creation time. Its function is same as that of  	 * struct timespec i_{a,c,m}time in the generic inode. 
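The dir.c hunks above swap the open-coded filp->f_path.dentry->d_inode chain for the file_inode() helper. A minimal sketch of the pattern follows, assuming the 3.9-era helper that simply returns the inode cached in struct file; the exact definition and the ext4_demo_isize() caller are illustrative assumptions, not part of this series:

/*
 * Hedged sketch only: file_inode() is the VFS accessor these hunks
 * adopt; its definition here is an assumption for illustration.
 */
static inline struct inode *file_inode(const struct file *f)
{
	return f->f_inode;	/* replaces f->f_path.dentry->d_inode */
}

/* Hypothetical caller, showing the converted call-site style. */
static loff_t ext4_demo_isize(struct file *filp)
{
	return i_size_read(file_inode(filp));
}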
@@ -901,6 +888,8 @@ struct ext4_inode_info {  	/* extents status tree */  	struct ext4_es_tree i_es_tree;  	rwlock_t i_es_lock; +	struct list_head i_es_lru; +	unsigned int i_es_lru_nr;	/* protected by i_es_lock */  	/* ialloc */  	ext4_group_t	i_last_alloc_group; @@ -930,6 +919,7 @@ struct ext4_inode_info {  	spinlock_t i_completed_io_lock;  	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */  	atomic_t i_unwritten; /* Nr. of inflight conversions pending */ +	struct work_struct i_unwritten_work;	/* deferred extent conversion */  	spinlock_t i_block_reservation_lock; @@ -985,7 +975,6 @@ struct ext4_inode_info {  #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */  #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */  #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */  #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */  #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */  #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */ @@ -1316,6 +1305,12 @@ struct ext4_sb_info {  	/* Precomputed FS UUID checksum for seeding other checksums */  	__u32 s_csum_seed; + +	/* Reclaim extents from extent status tree */ +	struct shrinker s_es_shrinker; +	struct list_head s_es_lru; +	struct percpu_counter s_extent_cache_cnt; +	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;  };  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2007,9 +2002,20 @@ extern int ext4fs_dirhash(const char *name, int len, struct  			  dx_hash_info *hinfo);  /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, -				    const struct qstr *qstr, __u32 goal, -				    uid_t *owner); +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, +				      const struct qstr *qstr, __u32 goal, +				      uid_t *owner, int handle_type, +				      unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ +	__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ +			 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ +				    type, nblocks)		    \ +	__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ +			 (type), __LINE__, (nblocks)) + +  extern void ext4_free_inode(handle_t *, struct inode *);  extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);  extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -2103,6 +2109,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);  extern void ext4_ind_truncate(struct inode *inode); +extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);  /* ioctl.c */  extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2151,6 +2158,8 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags);  extern void ext4_kvfree(void *ptr);  extern int ext4_alloc_flex_bg_array(struct super_block *sb,  				    ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, +				     char nbuf[16]);  extern __printf(4, 5)  void __ext4_error(struct super_block *, const char *, unsigned int,  		  const char *, ...); @@ -2227,6 +2236,8 @@ extern int 
ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,  				       struct ext4_group_desc *gdp);  extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,  				     struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, +				    ext4_group_t first_not_zeroed);  static inline int ext4_has_group_desc_csum(struct super_block *sb)  { @@ -2454,6 +2465,75 @@ extern const struct file_operations ext4_file_operations;  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);  extern void ext4_unwritten_wait(struct inode *inode); +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, +				   struct ext4_iloc *iloc, +				   void *buffer, loff_t pos, +				   unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, +				    unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, +				 unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, +					 struct inode *inode, +					 loff_t pos, unsigned len, +					 unsigned flags, +					 struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, +				      loff_t pos, unsigned len, +				      unsigned copied, +				      struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, +				  unsigned len, +				  struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, +					   struct inode *inode, +					   loff_t pos, unsigned len, +					   unsigned flags, +					   struct page **pagep, +					   void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, +					 unsigned len, unsigned copied, +					 struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +				     struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, +				      struct inode *parent, +				      struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, +				void *dirent, filldir_t filldir, +				int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, +					const struct qstr *d_name, +					struct ext4_dir_entry_2 **res_dir, +					int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, +				    struct inode *dir, +				    struct ext4_dir_entry_2 *de_del, +				    struct buffer_head *bh, +				    int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, +					struct ext4_dir_entry_2 **parent_de, +					int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, +				   struct fiemap_extent_info *fieinfo, +				   int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, +					 struct inode *inode, +					 int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); +  /* namei.c */  extern const struct inode_operations 
ext4_dir_inode_operations;  extern const struct inode_operations ext4_special_inode_operations; @@ -2520,6 +2600,9 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,  						  struct ext4_ext_path *);  extern void ext4_ext_drop_refs(struct ext4_ext_path *);  extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, +				    ext4_lblk_t lblk_start, +				    ext4_lblk_t lblk_end);  extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);  extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  			__u64 start, __u64 len); @@ -2537,6 +2620,7 @@ extern void ext4_exit_pageio(void);  extern void ext4_ioend_wait(struct inode *);  extern void ext4_free_io_end(ext4_io_end_t *io);  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern void ext4_end_io_work(struct work_struct *work);  extern void ext4_io_submit(struct ext4_io_submit *io);  extern int ext4_bio_write_page(struct ext4_io_submit *io,  			       struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 487fda12bc0..8643ff5bbeb 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode)  	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);  } -static inline void -ext4_ext_invalidate_cache(struct inode *inode) -{ -	EXT4_I(inode)->i_cached_extent.ec_len = 0; -} -  static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)  {  	/* We can not have an uninitialized extent of zero length! */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b4323ba846b..7058975e3a5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,6 +6,108 @@  #include <trace/events/ext4.h> +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ +	handle_t *handle = current->journal_info; +	unsigned long ref_cnt = (unsigned long)handle; + +	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + +	ref_cnt++; +	handle = (handle_t *)ref_cnt; + +	current->journal_info = handle; +	return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ +	unsigned long ref_cnt = (unsigned long)handle; + +	BUG_ON(ref_cnt == 0); + +	ref_cnt--; +	handle = (handle_t *)ref_cnt; + +	current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +				  int type, int nblocks) +{ +	journal_t *journal; + +	trace_ext4_journal_start(sb, nblocks, _RET_IP_); +	if (sb->s_flags & MS_RDONLY) +		return ERR_PTR(-EROFS); + +	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); +	journal = EXT4_SB(sb)->s_journal; +	if (!journal) +		return ext4_get_nojournal(); +	/* +	 * Special case here: if the journal has aborted behind our +	 * backs (eg. EIO in the commit thread), then we still need to +	 * take the FS itself readonly cleanly. 
+	 */ +	if (is_journal_aborted(journal)) { +		ext4_abort(sb, "Detected aborted journal"); +		return ERR_PTR(-EROFS); +	} +	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ +	struct super_block *sb; +	int err; +	int rc; + +	if (!ext4_handle_valid(handle)) { +		ext4_put_nojournal(handle); +		return 0; +	} +	sb = handle->h_transaction->t_journal->j_private; +	err = handle->h_err; +	rc = jbd2_journal_stop(handle); + +	if (!err) +		err = rc; +	if (err) +		__ext4_std_error(sb, where, line, err); +	return err; +} + +void ext4_journal_abort_handle(const char *caller, unsigned int line, +			       const char *err_fn, struct buffer_head *bh, +			       handle_t *handle, int err) +{ +	char nbuf[16]; +	const char *errstr = ext4_decode_error(NULL, err, nbuf); + +	BUG_ON(!ext4_handle_valid(handle)); + +	if (bh) +		BUFFER_TRACE(bh, "abort"); + +	if (!handle->h_err) +		handle->h_err = err; + +	if (is_handle_aborted(handle)) +		return; + +	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", +	       caller, line, errstr, err_fn); + +	jbd2_journal_abort_handle(handle); +} +  int __ext4_journal_get_write_access(const char *where, unsigned int line,  				    handle_t *handle, struct buffer_head *bh)  { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 7177f9b21cb..4c216b1bf20 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -59,12 +59,6 @@  #define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \  					EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be - * generous.  We can grow the delete transaction later if necessary. */ - -#define EXT4_DELETE_TRANS_BLOCKS(sb)	(2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) -  /* Define an arbitrary limit for the amount of data we will anticipate   * writing to any given transaction.  For unbounded transactions such as   * write(2) and truncate(2) we can write more than this, but we always @@ -110,6 +104,36 @@  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ +	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + +	/* +	 * In case of inline data, we may push out the data to a block, +	 * so we need to reserve credits for this eventuality +	 */ +	if (ext4_has_inline_data(inode)) +		credits += ext4_writepage_trans_blocks(inode) + 1; +	return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC             0 +#define EXT4_HT_INODE            1 +#define EXT4_HT_WRITE_PAGE       2 +#define EXT4_HT_MAP_BLOCKS       3 +#define EXT4_HT_DIR              4 +#define EXT4_HT_TRUNCATE         5 +#define EXT4_HT_QUOTA            6 +#define EXT4_HT_RESIZE           7 +#define EXT4_HT_MIGRATE          8 +#define EXT4_HT_MOVE_EXTENTS     9 +#define EXT4_HT_XATTR           10 +#define EXT4_HT_MAX             11 +  /**   *   struct ext4_journal_cb_entry - Base structure for callback information.   
* @@ -234,7 +258,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,  #define ext4_handle_dirty_super(handle, sb) \  	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +				  int type, int nblocks);  int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);  #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -268,9 +293,17 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)  	return 1;  } -static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +#define ext4_journal_start_sb(sb, type, nblocks)			\ +	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) + +#define ext4_journal_start(inode, type, nblocks)			\ +	__ext4_journal_start((inode), __LINE__, (type), (nblocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, +					     unsigned int line, int type, +					     int nblocks)  { -	return ext4_journal_start_sb(inode->i_sb, nblocks); +	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);  }  #define ext4_journal_stop(handle) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26af22832a8..28dd8eeea6a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle,  			     int flags);  static int ext4_find_delayed_extent(struct inode *inode, -				    struct ext4_ext_cache *newex); +				    struct extent_status *newes);  static int ext4_ext_truncate_extend_restart(handle_t *handle,  					    struct inode *inode, @@ -714,7 +714,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)  	eh->eh_magic = EXT4_EXT_MAGIC;  	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));  	ext4_mark_inode_dirty(handle, inode); -	ext4_ext_invalidate_cache(inode);  	return 0;  } @@ -725,6 +724,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,  	struct ext4_extent_header *eh;  	struct buffer_head *bh;  	short int depth, i, ppos = 0, alloc = 0; +	int ret;  	eh = ext_inode_hdr(inode);  	depth = ext_depth(inode); @@ -752,12 +752,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,  		path[ppos].p_ext = NULL;  		bh = sb_getblk(inode->i_sb, path[ppos].p_block); -		if (unlikely(!bh)) +		if (unlikely(!bh)) { +			ret = -ENOMEM;  			goto err; +		}  		if (!bh_uptodate_or_lock(bh)) {  			trace_ext4_ext_load_extent(inode, block,  						path[ppos].p_block); -			if (bh_submit_read(bh) < 0) { +			ret = bh_submit_read(bh); +			if (ret < 0) {  				put_bh(bh);  				goto err;  			} @@ -768,13 +771,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,  			put_bh(bh);  			EXT4_ERROR_INODE(inode,  					 "ppos %d > depth %d", ppos, depth); +			ret = -EIO;  			goto err;  		}  		path[ppos].p_bh = bh;  		path[ppos].p_hdr = eh;  		i--; -		if (ext4_ext_check_block(inode, eh, i, bh)) +		ret = ext4_ext_check_block(inode, eh, i, bh); +		if (ret < 0)  			goto err;  	} @@ -796,7 +801,7 @@ err:  	ext4_ext_drop_refs(path);  	if (alloc)  		kfree(path); -	return ERR_PTR(-EIO); +	return ERR_PTR(ret);  }  /* @@ -950,8 +955,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  		goto cleanup;  	}  	bh = sb_getblk(inode->i_sb, newblock); -	if (!bh) { -		err = -EIO; +	if (unlikely(!bh)) { +		err = -ENOMEM;  		goto cleanup;  	}  	lock_buffer(bh); @@ -1023,8 +1028,8 @@ static int ext4_ext_split(handle_t *handle, 
struct inode *inode,
 		oldblock = newblock;
 		newblock = ablocks[--a];
 		bh = sb_getblk(inode->i_sb, newblock);
-		if (!bh) {
-			err = -EIO;
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
 			goto cleanup;
 		}
 		lock_buffer(bh);
@@ -1136,11 +1141,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		return err;
 
 	bh = sb_getblk(inode->i_sb, newblock);
-	if (!bh) {
-		err = -EIO;
-		ext4_std_error(inode->i_sb, err);
-		return err;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	lock_buffer(bh);
 
 	err = ext4_journal_get_create_access(handle, bh);
@@ -1960,7 +1962,6 @@ cleanup:
 		ext4_ext_drop_refs(npath);
 		kfree(npath);
 	}
-	ext4_ext_invalidate_cache(inode);
 	return err;
 }
 
@@ -1969,8 +1970,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 				    struct fiemap_extent_info *fieinfo)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_ext_cache newex;
 	struct ext4_extent *ex;
+	struct extent_status es;
 	ext4_lblk_t next, next_del, start = 0, end = 0;
 	ext4_lblk_t last = block + num;
 	int exists, depth = 0, err = 0;
@@ -2044,37 +2045,47 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		BUG_ON(end <= start);
 
 		if (!exists) {
-			newex.ec_block = start;
-			newex.ec_len = end - start;
-			newex.ec_start = 0;
+			es.es_lblk = start;
+			es.es_len = end - start;
+			es.es_pblk = 0;
 		} else {
-			newex.ec_block = le32_to_cpu(ex->ee_block);
-			newex.ec_len = ext4_ext_get_actual_len(ex);
-			newex.ec_start = ext4_ext_pblock(ex);
+			es.es_lblk = le32_to_cpu(ex->ee_block);
+			es.es_len = ext4_ext_get_actual_len(ex);
+			es.es_pblk = ext4_ext_pblock(ex);
 			if (ext4_ext_is_uninitialized(ex))
 				flags |= FIEMAP_EXTENT_UNWRITTEN;
 		}
 
 		/*
-		 * Find delayed extent and update newex accordingly. We call
-		 * it even in !exists case to find out whether newex is the
+		 * Find delayed extent and update es accordingly. We call
+		 * it even in !exists case to find out whether es is the
 		 * last existing extent or not.
 		 */
-		next_del = ext4_find_delayed_extent(inode, &newex);
+		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
 			flags |= FIEMAP_EXTENT_DELALLOC;
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
-		if (unlikely(newex.ec_len == 0)) {
-			EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+		if (unlikely(es.es_len == 0)) {
+			EXT4_ERROR_INODE(inode, "es.es_len == 0");
 			err = -EIO;
 			break;
 		}
 
-		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
-		if (next == next_del) {
+		/*
+		 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+		 * We need the next == EXT_MAX_BLOCKS check because an extent
+		 * can carry the unwritten and delayed status at the same
+		 * time: when a delayed-allocated range is later allocated by
+		 * fallocate, the status tree tracks both states in a single
+		 * extent.
+		 *
+		 * So we could return an unwritten and delayed extent whose
+		 * start block is equal to 'next'.
+		 */ +		if (next == next_del && next == EXT_MAX_BLOCKS) {  			flags |= FIEMAP_EXTENT_LAST;  			if (unlikely(next_del != EXT_MAX_BLOCKS ||  				     next != EXT_MAX_BLOCKS)) { @@ -2089,9 +2100,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode,  		if (exists) {  			err = fiemap_fill_next_extent(fieinfo, -				(__u64)newex.ec_block << blksize_bits, -				(__u64)newex.ec_start << blksize_bits, -				(__u64)newex.ec_len << blksize_bits, +				(__u64)es.es_lblk << blksize_bits, +				(__u64)es.es_pblk << blksize_bits, +				(__u64)es.es_len << blksize_bits,  				flags);  			if (err < 0)  				break; @@ -2101,7 +2112,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,  			}  		} -		block = newex.ec_block + newex.ec_len; +		block = es.es_lblk + es.es_len;  	}  	if (path) { @@ -2112,21 +2123,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,  	return err;  } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, -			__u32 len, ext4_fsblk_t start) -{ -	struct ext4_ext_cache *cex; -	BUG_ON(len == 0); -	spin_lock(&EXT4_I(inode)->i_block_reservation_lock); -	trace_ext4_ext_put_in_cache(inode, block, len, start); -	cex = &EXT4_I(inode)->i_cached_extent; -	cex->ec_block = block; -	cex->ec_len = len; -	cex->ec_start = start; -	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} -  /*   * ext4_ext_put_gap_in_cache:   * calculate boundaries of the gap that the requested block fits into @@ -2143,9 +2139,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  	ex = path[depth].p_ext;  	if (ex == NULL) { -		/* there is no extent yet, so gap is [0;-] */ -		lblock = 0; -		len = EXT_MAX_BLOCKS; +		/* +		 * there is no extent yet, so gap is [0;-] and we +		 * don't cache it +		 */  		ext_debug("cache gap(whole file):");  	} else if (block < le32_to_cpu(ex->ee_block)) {  		lblock = block; @@ -2154,6 +2151,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  				block,  				le32_to_cpu(ex->ee_block),  				 ext4_ext_get_actual_len(ex)); +		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +			ext4_es_insert_extent(inode, lblock, len, ~0, +					      EXTENT_STATUS_HOLE);  	} else if (block >= le32_to_cpu(ex->ee_block)  			+ ext4_ext_get_actual_len(ex)) {  		ext4_lblk_t next; @@ -2167,58 +2167,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  				block);  		BUG_ON(next == lblock);  		len = next - lblock; +		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +			ext4_es_insert_extent(inode, lblock, len, ~0, +					      EXTENT_STATUS_HOLE);  	} else {  		lblock = len = 0;  		BUG();  	}  	ext_debug(" -> %u:%lu\n", lblock, len); -	ext4_ext_put_in_cache(inode, lblock, len, 0); -} - -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex:    Pointer where the cached extent will be stored - *         if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, -		  struct ext4_extent *ex) -{ -	struct ext4_ext_cache *cex; -	int ret = 0; - -	/* -	 * We borrow i_block_reservation_lock to protect i_cached_extent -	 */ -	spin_lock(&EXT4_I(inode)->i_block_reservation_lock); -	cex = &EXT4_I(inode)->i_cached_extent; - -	/* has cache valid data? 
*/ -	if (cex->ec_len == 0) -		goto errout; - -	if (in_range(block, cex->ec_block, cex->ec_len)) { -		ex->ee_block = cpu_to_le32(cex->ec_block); -		ext4_ext_store_pblock(ex, cex->ec_start); -		ex->ee_len = cpu_to_le16(cex->ec_len); -		ext_debug("%u cached by %u:%u:%llu\n", -				block, -				cex->ec_block, cex->ec_len, cex->ec_start); -		ret = 1; -	} -errout: -	trace_ext4_ext_in_cache(inode, block, ret); -	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -	return ret;  }  /* @@ -2226,13 +2183,14 @@ errout:   * removes index from the index block.   */  static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, -			struct ext4_ext_path *path) +			struct ext4_ext_path *path, int depth)  {  	int err;  	ext4_fsblk_t leaf;  	/* free index block */ -	path--; +	depth--; +	path = path + depth;  	leaf = ext4_idx_pblock(path->p_idx);  	if (unlikely(path->p_hdr->eh_entries == 0)) {  		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); @@ -2257,6 +2215,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,  	ext4_free_blocks(handle, inode, NULL, leaf, 1,  			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + +	while (--depth >= 0) { +		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) +			break; +		path--; +		err = ext4_ext_get_access(handle, inode, path); +		if (err) +			break; +		path->p_idx->ei_block = (path+1)->p_idx->ei_block; +		err = ext4_ext_dirty(handle, inode, path); +		if (err) +			break; +	}  	return err;  } @@ -2599,7 +2570,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	/* if this leaf is free, then we should  	 * remove it from index block above */  	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -		err = ext4_ext_rm_idx(handle, inode, path + depth); +		err = ext4_ext_rm_idx(handle, inode, path, depth);  out:  	return err; @@ -2639,13 +2610,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,  	ext_debug("truncate since %u to %u\n", start, end);  	/* probably first extent we're gonna free will be last in block */ -	handle = ext4_journal_start(inode, depth + 1); +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);  	if (IS_ERR(handle))  		return PTR_ERR(handle);  again: -	ext4_ext_invalidate_cache(inode); -  	trace_ext4_ext_remove_space(inode, start, depth);  	/* @@ -2802,7 +2771,7 @@ again:  				/* index is empty, remove it;  				 * handle must be already prepared by the  				 * truncatei_leaf() */ -				err = ext4_ext_rm_idx(handle, inode, path + i); +				err = ext4_ext_rm_idx(handle, inode, path, i);  			}  			/* root level has p_bh == NULL, brelse() eats this */  			brelse(path[i].p_bh); @@ -3505,19 +3474,19 @@ out:   *   * Return 1 if there is a delalloc block in the range, otherwise 0.   
*/ -static int ext4_find_delalloc_range(struct inode *inode, -				    ext4_lblk_t lblk_start, -				    ext4_lblk_t lblk_end) +int ext4_find_delalloc_range(struct inode *inode, +			     ext4_lblk_t lblk_start, +			     ext4_lblk_t lblk_end)  {  	struct extent_status es; -	es.start = lblk_start; -	ext4_es_find_extent(inode, &es); -	if (es.len == 0) +	ext4_es_find_delayed_extent(inode, lblk_start, &es); +	if (es.es_len == 0)  		return 0; /* there is no delay extent in this tree */ -	else if (es.start <= lblk_start && lblk_start < es.start + es.len) +	else if (es.es_lblk <= lblk_start && +		 lblk_start < es.es_lblk + es.es_len)  		return 1; -	else if (lblk_start <= es.start && es.start <= lblk_end) +	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)  		return 1;  	else  		return 0; @@ -3642,6 +3611,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  			ext4_set_io_unwritten_flag(inode, io);  		else  			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		if (ext4_should_dioread_nolock(inode))  			map->m_flags |= EXT4_MAP_UNINIT;  		goto out; @@ -3663,8 +3633,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  	 * repeat fallocate creation request  	 * we already have an unwritten extent  	 */ -	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) +	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		goto map_out; +	}  	/* buffered READ or buffered write_begin() lookup */  	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3884,35 +3856,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		  map->m_lblk, map->m_len, inode->i_ino);  	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); -	/* check in cache */ -	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { -		if (!newex.ee_start_lo && !newex.ee_start_hi) { -			if ((sbi->s_cluster_ratio > 1) && -			    ext4_find_delalloc_cluster(inode, map->m_lblk)) -				map->m_flags |= EXT4_MAP_FROM_CLUSTER; - -			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { -				/* -				 * block isn't allocated yet and -				 * user doesn't want to allocate it -				 */ -				goto out2; -			} -			/* we should allocate requested block */ -		} else { -			/* block is already allocated */ -			if (sbi->s_cluster_ratio > 1) -				map->m_flags |= EXT4_MAP_FROM_CLUSTER; -			newblock = map->m_lblk -				   - le32_to_cpu(newex.ee_block) -				   + ext4_ext_pblock(&newex); -			/* number of remaining blocks in the extent */ -			allocated = ext4_ext_get_actual_len(&newex) - -				(map->m_lblk - le32_to_cpu(newex.ee_block)); -			goto out; -		} -	} -  	/* find extent for this block */  	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);  	if (IS_ERR(path)) { @@ -3959,15 +3902,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,  				  ee_block, ee_len, newblock); -			/* -			 * Do not put uninitialized extent -			 * in the cache -			 */ -			if (!ext4_ext_is_uninitialized(ex)) { -				ext4_ext_put_in_cache(inode, ee_block, -					ee_len, ee_start); +			if (!ext4_ext_is_uninitialized(ex))  				goto out; -			} +  			allocated = ext4_ext_handle_uninitialized_extents(  				handle, inode, map, path, flags,  				allocated, newblock); @@ -3988,7 +3925,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		 * put just found gap into cache to speed up  		 * subsequent requests  		 */ -		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); +		if 
((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) +			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);  		goto out2;  	} @@ -4094,6 +4032,7 @@ got_allocated_blocks:  	/* Mark uninitialized */  	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){  		ext4_ext_mark_uninitialized(&newex); +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		/*  		 * io_end structure was created for every IO write to an  		 * uninitialized extent. To avoid unnecessary conversion, @@ -4227,10 +4166,9 @@ got_allocated_blocks:  	 * Cache the extent and update transaction to commit on fdatasync only  	 * when it is _not_ an uninitialized extent.  	 */ -	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { -		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); +	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)  		ext4_update_inode_fsync_trans(handle, inode, 1); -	} else +	else  		ext4_update_inode_fsync_trans(handle, inode, 0);  out:  	if (allocated > map->m_len) @@ -4270,7 +4208,7 @@ void ext4_ext_truncate(struct inode *inode)  	 * probably first extent we're gonna free will be last in block  	 */  	err = ext4_writepage_trans_blocks(inode); -	handle = ext4_journal_start(inode, err); +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);  	if (IS_ERR(handle))  		return; @@ -4289,7 +4227,6 @@ void ext4_ext_truncate(struct inode *inode)  		goto out_stop;  	down_write(&EXT4_I(inode)->i_data_sem); -	ext4_ext_invalidate_cache(inode);  	ext4_discard_preallocations(inode); @@ -4372,7 +4309,7 @@ static void ext4_falloc_update_inode(struct inode *inode,   */  long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	handle_t *handle;  	loff_t new_size;  	unsigned int max_blocks; @@ -4383,13 +4320,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  	struct ext4_map_blocks map;  	unsigned int credits, blkbits = inode->i_blkbits; -	/* -	 * currently supporting (pre)allocate mode for extent-based -	 * files _only_ -	 */ -	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -		return -EOPNOTSUPP; -  	/* Return error if mode is not supported */  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))  		return -EOPNOTSUPP; @@ -4401,6 +4331,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  	if (ret)  		return ret; +	/* +	 * currently supporting (pre)allocate mode for extent-based +	 * files _only_ +	 */ +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) +		return -EOPNOTSUPP; +  	trace_ext4_fallocate_enter(inode, offset, len, mode);  	map.m_lblk = offset >> blkbits;  	/* @@ -4437,7 +4374,8 @@ retry:  	while (ret >= 0 && ret < max_blocks) {  		map.m_lblk = map.m_lblk + ret;  		map.m_len = max_blocks = max_blocks - ret; -		handle = ext4_journal_start(inode, credits); +		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, +					    credits);  		if (IS_ERR(handle)) {  			ret = PTR_ERR(handle);  			break; @@ -4445,11 +4383,11 @@ retry:  		ret = ext4_map_blocks(handle, inode, &map, flags);  		if (ret <= 0) {  #ifdef EXT4FS_DEBUG -			WARN_ON(ret <= 0); -			printk(KERN_ERR "%s: ext4_ext_map_blocks " -				    "returned error inode#%lu, block=%u, " -				    "max_blocks=%u", __func__, -				    inode->i_ino, map.m_lblk, max_blocks); +			ext4_warning(inode->i_sb, +				     "inode #%lu: block %u: len %u: " +				     "ext4_ext_map_blocks returned %d", +				     inode->i_ino, map.m_lblk, +				     map.m_len, ret);  #endif  			ext4_mark_inode_dirty(handle, inode); 
 			ret2 = ext4_journal_stop(handle);
@@ -4515,21 +4453,19 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, credits);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
-		if (ret <= 0) {
-			WARN_ON(ret <= 0);
-			ext4_msg(inode->i_sb, KERN_ERR,
-				 "%s:%d: inode #%lu: block %u: len %u: "
-				 "ext4_ext_map_blocks returned %d",
-				 __func__, __LINE__, inode->i_ino, map.m_lblk,
-				 map.m_len, ret);
-		}
+		if (ret <= 0)
+			ext4_warning(inode->i_sb,
+				     "inode #%lu: block %u: len %u: "
+				     "ext4_ext_map_blocks returned %d",
+				     inode->i_ino, map.m_lblk,
+				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
 		if (ret <= 0 || ret2 )
@@ -4539,42 +4475,48 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 }
 
 /*
- * If newex is not existing extent (newex->ec_start equals zero) find
- * delayed extent at start of newex and update newex accordingly and
+ * If newes is not an existing extent (newes->es_pblk equals zero), find
+ * the delayed extent at the start of newes, update newes accordingly, and
  * return start of the next delayed extent.
  *
- * If newex is existing extent (newex->ec_start is not equal zero)
+ * If newes is an existing extent (newes->es_pblk is not zero),
  * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newex unmodified.
+ * extent is found. Leave newes unmodified.
  */
 static int ext4_find_delayed_extent(struct inode *inode,
-				    struct ext4_ext_cache *newex)
+				    struct extent_status *newes)
 {
 	struct extent_status es;
-	ext4_lblk_t next_del;
+	ext4_lblk_t block, next_del;
 
-	es.start = newex->ec_block;
-	next_del = ext4_es_find_extent(inode, &es);
+	ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
 
-	if (newex->ec_start == 0) {
+	if (newes->es_pblk == 0) {
 		/*
-		 * No extent in extent-tree contains block @newex->ec_start,
+		 * No extent in the extent tree contains block @newes->es_lblk;
 		 * then the block may stay in 1)a hole or 2)delayed-extent.
 		 */
-		if (es.len == 0)
+		if (es.es_len == 0)
 			/* A hole found. */
 			return 0;
 
-		if (es.start > newex->ec_block) {
+		if (es.es_lblk > newes->es_lblk) {
			/* A hole found. */
-			newex->ec_len = min(es.start - newex->ec_block,
-					    newex->ec_len);
+			newes->es_len = min(es.es_lblk - newes->es_lblk,
+					    newes->es_len);
 			return 0;
 		}
 
-		newex->ec_len = es.start + es.len - newex->ec_block;
+		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
 	}
 
+	block = newes->es_lblk + newes->es_len;
+	ext4_es_find_delayed_extent(inode, block, &es);
+	if (es.es_len == 0)
+		next_del = EXT_MAX_BLOCKS;
+	else
+		next_del = es.es_lblk;
+
 	return next_del;
 }
 
 /* fiemap flags we can handle specified here */
@@ -4629,7 +4571,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
  */
 int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
@@ -4695,7 +4637,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	inode_dio_wait(inode);
 
 	credits = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, credits);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		goto out_dio;
@@ -4772,14 +4714,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_ext_invalidate_cache(inode);
 	ext4_discard_preallocations(inode);
 
 	err = ext4_es_remove_extent(inode, first_block,
 				    stop_block - first_block);
 	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 
-	ext4_ext_invalidate_cache(inode);
 	ext4_discard_preallocations(inode);
 
 	if (IS_SYNC(inode))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fc..95796a1b752 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
 * (e.g. Reservation space warning), and provide extent-level locking.
 * Delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose goal was only to track delayed extents in memory to
 * simplify the implementation of fiemap and bigalloc, and introduce
 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called the
- * delay extent tree at the following comment.  But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree in the first commit.  But for a better understanding
+ * of what it does, it has been renamed to the extent status tree.
 *
- * Currently the first step has been done.  All delay extents are
- * tracked in the tree.  It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done.  All delayed extents are
+ * tracked in the tree.  It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementations of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
+ *
+ * Step2:
+ * In this step all extent status is tracked by the extent status tree.
+ * Thus, we can first try to look up a block mapping in this tree before
+ * finding it in the extent tree.  Hence, the single extent cache can be
+ * removed because the extent status tree can do a better job.  Extents
+ * in the status tree are loaded on demand, so the extent status tree
+ * may not contain all of the extents in a file.  Meanwhile we define a
+ * shrinker to reclaim memory from the extent status tree, because a
+ * fragmented extent tree would make the status tree cost too much
+ * memory.  Written/unwritten/hole extents in the tree will be reclaimed
+ * by this shrinker when we are under high memory pressure.  Delayed
+ * extents will not be reclaimed because fiemap, bigalloc, and
+ * seek_data/hole need them.
 */
 
 /*
- * extents status tree implementation for ext4.
 *
 *
 * ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * The extent status tree tracks all extent status.
 *
- * 1. Why delayed extent implementation ?
+ * 1. Why do we need to implement an extent status tree?
 *
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without an extent status tree, ext4 identifies a delayed extent by looking
 * up the page cache; this has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks belongs to a delayed extent.
 *
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they work without the extent status tree.
 *   --	FIEMAP
 *	FIEMAP looks up page cache to identify delayed allocations from holes.
 *
@@ -68,47 +81,48 @@
 *	already under delayed allocation or not to determine whether
 *	quota reserving is needed for the cluster.
 *
- *   -- punch hole
- *	punch hole looks up page cache to identify a delayed extent.
- *
 *   --	writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped.  If there are not very many delayed buffers, then it is
 *	time consuming.
 *
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation (belongs to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent status tree.
 *
 *
 * ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree implementation
+ *
+ *   --	extent
+ *	An extent is a range of blocks which are contiguous logically and
+ *	physically.  Unlike an extent in the extent tree, this extent is an
+ *	in-memory struct; there is no corresponding on-disk data.  There
+ *	is no limit on the length of an extent, so an extent can contain as
+ *	many blocks as are contiguous logically and physically.
 *
- *   --	delayed extent
- *	A delayed extent is a range of blocks which are contiguous
- *	logically and under delayed allocation.  Unlike extent in
- *	ext4, delayed extent in ext4 is a in-memory struct, there is
- *	no corresponding on-disk data.  There is no limit on length of
- *	delayed extent, so a delayed extent can contain as many blocks
- *	as they are contiguous logically.
 
+ *   --	extent status tree
+ *	Every inode has an extent status tree, and blocks are added to
+ *	the tree with their status.  The extents in the tree are ordered
+ *	by logical block number.
 *
- *   --	delayed extent tree
- *	Every inode has a delayed extent tree and all under delayed
- *	allocation blocks are added to the tree as delayed extents.
- *	Delayed extents in the tree are ordered by logical block no.
+ *   --	operations on an extent status tree
+ *	There are three important operations on an extent status tree:
+ *	finding the next extent, adding an extent (a range of blocks), and
+ *	removing an extent.
+ *
- *   --	operations on a delayed extent tree
- *	There are three operations on a delayed extent tree: find next
- *	delayed extent, adding a space(a range of blocks) and removing
- *	a space.
+ *   --	race on an extent status tree
+ *	The extent status tree is protected by inode->i_es_lock.
+ *
- *   --	race on a delayed extent tree
- *	Delayed extent tree is protected inode->i_es_lock.
+ *   --	memory consumption
+ *      A fragmented extent tree will make the extent status tree cost too
+ *      much memory.  Hence, we will reclaim written/unwritten/hole extents
+ *      from the tree under heavy memory pressure.
 *
 *
 * ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
 *   --	overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding space operations take O(1) time.
@@ -120,18 +134,25 @@
 *
 * ==========================================================================
 * 4. TODO list
-   -- Track all extent status
 *
-   -- Improve get block process
+ *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */
 
 static struct kmem_cache *ext4_es_cachep;
 
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+				       int nr_to_scan);
+
 int __init ext4_init_es(void)
 {
-	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+					   sizeof(struct extent_status),
+					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
 	if (ext4_es_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -161,7 +182,9 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		       es->es_lblk, es->es_len,
+		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
 	}
 	printk(KERN_DEBUG "\n");
@@ -170,10 +193,10 @@ static void ext4_es_print_tree(struct inode *inode)
 #define ext4_es_print_tree(inode)
 #endif
 
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
 {
-	BUG_ON(es->start + es->len < es->start);
-	return es->start + es->len - 1;
+	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+	return es->es_lblk + es->es_len - 1;
 }
 
 /*
@@ -181,25 +204,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
 * it can't be found, try to find the next extent.
 */
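Before the tree-search helper that follows, it may help to see the "Step2" usage pattern the design comment above describes: consult the in-memory status tree first, and fall back to the on-disk extent tree only on a miss. A hedged sketch under stated assumptions: demo_map_block() and its slow path are hypothetical; ext4_es_lookup_extent(), the es_* fields, and ext4_es_pblock() are introduced by this patch, while ext4_es_is_hole() is inferred from the EXTENT_STATUS_HOLE flag and not shown in these hunks:

/* Hypothetical caller, for illustration only. */
static int demo_map_block(struct inode *inode, ext4_lblk_t lblk,
			  ext4_fsblk_t *pblk)
{
	struct extent_status es;

	/* Step2: try the extent status tree before any disk access. */
	if (ext4_es_lookup_extent(inode, lblk, &es)) {
		if (ext4_es_is_hole(&es))
			return 0;	/* cached hole: nothing mapped */
		/* written/unwritten hit: derive the physical block */
		*pblk = ext4_es_pblock(&es) + (lblk - es.es_lblk);
		return 1;
	}

	/* Miss: fall back to the on-disk extent tree (for example via
	 * ext4_ext_map_blocks()) and let it populate the status tree. */
	return -EAGAIN;	/* placeholder for the slow path */
}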
static struct extent_status *__es_tree_search(struct rb_root *root,
-					      ext4_lblk_t offset)
+					      ext4_lblk_t lblk)
 {
 	struct rb_node *node = root->rb_node;
 	struct extent_status *es = NULL;
 
 	while (node) {
 		es = rb_entry(node, struct extent_status, rb_node);
-		if (offset < es->start)
+		if (lblk < es->es_lblk)
 			node = node->rb_left;
-		else if (offset > extent_status_end(es))
+		else if (lblk > ext4_es_end(es))
 			node = node->rb_right;
 		else
 			return es;
 	}
 
-	if (es && offset < es->start)
+	if (es && lblk < es->es_lblk)
 		return es;
 
-	if (es && offset > extent_status_end(es)) {
+	if (es && lblk > ext4_es_end(es)) {
 		node = rb_next(&es->rb_node);
 		return node ? rb_entry(node, struct extent_status, rb_node) :
 			      NULL;
@@ -209,79 +232,124 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
 }
 
 /*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_delayed_extent: find the 1st delayed extent covering @lblk
+ * if it exists; otherwise, the next extent after @lblk.
 *
 * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
 * @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
 */
-ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+				 struct extent_status *es)
 {
 	struct ext4_es_tree *tree = NULL;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
-	ext4_lblk_t ret = EXT_MAX_BLOCKS;
 
-	trace_ext4_es_find_extent_enter(inode, es->start);
+	BUG_ON(es == NULL);
+	trace_ext4_es_find_delayed_extent_enter(inode, lblk);
 
 	read_lock(&EXT4_I(inode)->i_es_lock);
 	tree = &EXT4_I(inode)->i_es_tree;
 
-	/* find delay extent in cache firstly */
+	/* first, try the cached extent */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
 	if (tree->cache_es) {
 		es1 = tree->cache_es;
-		if (in_range(es->start, es1->start, es1->len)) {
-			es_debug("%u cached by [%u/%u)\n",
-				 es->start, es1->start, es1->len);
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u) %llu %llx\n",
+				 lblk, es1->es_lblk, es1->es_len,
+				 ext4_es_pblock(es1), ext4_es_status(es1));
 			goto out;
 		}
 	}
 
-	es->len = 0;
-	es1 = __es_tree_search(&tree->root, es->start);
+	es1 = __es_tree_search(&tree->root, lblk);
 
 out:
-	if (es1) {
-		tree->cache_es = es1;
-		es->start = es1->start;
-		es->len = es1->len;
-		node = rb_next(&es1->rb_node);
-		if (node) {
+	if (es1 && !ext4_es_is_delayed(es1)) {
+		while ((node = rb_next(&es1->rb_node)) != NULL) {
 			es1 = rb_entry(node, struct extent_status, rb_node);
-			ret = es1->start;
+			if (ext4_es_is_delayed(es1))
+				break;
 		}
 	}
 
+	if (es1 && ext4_es_is_delayed(es1)) {
+		tree->cache_es = es1;
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	trace_ext4_es_find_extent_exit(inode, es, ret);
-	return ret;
+	ext4_es_lru_add(inode);
+	trace_ext4_es_find_delayed_extent_exit(inode, es);
 }
 
 static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+		     ext4_fsblk_t pblk)
 {
 	struct extent_status *es;
 	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
 	if (es == NULL)
 		return NULL;
-	es->start = start;
-	es->len = len;
+	es->es_lblk = lblk;
+	es->es_len = len;
+	es->es_pblk = pblk;
+
+	/*
+	 * We don't count delayed extents because we never try to reclaim them
+	 */
+	if (!ext4_es_is_delayed(es)) {
+		EXT4_I(inode)->i_es_lru_nr++;
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+	}
+
 	return es;
 }
 
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 {
+	/* Decrease the lru counter when this es is not delayed */
+	if (!ext4_es_is_delayed(es)) {
+		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+		EXT4_I(inode)->i_es_lru_nr--;
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+	}
+
 	kmem_cache_free(ext4_es_cachep, es);
 }
 
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ *  - logical block number is contiguous
+ *  - physical block number is contiguous
+ *  - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+				 struct extent_status *es2)
+{
+	if (es1->es_lblk + es1->es_len != es2->es_lblk)
+		return 0;
+
+	if (ext4_es_status(es1) != ext4_es_status(es2))
+		return 0;
+
+	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+	    (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+		return 0;
+
+	return 1;
+}
+
 static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
 {
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct extent_status *es1;
 	struct rb_node *node;
@@ -290,10 +358,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
 		return es;
 
 	es1 = rb_entry(node, struct extent_status, rb_node);
-	if (es->start == extent_status_end(es1) + 1) {
-		es1->len += es->len;
+	if (ext4_es_can_be_merged(es1, es)) {
+		es1->es_len += es->es_len;
 		rb_erase(&es->rb_node, &tree->root);
-		ext4_es_free_extent(es);
+		ext4_es_free_extent(inode, es);
 		es = es1;
 	}
 
@@ -301,8 +369,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
 }
 
 static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 {
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct extent_status *es1;
 	struct rb_node *node;
@@ -311,69 +380,57 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
 		return es;
 
 	es1 = rb_entry(node, struct extent_status, rb_node);
-	if (es1->start == extent_status_end(es) + 1) {
-		es->len += es1->len;
+	if (ext4_es_can_be_merged(es, es1)) {
+		es->es_len += es1->es_len;
 		rb_erase(node, &tree->root);
-		ext4_es_free_extent(es1);
+		ext4_es_free_extent(inode, es1);
 	}
 
 	return es;
 }
 
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
-			      ext4_lblk_t len)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
 {
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct rb_node **p = &tree->root.rb_node;
 	struct rb_node *parent = NULL;
 	struct extent_status *es;
-	ext4_lblk_t end = offset + len - 1;
-
-	BUG_ON(end < offset);
-	es = tree->cache_es;
-	if (es && offset == (extent_status_end(es) + 1)) {
-		es_debug("cached by [%u/%u)\n", es->start, es->len);
-		es->len += len;
-		es = ext4_es_try_to_merge_right(tree, es); -		goto out; -	} else if (es && es->start == end + 1) { -		es_debug("cached by [%u/%u)\n", es->start, es->len); -		es->start = offset; -		es->len += len; -		es = ext4_es_try_to_merge_left(tree, es); -		goto out; -	} else if (es && es->start <= offset && -		   end <= extent_status_end(es)) { -		es_debug("cached by [%u/%u)\n", es->start, es->len); -		goto out; -	}  	while (*p) {  		parent = *p;  		es = rb_entry(parent, struct extent_status, rb_node); -		if (offset < es->start) { -			if (es->start == end + 1) { -				es->start = offset; -				es->len += len; -				es = ext4_es_try_to_merge_left(tree, es); +		if (newes->es_lblk < es->es_lblk) { +			if (ext4_es_can_be_merged(newes, es)) { +				/* +				 * Here we can modify es_lblk directly +				 * because it isn't overlapped. +				 */ +				es->es_lblk = newes->es_lblk; +				es->es_len += newes->es_len; +				if (ext4_es_is_written(es) || +				    ext4_es_is_unwritten(es)) +					ext4_es_store_pblock(es, +							     newes->es_pblk); +				es = ext4_es_try_to_merge_left(inode, es);  				goto out;  			}  			p = &(*p)->rb_left; -		} else if (offset > extent_status_end(es)) { -			if (offset == extent_status_end(es) + 1) { -				es->len += len; -				es = ext4_es_try_to_merge_right(tree, es); +		} else if (newes->es_lblk > ext4_es_end(es)) { +			if (ext4_es_can_be_merged(es, newes)) { +				es->es_len += newes->es_len; +				es = ext4_es_try_to_merge_right(inode, es);  				goto out;  			}  			p = &(*p)->rb_right;  		} else { -			if (extent_status_end(es) <= end) -				es->len = offset - es->start + len; -			goto out; +			BUG_ON(1); +			return -EINVAL;  		}  	} -	es = ext4_es_alloc_extent(offset, len); +	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, +				  newes->es_pblk);  	if (!es)  		return -ENOMEM;  	rb_link_node(&es->rb_node, parent, p); @@ -385,85 +442,166 @@ out:  }  /* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_insert_extent() adds a space to a extent status tree.   *   * ext4_es_insert_extent is called by ext4_da_write_begin and   * ext4_es_remove_extent.   *   * Return 0 on success, error code on failure.   */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, -			  ext4_lblk_t len) +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, +			  ext4_lblk_t len, ext4_fsblk_t pblk, +			  unsigned long long status)  { -	struct ext4_es_tree *tree; +	struct extent_status newes; +	ext4_lblk_t end = lblk + len - 1;  	int err = 0; -	trace_ext4_es_insert_extent(inode, offset, len); -	es_debug("add [%u/%u) to extent status tree of inode %lu\n", -		 offset, len, inode->i_ino); +	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", +		 lblk, len, pblk, status, inode->i_ino); + +	if (!len) +		return 0; + +	BUG_ON(end < lblk); + +	newes.es_lblk = lblk; +	newes.es_len = len; +	ext4_es_store_pblock(&newes, pblk); +	ext4_es_store_status(&newes, status); +	trace_ext4_es_insert_extent(inode, &newes);  	write_lock(&EXT4_I(inode)->i_es_lock); -	tree = &EXT4_I(inode)->i_es_tree; -	err = __es_insert_extent(tree, offset, len); +	err = __es_remove_extent(inode, lblk, end); +	if (err != 0) +		goto error; +	err = __es_insert_extent(inode, &newes); + +error:  	write_unlock(&EXT4_I(inode)->i_es_lock); +	ext4_es_lru_add(inode);  	ext4_es_print_tree(inode);  	return err;  }  /* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. 
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
 *
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 if found, 0 if not
 */
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
-			  ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+			  struct extent_status *es)
 {
-	struct rb_node *node;
 	struct ext4_es_tree *tree;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+	int found = 0;
+
+	trace_ext4_es_lookup_extent_enter(inode, lblk);
+	es_debug("lookup extent in block %u\n", lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+	read_lock(&EXT4_I(inode)->i_es_lock);
+
+	/* find extent in cache firstly */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u)\n",
+				 lblk, es1->es_lblk, es1->es_len);
+			found = 1;
+			goto out;
+		}
+	}
+
+	node = tree->root.rb_node;
+	while (node) {
+		es1 = rb_entry(node, struct extent_status, rb_node);
+		if (lblk < es1->es_lblk)
+			node = node->rb_left;
+		else if (lblk > ext4_es_end(es1))
+			node = node->rb_right;
+		else {
+			found = 1;
+			break;
+		}
+	}
+
+out:
+	if (found) {
+		BUG_ON(!es1);
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	ext4_es_lru_add(inode);
+	trace_ext4_es_lookup_extent_exit(inode, es, found);
+	return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct rb_node *node;
 	struct extent_status *es;
 	struct extent_status orig_es;
-	ext4_lblk_t len1, len2, end;
+	ext4_lblk_t len1, len2;
+	ext4_fsblk_t block;
 	int err = 0;
 
-	trace_ext4_es_remove_extent(inode, offset, len);
-	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
-		 offset, len, inode->i_ino);
-
-	end = offset + len - 1;
-	BUG_ON(end < offset);
-	write_lock(&EXT4_I(inode)->i_es_lock);
-	tree = &EXT4_I(inode)->i_es_tree;
-	es = __es_tree_search(&tree->root, offset);
+	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
-	if (es->start > end)
+	if (es->es_lblk > end)
 		goto out;
 
 	/* Simply invalidate cache_es. */
 	tree->cache_es = NULL;
 
-	orig_es.start = es->start;
-	orig_es.len = es->len;
-	len1 = offset > es->start ? offset - es->start : 0;
-	len2 = extent_status_end(es) > end ?
-	       extent_status_end(es) - end : 0;
+	orig_es.es_lblk = es->es_lblk;
+	orig_es.es_len = es->es_len;
+	orig_es.es_pblk = es->es_pblk;
+
+	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+	len2 = ext4_es_end(es) > end ?
ext4_es_end(es) - end : 0;
 	if (len1 > 0)
-		es->len = len1;
+		es->es_len = len1;
 	if (len2 > 0) {
 		if (len1 > 0) {
-			err = __es_insert_extent(tree, end + 1, len2);
+			struct extent_status newes;
+
+			newes.es_lblk = end + 1;
+			newes.es_len = len2;
+			if (ext4_es_is_written(&orig_es) ||
+			    ext4_es_is_unwritten(&orig_es)) {
+				block = ext4_es_pblock(&orig_es) +
+					orig_es.es_len - len2;
+				ext4_es_store_pblock(&newes, block);
+			}
+			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+			err = __es_insert_extent(inode, &newes);
 			if (err) {
-				es->start = orig_es.start;
-				es->len = orig_es.len;
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
 				goto out;
 			}
 		} else {
-			es->start = end + 1;
-			es->len = len2;
+			es->es_lblk = end + 1;
+			es->es_len = len2;
+			if (ext4_es_is_written(es) ||
+			    ext4_es_is_unwritten(es)) {
+				block = orig_es.es_pblk + orig_es.es_len - len2;
+				ext4_es_store_pblock(es, block);
+			}
 		}
 		goto out;
 	}
@@ -476,10 +614,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
 			es = NULL;
 	}
 
-	while (es && extent_status_end(es) <= end) {
+	while (es && ext4_es_end(es) <= end) {
 		node = rb_next(&es->rb_node);
 		rb_erase(&es->rb_node, &tree->root);
-		ext4_es_free_extent(es);
+		ext4_es_free_extent(inode, es);
 		if (!node) {
 			es = NULL;
 			break;
@@ -487,14 +625,166 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
 		es = rb_entry(node, struct extent_status, rb_node);
 	}
 
-	if (es && es->start < end + 1) {
-		len1 = extent_status_end(es) - end;
-		es->start = end + 1;
-		es->len = len1;
+	if (es && es->es_lblk < end + 1) {
+		ext4_lblk_t orig_len = es->es_len;
+
+		len1 = ext4_es_end(es) - end;
+		es->es_lblk = end + 1;
+		es->es_len = len1;
+		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+			block = es->es_pblk + orig_len - len1;
+			ext4_es_store_pblock(es, block);
+		}
 	}
 
out:
+	return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from an extent status tree.
+ *
+ * Return 0 on success, error code on failure.
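One subtlety in __es_remove_extent() above: when the removal range lands in the middle of a cached extent, the left len1 blocks stay where they are, and the surviving right piece is re-described starting at end + 1 with its physical block advanced by orig_es.es_len - len2, so logical and physical offsets stay in step. A small self-contained check of that arithmetic, using hypothetical block numbers:

#include <stdio.h>

int main(void)
{
	/* A written extent caching logical blocks 100..149 at physical
	 * blocks 5000..5049 (hypothetical numbers). */
	unsigned es_lblk = 100, es_len = 50;
	unsigned long long es_pblk = 5000;

	/* Remove logical blocks 120..129 from the middle of it. */
	unsigned lblk = 120, end = 129;

	unsigned len1 = lblk > es_lblk ? lblk - es_lblk : 0;
	unsigned es_end = es_lblk + es_len - 1;
	unsigned len2 = es_end > end ? es_end - end : 0;

	/* The right-hand piece starts at end + 1; its physical block is
	 * advanced by everything that no longer precedes it. */
	unsigned long long right_pblk = es_pblk + es_len - len2;

	printf("left  [%u/%u) -> %llu\n", es_lblk, len1, es_pblk);    /* [100/20) -> 5000 */
	printf("right [%u/%u) -> %llu\n", end + 1, len2, right_pblk); /* [130/20) -> 5030 */
	return 0;
}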
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len)
+{
+	ext4_lblk_t end;
+	int err = 0;
+
+	trace_ext4_es_remove_extent(inode, lblk, len);
+	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+		 lblk, len, inode->i_ino);
+
+	if (!len)
+		return err;
+
+	end = lblk + len - 1;
+	BUG_ON(end < lblk);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	err = __es_remove_extent(inode, lblk, end);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	ext4_es_print_tree(inode);
 	return err;
 }
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct ext4_sb_info *sbi = container_of(shrink,
+					struct ext4_sb_info, s_es_shrinker);
+	struct ext4_inode_info *ei;
+	struct list_head *cur, *tmp, scanned;
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, nr_shrunk = 0;
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+	if (!nr_to_scan)
+		return ret;
+
+	INIT_LIST_HEAD(&scanned);
+
+	spin_lock(&sbi->s_es_lru_lock);
+	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+		list_move_tail(cur, &scanned);
+
+		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+		read_lock(&ei->i_es_lock);
+		if (ei->i_es_lru_nr == 0) {
+			read_unlock(&ei->i_es_lock);
+			continue;
+		}
+		read_unlock(&ei->i_es_lock);
+
+		write_lock(&ei->i_es_lock);
+		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		write_unlock(&ei->i_es_lock);
+
+		nr_shrunk += ret;
+		nr_to_scan -= ret;
+		if (nr_to_scan == 0)
+			break;
+	}
+	list_splice_tail(&scanned, &sbi->s_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+	return ret;
+}
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	INIT_LIST_HEAD(&sbi->s_es_lru);
+	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_shrinker.shrink = ext4_es_shrink;
+	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+	register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lru_lock);
+	if (list_empty(&ei->i_es_lru))
+		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+	else
+		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lru_lock);
+	if (!list_empty(&ei->i_es_lru))
+		list_del_init(&ei->i_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+				       int nr_to_scan)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct ext4_es_tree *tree = &ei->i_es_tree;
+	struct rb_node *node;
+	struct extent_status *es;
+	int nr_shrunk = 0;
+
+	if (ei->i_es_lru_nr == 0)
+		return 0;
+
+	node = rb_first(&tree->root);
+	while (node != NULL) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(&es->rb_node);
+		/*
+		 * We can't reclaim delayed extents from the status tree
+		 * because fiemap, bigalloc, and seek_data/hole need to
+		 * use them.
+		 */ +		if (!ext4_es_is_delayed(es)) { +			rb_erase(&es->rb_node, &tree->root); +			ext4_es_free_extent(inode, es); +			nr_shrunk++; +			if (--nr_to_scan == 0) +				break; +		} +	} +	tree->cache_es = NULL; +	return nr_shrunk; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 077f82db092..f190dfe969d 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,10 +20,24 @@  #define es_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)  #endif +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define EXTENT_STATUS_WRITTEN	(1ULL << 63) +#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) +#define EXTENT_STATUS_DELAYED	(1ULL << 61) +#define EXTENT_STATUS_HOLE	(1ULL << 60) + +#define EXTENT_STATUS_FLAGS	(EXTENT_STATUS_WRITTEN | \ +				 EXTENT_STATUS_UNWRITTEN | \ +				 EXTENT_STATUS_DELAYED | \ +				 EXTENT_STATUS_HOLE) +  struct extent_status {  	struct rb_node rb_node; -	ext4_lblk_t start;	/* first block extent covers */ -	ext4_lblk_t len;	/* length of extent in block */ +	ext4_lblk_t es_lblk;	/* first logical block extent covers */ +	ext4_lblk_t es_len;	/* length of extent in block */ +	ext4_fsblk_t es_pblk;	/* first physical block */  };  struct ext4_es_tree { @@ -35,11 +49,69 @@ extern int __init ext4_init_es(void);  extern void ext4_exit_es(void);  extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, +				 ext4_lblk_t len, ext4_fsblk_t pblk, +				 unsigned long long status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,  				 ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, -				 ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, -				struct extent_status *es); +extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +					struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, +				 struct extent_status *es); + +static inline int ext4_es_is_written(struct extent_status *es) +{ +	return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ +	return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ +	return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ +	return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; +} + +static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +{ +	return (es->es_pblk & EXTENT_STATUS_FLAGS); +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ +	return (es->es_pblk & ~EXTENT_STATUS_FLAGS); +} + +static inline void ext4_es_store_pblock(struct extent_status *es, +					ext4_fsblk_t pb) +{ +	ext4_fsblk_t block; + +	block = (pb & ~EXTENT_STATUS_FLAGS) | +		(es->es_pblk & EXTENT_STATUS_FLAGS); +	es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, +					unsigned long long status) +{ +	ext4_fsblk_t block; + +	block = (status & EXTENT_STATUS_FLAGS) | +		(es->es_pblk & ~EXTENT_STATUS_FLAGS); +	es->es_pblk = block; +} + +extern void ext4_es_register_shrinker(struct super_block *sb); +extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode);  
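The helpers just above work because ext4_fsblk_t is wider than any real physical block number, so the top four bits of es_pblk can carry the extent's status; each store helper rewrites only its own half of the word. A self-contained demonstration of the same masking, reusing the constants from the header:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Copied from the header above: the top four bits of es_pblk are flags. */
#define EXTENT_STATUS_WRITTEN	(1ULL << 63)
#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
#define EXTENT_STATUS_DELAYED	(1ULL << 61)
#define EXTENT_STATUS_HOLE	(1ULL << 60)
#define EXTENT_STATUS_FLAGS	(EXTENT_STATUS_WRITTEN | \
				 EXTENT_STATUS_UNWRITTEN | \
				 EXTENT_STATUS_DELAYED | \
				 EXTENT_STATUS_HOLE)

int main(void)
{
	uint64_t es_pblk = 0;

	/* ext4_es_store_pblock(): replace the block, keep the flags. */
	es_pblk = (123456ULL & ~EXTENT_STATUS_FLAGS) |
		  (es_pblk & EXTENT_STATUS_FLAGS);
	/* ext4_es_store_status(): replace the flags, keep the block. */
	es_pblk = (EXTENT_STATUS_UNWRITTEN & EXTENT_STATUS_FLAGS) |
		  (es_pblk & ~EXTENT_STATUS_FLAGS);

	printf("pblock    = %" PRIu64 "\n", es_pblk & ~EXTENT_STATUS_FLAGS);
	printf("unwritten = %d\n", (es_pblk & EXTENT_STATUS_UNWRITTEN) != 0);
	return 0;   /* prints 123456 and 1 */
}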
#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d07c27ca594..64848b595b2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,  	/* Unaligned direct AIO must be serialized; see comment above */  	if (unaligned_aio) { -		static unsigned long unaligned_warn_time; - -		/* Warn about this once per day */ -		if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) -			ext4_msg(inode->i_sb, KERN_WARNING, -				 "Unaligned AIO/DIO on inode %ld by %s; " -				 "performance will be poor.", -				 inode->i_ino, current->comm);  		mutex_lock(ext4_aio_mutex(inode));  		ext4_unwritten_wait(inode);  	} @@ -175,7 +167,7 @@ static ssize_t  ext4_file_write(struct kiocb *iocb, const struct iovec *iov,  		unsigned long nr_segs, loff_t pos)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	ssize_t ret;  	/* @@ -248,7 +240,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)  			handle_t *handle;  			int err; -			handle = ext4_journal_start_sb(sb, 1); +			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);  			if (IS_ERR(handle))  				return PTR_ERR(handle);  			err = ext4_journal_get_write_access(handle, sbi->s_sbh); @@ -472,10 +464,8 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  		 * If there is a delay extent at this offset,  		 * it will be as a data.  		 */ -		es.start = last; -		(void)ext4_es_find_extent(inode, &es); -		if (last >= es.start && -		    last < es.start + es.len) { +		ext4_es_find_delayed_extent(inode, last, &es); +		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {  			if (last != start)  				dataoff = last << blkbits;  			break; @@ -557,11 +547,9 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)  		 * If there is a delay extent at this offset,  		 * we will skip this extent.  		 */ -		es.start = last; -		(void)ext4_es_find_extent(inode, &es); -		if (last >= es.start && -		    last < es.start + es.len) { -			last = es.start + es.len; +		ext4_es_find_delayed_extent(inode, last, &es); +		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { +			last = es.es_lblk + es.es_len;  			holeoff = last << blkbits;  			continue;  		} diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index dfbc1fe9667..3278e64e57b 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)   *   * What we do is just kick off a commit and wait on it.  This will snapshot the   * inode to disk. 
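The SEEK_DATA/SEEK_HOLE hunks above swap the open-coded range test for in_range(last, es.es_lblk, es.es_len), which is true exactly when last falls on one of the es_len blocks starting at es_lblk. A tiny sketch of that test (the macro body shown here mirrors ext4's usual closed-interval definition, reproduced for illustration):

#include <stdio.h>

/* ext4 defines in_range() essentially as this closed-interval test. */
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	/* A delayed extent caching blocks [10/5), i.e. 10..14. */
	printf("%d %d %d\n",
	       in_range(10, 10, 5),   /* 1: first covered block */
	       in_range(14, 10, 5),   /* 1: last covered block  */
	       in_range(15, 10, 5));  /* 0: one block past it   */
	return 0;
}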
- * - * i_mutex lock is held when entering and exiting this function   */  int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index fa8e4911d35..3d586f02883 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)  	/* Check to see if the seed is all zero's */  	if (hinfo->seed) {  		for (i = 0; i < 4; i++) { -			if (hinfo->seed[i]) +			if (hinfo->seed[i]) { +				memcpy(buf, hinfo->seed, sizeof(buf));  				break; +			}  		} -		if (i < 4) -			memcpy(buf, hinfo->seed, sizeof(buf));  	}  	switch (hinfo->hash_version) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3f32c801244..32fd2b9075d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,   * For other inodes, search forward from the parent directory's block   * group to find a free inode.   */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, -			     const struct qstr *qstr, __u32 goal, uid_t *owner) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, +			       umode_t mode, const struct qstr *qstr, +			       __u32 goal, uid_t *owner, int handle_type, +			       unsigned int line_no, int nblocks)  {  	struct super_block *sb;  	struct buffer_head *inode_bitmap_bh = NULL; @@ -725,6 +727,15 @@ repeat_in_this_group:  				   "inode=%lu", ino + 1);  			continue;  		} +		if (!handle) { +			BUG_ON(nblocks <= 0); +			handle = __ext4_journal_start_sb(dir->i_sb, line_no, +							 handle_type, nblocks); +			if (IS_ERR(handle)) { +				err = PTR_ERR(handle); +				goto fail; +			} +		}  		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");  		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);  		if (err) @@ -1017,17 +1028,17 @@ iget_failed:  	inode = NULL;  bad_orphan:  	ext4_warning(sb, "bad orphan inode %lu!  
e2fsck was run?", ino); -	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", +	printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",  	       bit, (unsigned long long)bitmap_bh->b_blocknr,  	       ext4_test_bit(bit, bitmap_bh->b_data)); -	printk(KERN_NOTICE "inode=%p\n", inode); +	printk(KERN_WARNING "inode=%p\n", inode);  	if (inode) { -		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", +		printk(KERN_WARNING "is_bad_inode(inode)=%d\n",  		       is_bad_inode(inode)); -		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", +		printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",  		       NEXT_ORPHAN(inode)); -		printk(KERN_NOTICE "max_ino=%lu\n", max_ino); -		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); +		printk(KERN_WARNING "max_ino=%lu\n", max_ino); +		printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);  		/* Avoid freeing blocks if we got a bad deleted inode */  		if (inode->i_nlink == 0)  			inode->i_blocks = 0; @@ -1137,7 +1148,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,  	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))  		goto out; -	handle = ext4_journal_start_sb(sb, 1); +	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle);  		goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 20862f96e8a..b505a145a59 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,  	struct super_block *sb = inode->i_sb;  	Indirect *p = chain;  	struct buffer_head *bh; +	int ret = -EIO;  	*err = 0;  	/* i_data is not going away, no lock needed */ @@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,  		goto no_block;  	while (--depth) {  		bh = sb_getblk(sb, le32_to_cpu(p->key)); -		if (unlikely(!bh)) +		if (unlikely(!bh)) { +			ret = -ENOMEM;  			goto failure; +		}  		if (!bh_uptodate_or_lock(bh)) {  			if (bh_submit_read(bh) < 0) { @@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,  	return NULL;  failure: -	*err = -EIO; +	*err = ret;  no_block:  	return p;  } @@ -355,9 +358,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,  			 * for the first direct block  			 */  			new_blocks[index] = current_block; -			printk(KERN_INFO "%s returned more blocks than " +			WARN(1, KERN_INFO "%s returned more blocks than "  						"requested\n", __func__); -			WARN_ON(1);  			break;  		}  	} @@ -471,7 +473,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,  		 */  		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);  		if (unlikely(!bh)) { -			err = -EIO; +			err = -ENOMEM;  			goto failed;  		} @@ -789,7 +791,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  		if (final_size > inode->i_size) {  			/* Credits for sb + inode write */ -			handle = ext4_journal_start(inode, 2); +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);  			if (IS_ERR(handle)) {  				ret = PTR_ERR(handle);  				goto out; @@ -849,7 +851,7 @@ locked:  		int err;  		/* Credits for sb + inode write */ -		handle = ext4_journal_start(inode, 2); +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);  		if (IS_ERR(handle)) {  			/* This is really bad luck. We've written the data  			 * but cannot extend i_size. 
Bail out and pretend @@ -948,7 +950,8 @@ static handle_t *start_transaction(struct inode *inode)  {  	handle_t *result; -	result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); +	result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, +				    ext4_blocks_for_truncate(inode));  	if (!IS_ERR(result))  		return result; @@ -1515,3 +1518,243 @@ out_stop:  	trace_ext4_truncate_exit(inode);  } +static int free_hole_blocks(handle_t *handle, struct inode *inode, +			    struct buffer_head *parent_bh, __le32 *i_data, +			    int level, ext4_lblk_t first, +			    ext4_lblk_t count, int max) +{ +	struct buffer_head *bh = NULL; +	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); +	int ret = 0; +	int i, inc; +	ext4_lblk_t offset; +	__le32 blk; + +	inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); +	for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { +		if (offset >= count + first) +			break; +		if (*i_data == 0 || (offset + inc) <= first) +			continue; +		blk = *i_data; +		if (level > 0) { +			ext4_lblk_t first2; +			bh = sb_bread(inode->i_sb, blk); +			if (!bh) { +				EXT4_ERROR_INODE_BLOCK(inode, blk, +						       "Read failure"); +				return -EIO; +			} +			first2 = (first > offset) ? first - offset : 0; +			ret = free_hole_blocks(handle, inode, bh, +					       (__le32 *)bh->b_data, level - 1, +					       first2, count - offset, +					       inode->i_sb->s_blocksize >> 2); +			if (ret) { +				brelse(bh); +				goto err; +			} +		} +		if (level == 0 || +		    (bh && all_zeroes((__le32 *)bh->b_data, +				      (__le32 *)bh->b_data + addr_per_block))) { +			ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); +			*i_data = 0; +		} +		brelse(bh); +		bh = NULL; +	} + +err: +	return ret; +} + +static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, +				 ext4_lblk_t first, ext4_lblk_t stop) +{ +	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); +	int level, ret = 0; +	int num = EXT4_NDIR_BLOCKS; +	ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; +	__le32 *i_data = EXT4_I(inode)->i_data; + +	count = stop - first; +	for (level = 0; level < 4; level++, max *= addr_per_block) { +		if (first < max) { +			ret = free_hole_blocks(handle, inode, NULL, i_data, +					       level, first, count, num); +			if (ret) +				goto err; +			if (count > max - first) +				count -= max - first; +			else +				break; +			first = 0; +		} else { +			first -= max; +		} +		i_data += num; +		if (level == 0) { +			num = 1; +			max = 1; +		} +	} + +err: +	return ret; +} + +int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) +{ +	struct inode *inode = file_inode(file); +	struct super_block *sb = inode->i_sb; +	ext4_lblk_t first_block, stop_block; +	struct address_space *mapping = inode->i_mapping; +	handle_t *handle = NULL; +	loff_t first_page, last_page, page_len; +	loff_t first_page_offset, last_page_offset; +	int err = 0; + +	/* +	 * Write out all dirty pages to avoid race conditions +	 * Then release them. 
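free_hole_blocks() above descends one level of the indirect tree per recursion, and inc = 1 << ((EXT4_BLOCK_SIZE_BITS(sb) - 2) * level) is the number of data blocks a single pointer spans at that level, since one block holds blocksize/4 pointers. A quick check of those strides for a hypothetical 4 KiB filesystem:

#include <stdio.h>

int main(void)
{
	/* Hypothetical 4 KiB blocks: 4096/4 = 1024 block pointers fit in
	 * one indirect block, so EXT4_BLOCK_SIZE_BITS(sb) - 2 == 10. */
	int blocksize_bits = 12;
	int level;

	for (level = 0; level <= 2; level++) {
		long inc = 1L << ((blocksize_bits - 2) * level);
		printf("level %d: one pointer spans %ld data blocks\n",
		       level, inc);
	}
	return 0;   /* prints 1, 1024, 1048576 */
}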
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		err = filemap_write_and_wait_range(mapping,
+			offset, offset + length - 1);
+		if (err)
+			return err;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	/* It's not possible to punch a hole on an append-only file */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		err = -EPERM;
+		goto out_mutex;
+	}
+	if (IS_SWAPFILE(inode)) {
+		err = -ETXTBSY;
+		goto out_mutex;
+	}
+
+	/* No need to punch hole beyond i_size */
+	if (offset >= inode->i_size)
+		goto out_mutex;
+
+	/*
+	 * If the hole extends beyond i_size, set the hole
+	 * to end after the page that contains i_size
+	 */
+	if (offset + length > inode->i_size) {
+		length = inode->i_size +
+		    PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		    offset;
+	}
+
+	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+	first_page_offset = first_page << PAGE_CACHE_SHIFT;
+	last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+	/* Now release the pages */
+	if (last_page_offset > first_page_offset) {
+		truncate_pagecache_range(inode, first_page_offset,
+					 last_page_offset - 1);
+	}
+
+	/* Wait for all existing dio work; newcomers will block on i_mutex */
+	inode_dio_wait(inode);
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle))
+		goto out_mutex;
+
+	/*
+	 * Now we need to zero out the non-page-aligned data in the
+	 * pages at the start and tail of the hole, and unmap the buffer
+	 * heads for the block aligned regions of the page that were
+	 * completely zeroed.
+	 */
+	if (first_page > last_page) {
+		/*
+		 * If the file space being truncated is contained within a
+		 * page, just zero out and unmap the middle of that page
+		 */
+		err = ext4_discard_partial_page_buffers(handle,
+			mapping, offset, length, 0);
+		if (err)
+			goto out;
+	} else {
+		/*
+		 * Zero out and unmap the partial page that contains
+		 * the start of the hole
+		 */
+		page_len = first_page_offset - offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+							offset, page_len, 0);
+			if (err)
+				goto out;
+		}
+
+		/*
+		 * Zero out and unmap the partial page that contains
+		 * the end of the hole
+		 */
+		page_len = offset + length - last_page_offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+						last_page_offset, page_len, 0);
+			if (err)
+				goto out;
+		}
+	}
+
+	/*
+	 * If i_size is contained in the last page, we need to
+	 * unmap and zero the partial page after i_size
+	 */
+	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+	    inode->i_size % PAGE_CACHE_SIZE != 0) {
+		page_len = PAGE_CACHE_SIZE -
+			(inode->i_size & (PAGE_CACHE_SIZE - 1));
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle,
+				mapping, inode->i_size, page_len, 0);
+			if (err)
+				goto out;
+		}
+	}
+
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	if (first_block >= stop_block)
+		goto out;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	err = ext4_es_remove_extent(inode, first_block,
+				    stop_block - first_block);
+	err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
+
+	ext4_discard_preallocations(inode);
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+	inode->i_mtime = inode->i_ctime =
ext4_current_time(inode); +	ext4_mark_inode_dirty(handle, inode); +	ext4_journal_stop(handle); + +out_mutex: +	mutex_unlock(&inode->i_mutex); + +	return err; +} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 387c47c6cda..c0fd1a123f7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -545,7 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,  		return ret;  retry: -	handle = ext4_journal_start(inode, needed_blocks); +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle);  		handle = NULL; @@ -657,7 +657,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,  	 * The possible write could happen in the inode,  	 * so try to reserve the space in inode first.  	 */ -	handle = ext4_journal_start(inode, 1); +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle);  		handle = NULL; @@ -853,7 +853,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  	if (ret)  		return ret; -	handle = ext4_journal_start(inode, 1); +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle);  		handle = NULL; @@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,  	data_bh = sb_getblk(inode->i_sb, map.m_pblk);  	if (!data_bh) { -		error = -EIO; +		error = -ENOMEM;  		goto out_restore;  	} @@ -1298,7 +1298,7 @@ int ext4_read_inline_dir(struct file *filp,  	int i, stored;  	struct ext4_dir_entry_2 *de;  	struct super_block *sb; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int ret, inline_size = 0;  	struct ext4_iloc iloc;  	void *dir_buf = NULL; @@ -1770,7 +1770,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)  	needed_blocks = ext4_writepage_trans_blocks(inode); -	handle = ext4_journal_start(inode, needed_blocks); +	handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);  	if (IS_ERR(handle))  		return; @@ -1862,7 +1862,7 @@ int ext4_convert_inline_data(struct inode *inode)  	if (error)  		return error; -	handle = ext4_journal_start(inode, needed_blocks); +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);  	if (IS_ERR(handle)) {  		error = PTR_ERR(handle);  		goto out_free; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb1c1ab2720..9ea0cde3fa9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,  }  static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, -				   struct buffer_head *bh_result, int create); -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);  static int __ext4_journalled_writepage(struct page *page, unsigned int len);  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);  static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, @@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)  	 * protection against it  	 */  	sb_start_intwrite(inode->i_sb); -	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, +				    ext4_blocks_for_truncate(inode)+3);  	if (IS_ERR(handle)) {  		ext4_std_error(inode->i_sb, PTR_ERR(handle));  		/* @@ 
-346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,  	spin_lock(&ei->i_block_reservation_lock);  	trace_ext4_da_update_reserve_space(inode, used, quota_claim);  	if (unlikely(used > ei->i_reserved_data_blocks)) { -		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " +		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "  			 "with only %d reserved data blocks",  			 __func__, inode->i_ino, used,  			 ei->i_reserved_data_blocks); @@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,  	}  	if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { -		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " -			 "with only %d reserved metadata blocks\n", __func__, -			 inode->i_ino, ei->i_allocated_meta_blocks, -			 ei->i_reserved_meta_blocks); +		ext4_warning(inode->i_sb, "ino %lu, allocated %d " +			"with only %d reserved metadata blocks " +			"(releasing %d blocks with reserved %d data blocks)", +			inode->i_ino, ei->i_allocated_meta_blocks, +			     ei->i_reserved_meta_blocks, used, +			     ei->i_reserved_data_blocks);  		WARN_ON(1);  		ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;  	} @@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,  int ext4_map_blocks(handle_t *handle, struct inode *inode,  		    struct ext4_map_blocks *map, int flags)  { +	struct extent_status es;  	int retval;  	map->m_flags = 0;  	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"  		  "logical block %lu\n", inode->i_ino, flags, map->m_len,  		  (unsigned long) map->m_lblk); + +	/* Lookup extent status tree firstly */ +	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { +		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { +			map->m_pblk = ext4_es_pblock(&es) + +					map->m_lblk - es.es_lblk; +			map->m_flags |= ext4_es_is_written(&es) ? +					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; +			retval = es.es_len - (map->m_lblk - es.es_lblk); +			if (retval > map->m_len) +				retval = map->m_len; +			map->m_len = retval; +		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { +			retval = 0; +		} else { +			BUG_ON(1); +		} +		goto found; +	} +  	/*  	 * Try to see if we can get the block without requesting a new  	 * file system block. @@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		retval = ext4_ind_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE);  	} +	if (retval > 0) { +		int ret; +		unsigned long long status; + +		status = map->m_flags & EXT4_MAP_UNWRITTEN ? +				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; +		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && +		    ext4_find_delalloc_range(inode, map->m_lblk, +					     map->m_lblk + map->m_len - 1)) +			status |= EXTENT_STATUS_DELAYED; +		ret = ext4_es_insert_extent(inode, map->m_lblk, +					    map->m_len, map->m_pblk, status); +		if (ret < 0) +			retval = ret; +	}  	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))  		up_read((&EXT4_I(inode)->i_data_sem)); +found:  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { -		int ret; -		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { -			/* delayed alloc may be allocated by fallocate and -			 * coverted to initialized by directIO. -			 * we need to handle delayed extent here. 
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		return retval;
 
 	/*
-	 * When we call get_blocks without the create flag, the
-	 * BH_Unwritten flag could have gotten set if the blocks
-	 * requested were part of a uninitialized extent.  We need to
-	 * clear this flag now that we are committed to convert all or
-	 * part of the uninitialized extent to be an initialized
-	 * extent.  This is because we need to avoid the combination
-	 * of BH_Unwritten and BH_Mapped flags being simultaneously
-	 * set on the buffer_head.
+	 * Here we clear m_flags because after allocating a new extent,
+	 * it will be set again.
 	 */
-	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
-		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-			int ret;
-delayed_mapped:
-			/* delayed allocation blocks has been allocated */
-			ret = ext4_es_remove_extent(inode, map->m_lblk,
-						    map->m_len);
-			if (ret < 0)
-				retval = ret;
-		}
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-		handle = ext4_journal_start(inode, dio_credits);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	/* ensure we send some value back into *errp */
 	*errp = 0;
 
+	if (create && err == 0)
+		err = -ENOSPC;	/* should never happen */
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (!bh) {
-		*errp = -EIO;
+	if (unlikely(!bh)) {
+		*errp = -ENOMEM;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page.  So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
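Both ext4_map_blocks() paths above now feed their result into the status tree with the same recipe: WRITTEN or UNWRITTEN according to the map flags, with DELAYED kept in addition while the range still carries a delalloc reservation. A compact sketch of that selection, using stand-in constants rather than the kernel's:

#include <stdio.h>

/* Stand-in constants; the real values live in ext4.h and
 * extents_status.h, these are illustrative only. */
#define MAP_UNWRITTEN 0x1
#define ST_WRITTEN    0x10
#define ST_UNWRITTEN  0x20
#define ST_DELAYED    0x40

/* Mirrors the status selection both ext4_map_blocks() paths share. */
static int map_to_status(int map_flags, int still_has_delalloc)
{
	int status = (map_flags & MAP_UNWRITTEN) ? ST_UNWRITTEN : ST_WRITTEN;

	if (still_has_delalloc)
		status |= ST_DELAYED;   /* keep the delalloc marker alive */
	return status;
}

int main(void)
{
	printf("%#x\n", map_to_status(0, 0));              /* 0x10 */
	printf("%#x\n", map_to_status(MAP_UNWRITTEN, 1));  /* 0x60 */
	return 0;
}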
+ * Also, this function can nest inside ext4_writepage().  In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page.  So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC.   *   * By accident, ext4 can be reentered when a transaction is open via   * quota file writes.  If we were to commit the transaction while thus @@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,  		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,  						    flags, pagep);  		if (ret < 0) -			goto out; -		if (ret == 1) { -			ret = 0; -			goto out; -		} +			return ret; +		if (ret == 1) +			return 0;  	} -retry: -	handle = ext4_journal_start(inode, needed_blocks); +	/* +	 * grab_cache_page_write_begin() can take a long time if the +	 * system is thrashing due to memory pressure, or if the page +	 * is being written back.  So grab it first before we start +	 * the transaction handle.  This also allows us to allocate +	 * the page (if needed) without using GFP_NOFS. +	 */ +retry_grab: +	page = grab_cache_page_write_begin(mapping, index, flags); +	if (!page) +		return -ENOMEM; +	unlock_page(page); + +retry_journal: +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);  	if (IS_ERR(handle)) { -		ret = PTR_ERR(handle); -		goto out; +		page_cache_release(page); +		return PTR_ERR(handle);  	} -	/* We cannot recurse into the filesystem as the transaction is already -	 * started */ -	flags |= AOP_FLAG_NOFS; - -	page = grab_cache_page_write_begin(mapping, index, flags); -	if (!page) { +	lock_page(page); +	if (page->mapping != mapping) { +		/* The page got truncated from under us */ +		unlock_page(page); +		page_cache_release(page);  		ext4_journal_stop(handle); -		ret = -ENOMEM; -		goto out; +		goto retry_grab;  	} - -	*pagep = page; +	wait_on_page_writeback(page);  	if (ext4_should_dioread_nolock(inode))  		ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -918,7 +954,6 @@ retry:  	if (ret) {  		unlock_page(page); -		page_cache_release(page);  		/*  		 * __block_write_begin may have instantiated a few blocks  		 * outside i_size.  Trim these off again. Don't need @@ -942,11 +977,14 @@ retry:  			if (inode->i_nlink)  				ext4_orphan_del(NULL, inode);  		} -	} -	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -		goto retry; -out: +		if (ret == -ENOSPC && +		    ext4_should_retry_alloc(inode->i_sb, &retries)) +			goto retry_journal; +		page_cache_release(page); +		return ret; +	} +	*pagep = page;  	return ret;  } @@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)  		 * function is called from invalidate page, it's  		 * harmless to return without any action.  		 
*/ -		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " +		ext4_warning(inode->i_sb, "ext4_da_release_space: "  			 "ino %lu, to_free %d with only %d reserved "  			 "data blocks", inode->i_ino, to_free,  			 ei->i_reserved_data_blocks); @@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,  	loff_t size = i_size_read(inode);  	unsigned int len, block_start;  	struct buffer_head *bh, *page_bufs = NULL; -	int journal_data = ext4_should_journal_data(inode);  	sector_t pblock = 0, cur_logical = 0;  	struct ext4_io_submit io_submit; @@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,  		if (nr_pages == 0)  			break;  		for (i = 0; i < nr_pages; i++) { -			int commit_write = 0, skip_page = 0; +			int skip_page = 0;  			struct page *page = pvec.pages[i];  			index = page->index; @@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,  			BUG_ON(!PageLocked(page));  			BUG_ON(PageWriteback(page)); -			/* -			 * If the page does not have buffers (for -			 * whatever reason), try to create them using -			 * __block_write_begin.  If this fails, -			 * skip the page and move on. -			 */ -			if (!page_has_buffers(page)) { -				if (__block_write_begin(page, 0, len, -						noalloc_get_block_write)) { -				skip_page: -					unlock_page(page); -					continue; -				} -				commit_write = 1; -			} -  			bh = page_bufs = page_buffers(page);  			block_start = 0;  			do { -				if (!bh) -					goto skip_page;  				if (map && (cur_logical >= map->m_lblk) &&  				    (cur_logical <= (map->m_lblk +  						     (map->m_len - 1)))) { @@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,  				pblock++;  			} while (bh != page_bufs); -			if (skip_page) -				goto skip_page; - -			if (commit_write) -				/* mark the buffer_heads as dirty & uptodate */ -				block_commit_write(page, 0, len); +			if (skip_page) { +				unlock_page(page); +				continue; +			}  			clear_page_dirty_for_io(page); -			/* -			 * Delalloc doesn't support data journalling, -			 * but eventually maybe we'll lift this -			 * restriction. -			 */ -			if (unlikely(journal_data && PageChecked(page))) -				err = __ext4_journalled_writepage(page, len); -			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) -				err = ext4_bio_write_page(&io_submit, page, -							  len, mpd->wbc); -			else if (buffer_uninit(page_bufs)) { -				ext4_set_bh_endio(page_bufs, inode); -				err = block_write_full_page_endio(page, -					noalloc_get_block_write, -					mpd->wbc, ext4_end_io_buffer_write); -			} else -				err = block_write_full_page(page, -					noalloc_get_block_write, mpd->wbc); - +			err = ext4_bio_write_page(&io_submit, page, len, +						  mpd->wbc);  			if (!err)  				mpd->pages_written++;  			/* @@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)  				 (unsigned long long) next,  				 mpd->b_size >> mpd->inode->i_blkbits, err);  			ext4_msg(sb, KERN_CRIT, -				"This should not happen!! Data will be lost\n"); +				"This should not happen!! Data will be lost");  			if (err == -ENOSPC)  				ext4_print_free_blocks(mpd->inode);  		} @@ -1690,16 +1690,16 @@ submit_io:   *   * @mpd->lbh - extent of blocks   * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) + * @b_state - b_state of the buffer head added   *   * the function is used to collect contig. 
blocks in same state   */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, -				   sector_t logical, size_t b_size, +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,  				   unsigned long b_state)  {  	sector_t next; -	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; +	int blkbits = mpd->inode->i_blkbits; +	int nrblocks = mpd->b_size >> blkbits;  	/*  	 * XXX Don't go larger than mballoc is willing to allocate @@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,  	 * mpage_da_submit_io() into this function and then call  	 * ext4_map_blocks() multiple times in a loop  	 */ -	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) +	if (nrblocks >= (8*1024*1024 >> blkbits))  		goto flush_it; -	/* check if thereserved journal credits might overflow */ -	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { +	/* check if the reserved journal credits might overflow */ +	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {  		if (nrblocks >= EXT4_MAX_TRANS_DATA) {  			/*  			 * With non-extent format we are limited by the journal @@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,  			 * nrblocks.  So limit nrblocks.  			 */  			goto flush_it; -		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > -				EXT4_MAX_TRANS_DATA) { -			/* -			 * Adding the new buffer_head would make it cross the -			 * allowed limit for which we have journal credit -			 * reserved. So limit the new bh->b_size -			 */ -			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << -						mpd->inode->i_blkbits; -			/* we will do mpage_da_submit_io in the next loop */  		}  	}  	/* @@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,  	 */  	if (mpd->b_size == 0) {  		mpd->b_blocknr = logical; -		mpd->b_size = b_size; +		mpd->b_size = 1 << blkbits;  		mpd->b_state = b_state & BH_FLAGS;  		return;  	} @@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,  	 * Can we merge the block to our big extent?  	 */  	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { -		mpd->b_size += b_size; +		mpd->b_size += 1 << blkbits;  		return;  	} @@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  			      struct ext4_map_blocks *map,  			      struct buffer_head *bh)  { +	struct extent_status es;  	int retval;  	sector_t invalid_block = ~((sector_t) 0xffff); @@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"  		  "logical block %lu\n", inode->i_ino, map->m_len,  		  (unsigned long) map->m_lblk); + +	/* Lookup extent status tree firstly */ +	if (ext4_es_lookup_extent(inode, iblock, &es)) { + +		if (ext4_es_is_hole(&es)) { +			retval = 0; +			down_read((&EXT4_I(inode)->i_data_sem)); +			goto add_delayed; +		} + +		/* +		 * Delayed extent could be allocated by fallocate. +		 * So we need to check it. 
+		 */ +		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { +			map_bh(bh, inode->i_sb, invalid_block); +			set_buffer_new(bh); +			set_buffer_delay(bh); +			return 0; +		} + +		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; +		retval = es.es_len - (iblock - es.es_lblk); +		if (retval > map->m_len) +			retval = map->m_len; +		map->m_len = retval; +		if (ext4_es_is_written(&es)) +			map->m_flags |= EXT4_MAP_MAPPED; +		else if (ext4_es_is_unwritten(&es)) +			map->m_flags |= EXT4_MAP_UNWRITTEN; +		else +			BUG_ON(1); + +		return retval; +	} +  	/*  	 * Try to see if we can get the block without requesting a new  	 * file system block. @@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  			map->m_flags |= EXT4_MAP_FROM_CLUSTER;  		retval = 0;  	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) -		retval = ext4_ext_map_blocks(NULL, inode, map, 0); +		retval = ext4_ext_map_blocks(NULL, inode, map, +					     EXT4_GET_BLOCKS_NO_PUT_HOLE);  	else -		retval = ext4_ind_map_blocks(NULL, inode, map, 0); +		retval = ext4_ind_map_blocks(NULL, inode, map, +					     EXT4_GET_BLOCKS_NO_PUT_HOLE); +add_delayed:  	if (retval == 0) { +		int ret;  		/*  		 * XXX: __block_prepare_write() unmaps passed block,  		 * is it OK? @@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  		/* If the block was allocated from previously allocated cluster,  		 * then we dont need to reserve it again. */  		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { -			retval = ext4_da_reserve_space(inode, iblock); -			if (retval) +			ret = ext4_da_reserve_space(inode, iblock); +			if (ret) {  				/* not enough space to reserve */ +				retval = ret;  				goto out_unlock; +			}  		} -		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); -		if (retval) +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, +					    ~0, EXTENT_STATUS_DELAYED); +		if (ret) { +			retval = ret;  			goto out_unlock; +		}  		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served  		 * and it should not appear on the bh->b_state. @@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  		map_bh(bh, inode->i_sb, invalid_block);  		set_buffer_new(bh);  		set_buffer_delay(bh); +	} else if (retval > 0) { +		int ret; +		unsigned long long status; + +		status = map->m_flags & EXT4_MAP_UNWRITTEN ? +				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, +					    map->m_pblk, status); +		if (ret != 0) +			retval = ret;  	}  out_unlock: @@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,  	return 0;  } -/* - * This function is used as a standard get_block_t calback function - * when there is no desire to allocate any blocks.  It is used as a - * callback function for block_write_begin() and block_write_full_page(). - * These functions should only try to map a single block at a time. - * - * Since this function doesn't do block allocations even if the caller - * requests it by passing in create=1, it is critically important that - * any caller checks to make sure that any buffer heads are returned - * by this function are either all already mapped or marked for - * delayed allocation before calling  block_write_full_page().  Otherwise, - * b_blocknr could be left unitialized, and the page write functions will - * be taken by surprise. 
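Taken together, the ext4_da_map_blocks() lookup above resolves a cache hit four ways. A toy summary of that decision table (user-space, with illustrative strings rather than kernel calls; note that an extent that is both delayed and unwritten falls through to the unwritten case):

#include <stdio.h>

enum es_kind { ES_HOLE, ES_DELAYED, ES_WRITTEN, ES_UNWRITTEN };

/* Toy decision table for the cache hit in ext4_da_map_blocks();
 * the strings summarize the actions, they are not kernel APIs. */
static const char *da_lookup_action(enum es_kind kind)
{
	switch (kind) {
	case ES_HOLE:
		return "reserve space, cache as delayed, map bh to sentinel";
	case ES_DELAYED:
		return "map bh to sentinel, set BH_New and BH_Delay";
	case ES_WRITTEN:
		return "map bh to the real pblk, flag EXT4_MAP_MAPPED";
	case ES_UNWRITTEN:
		return "map bh to the real pblk, flag EXT4_MAP_UNWRITTEN";
	}
	return "?";
}

int main(void)
{
	int k;

	for (k = ES_HOLE; k <= ES_UNWRITTEN; k++)
		printf("%s\n", da_lookup_action((enum es_kind)k));
	return 0;
}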
- */ -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, -				   struct buffer_head *bh_result, int create) -{ -	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); -	return _ext4_get_block(inode, iblock, bh_result, 0); -} -  static int bget_one(handle_t *handle, struct buffer_head *bh)  {  	get_bh(bh); @@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,  	 * references to buffers so we are safe */  	unlock_page(page); -	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, +				    ext4_writepage_trans_blocks(inode));  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle);  		goto out; @@ -2035,11 +2061,12 @@ out:  static int ext4_writepage(struct page *page,  			  struct writeback_control *wbc)  { -	int ret = 0, commit_write = 0; +	int ret = 0;  	loff_t size;  	unsigned int len;  	struct buffer_head *page_bufs = NULL;  	struct inode *inode = page->mapping->host; +	struct ext4_io_submit io_submit;  	trace_ext4_writepage(page);  	size = i_size_read(inode); @@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,  	else  		len = PAGE_CACHE_SIZE; +	page_bufs = page_buffers(page);  	/* -	 * If the page does not have buffers (for whatever reason), -	 * try to create them using __block_write_begin.  If this -	 * fails, redirty the page and move on. +	 * We cannot do block allocation or other extent handling in this +	 * function. If there are buffers needing that, we have to redirty +	 * the page. But we may reach here when we do a journal commit via +	 * journal_submit_inode_data_buffers() and in that case we must write +	 * allocated buffers to achieve data=ordered mode guarantees.  	 */ -	if (!page_has_buffers(page)) { -		if (__block_write_begin(page, 0, len, -					noalloc_get_block_write)) { -		redirty_page: -			redirty_page_for_writepage(wbc, page); +	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, +				   ext4_bh_delay_or_unwritten)) { +		redirty_page_for_writepage(wbc, page); +		if (current->flags & PF_MEMALLOC) { +			/* +			 * For memory cleaning there's no point in writing only +			 * some buffers. So just bail out. Warn if we came here +			 * from direct reclaim. +			 */ +			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) +							== PF_MEMALLOC);  			unlock_page(page);  			return 0;  		} -		commit_write = 1;  	} -	page_bufs = page_buffers(page); -	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, -				   ext4_bh_delay_or_unwritten)) { -		/* -		 * We don't want to do block allocation, so redirty -		 * the page and return.  We may reach here when we do -		 * a journal commit via journal_submit_inode_data_buffers. 
-		 * We can also reach here via shrink_page_list but it -		 * should never be for direct reclaim so warn if that -		 * happens -		 */ -		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == -								PF_MEMALLOC); -		goto redirty_page; -	} -	if (commit_write) -		/* now mark the buffer_heads as dirty and uptodate */ -		block_commit_write(page, 0, len);  	if (PageChecked(page) && ext4_should_journal_data(inode))  		/* @@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,  		 */  		return __ext4_journalled_writepage(page, len); -	if (buffer_uninit(page_bufs)) { -		ext4_set_bh_endio(page_bufs, inode); -		ret = block_write_full_page_endio(page, noalloc_get_block_write, -					    wbc, ext4_end_io_buffer_write); -	} else -		ret = block_write_full_page(page, noalloc_get_block_write, -					    wbc); - +	memset(&io_submit, 0, sizeof(io_submit)); +	ret = ext4_bio_write_page(&io_submit, page, len, wbc); +	ext4_io_submit(&io_submit);  	return ret;  } @@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,  			logical = (sector_t) page->index <<  				(PAGE_CACHE_SHIFT - inode->i_blkbits); -			if (!page_has_buffers(page)) { -				mpage_add_bh_to_extent(mpd, logical, -						       PAGE_CACHE_SIZE, -						       (1 << BH_Dirty) | (1 << BH_Uptodate)); -				if (mpd->io_done) -					goto ret_extent_tail; -			} else { +			/* Add all dirty buffers to mpd */ +			head = page_buffers(page); +			bh = head; +			do { +				BUG_ON(buffer_locked(bh));  				/* -				 * Page with regular buffer heads, -				 * just add all dirty ones +				 * We need to try to allocate unmapped blocks +				 * in the same page.  Otherwise we won't make +				 * progress with the page in ext4_writepage  				 */ -				head = page_buffers(page); -				bh = head; -				do { -					BUG_ON(buffer_locked(bh)); +				if (ext4_bh_delay_or_unwritten(NULL, bh)) { +					mpage_add_bh_to_extent(mpd, logical, +							       bh->b_state); +					if (mpd->io_done) +						goto ret_extent_tail; +				} else if (buffer_dirty(bh) && +					   buffer_mapped(bh)) {  					/* -					 * We need to try to allocate -					 * unmapped blocks in the same page. -					 * Otherwise we won't make progress -					 * with the page in ext4_writepage +					 * mapped dirty buffer. We need to +					 * update the b_state because we look +					 * at b_state in mpage_da_map_blocks. +					 * We don't update b_size because if we +					 * find an unmapped buffer_head later +					 * we need to use the b_state flag of +					 * that buffer_head.  					 */ -					if (ext4_bh_delay_or_unwritten(NULL, bh)) { -						mpage_add_bh_to_extent(mpd, logical, -								       bh->b_size, -								       bh->b_state); -						if (mpd->io_done) -							goto ret_extent_tail; -					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) { -						/* -						 * mapped dirty buffer. We need -						 * to update the b_state -						 * because we look at b_state -						 * in mpage_da_map_blocks.  We -						 * don't update b_size because -						 * if we find an unmapped -						 * buffer_head later we need to -						 * use the b_state flag of that -						 * buffer_head. 
-						 */ -						if (mpd->b_size == 0) -							mpd->b_state = bh->b_state & BH_FLAGS; -					} -					logical++; -				} while ((bh = bh->b_this_page) != head); -			} +					if (mpd->b_size == 0) +						mpd->b_state = +							bh->b_state & BH_FLAGS; +				} +				logical++; +			} while ((bh = bh->b_this_page) != head);  			if (nr_to_write > 0) {  				nr_to_write--; @@ -2413,7 +2412,8 @@ retry:  		needed_blocks = ext4_da_writepages_trans_blocks(inode);  		/* start a new transaction*/ -		handle = ext4_journal_start(inode, needed_blocks); +		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, +					    needed_blocks);  		if (IS_ERR(handle)) {  			ret = PTR_ERR(handle);  			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " @@ -2512,12 +2512,8 @@ static int ext4_nonda_switch(struct super_block *sb)  	/*  	 * Start pushing delalloc when 1/2 of free blocks are dirty.  	 */ -	if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && -	    !writeback_in_progress(sb->s_bdi) && -	    down_read_trylock(&sb->s_umount)) { -		writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); -		up_read(&sb->s_umount); -	} +	if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) +		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);  	if (2 * free_blocks < 3 * dirty_blocks ||  		free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { @@ -2555,42 +2551,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,  						      pos, len, flags,  						      pagep, fsdata);  		if (ret < 0) -			goto out; -		if (ret == 1) { -			ret = 0; -			goto out; -		} +			return ret; +		if (ret == 1) +			return 0;  	} -retry: +	/* +	 * grab_cache_page_write_begin() can take a long time if the +	 * system is thrashing due to memory pressure, or if the page +	 * is being written back.  So grab it first before we start +	 * the transaction handle.  This also allows us to allocate +	 * the page (if needed) without using GFP_NOFS. +	 */ +retry_grab: +	page = grab_cache_page_write_begin(mapping, index, flags); +	if (!page) +		return -ENOMEM; +	unlock_page(page); +  	/*  	 * With delayed allocation, we don't log the i_disksize update  	 * if there is delayed block allocation. But we still need  	 * to journalling the i_disksize update if writes to the end  	 * of file which has an already mapped buffer.  	 */ -	handle = ext4_journal_start(inode, 1); +retry_journal: +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);  	if (IS_ERR(handle)) { -		ret = PTR_ERR(handle); -		goto out; +		page_cache_release(page); +		return PTR_ERR(handle);  	} -	/* We cannot recurse into the filesystem as the transaction is already -	 * started */ -	flags |= AOP_FLAG_NOFS; -	page = grab_cache_page_write_begin(mapping, index, flags); -	if (!page) { +	lock_page(page); +	if (page->mapping != mapping) { +		/* The page got truncated from under us */ +		unlock_page(page); +		page_cache_release(page);  		ext4_journal_stop(handle); -		ret = -ENOMEM; -		goto out; +		goto retry_grab;  	} -	*pagep = page; +	/* In case writeback began while the page was unlocked */ +	wait_on_page_writeback(page);  	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);  	if (ret < 0) {  		unlock_page(page);  		ext4_journal_stop(handle); -		page_cache_release(page);  		/*  		 * block_write_begin may have instantiated a few blocks  		 * outside i_size.  Trim these off again. 
Don't need @@ -2598,11 +2604,16 @@ retry:  		 */  		if (pos + len > inode->i_size)  			ext4_truncate_failed_write(inode); + +		if (ret == -ENOSPC && +		    ext4_should_retry_alloc(inode->i_sb, &retries)) +			goto retry_journal; + +		page_cache_release(page); +		return ret;  	} -	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -		goto retry; -out: +	*pagep = page;  	return ret;  } @@ -2858,47 +2869,37 @@ ext4_readpages(struct file *file, struct address_space *mapping,  	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);  } -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned long offset)  { -	struct buffer_head *head, *bh; -	unsigned int curr_off = 0; +	trace_ext4_invalidatepage(page, offset); -	if (!page_has_buffers(page)) -		return; -	head = bh = page_buffers(page); -	do { -		if (offset <= curr_off && test_clear_buffer_uninit(bh) -					&& bh->b_private) { -			ext4_free_io_end(bh->b_private); -			bh->b_private = NULL; -			bh->b_end_io = NULL; -		} -		curr_off = curr_off + bh->b_size; -		bh = bh->b_this_page; -	} while (bh != head); +	/* No journalling happens on data buffers when this function is used */ +	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + +	block_invalidatepage(page, offset);  } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static int __ext4_journalled_invalidatepage(struct page *page, +					    unsigned long offset)  {  	journal_t *journal = EXT4_JOURNAL(page->mapping->host); -	trace_ext4_invalidatepage(page, offset); +	trace_ext4_journalled_invalidatepage(page, offset);  	/* -	 * free any io_end structure allocated for buffers to be discarded -	 */ -	if (ext4_should_dioread_nolock(page->mapping->host)) -		ext4_invalidatepage_free_endio(page, offset); -	/*  	 * If it's a full truncate we just forget about the pending dirtying  	 */  	if (offset == 0)  		ClearPageChecked(page); -	if (journal) -		jbd2_journal_invalidatepage(journal, page, offset); -	else -		block_invalidatepage(page, offset); +	return jbd2_journal_invalidatepage(journal, page, offset); +} + +/* Wrapper for aops... 
*/ +static void ext4_journalled_invalidatepage(struct page *page, +					   unsigned long offset) +{ +	WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);  }  static int ext4_releasepage(struct page *page, gfp_t wait) @@ -2943,7 +2944,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  			    ssize_t size, void *private, int ret,  			    bool is_async)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);          ext4_io_end_t *io_end = iocb->private;  	/* if not async direct IO or dio with 0 bytes write, just return */ @@ -2961,9 +2962,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {  		ext4_free_io_end(io_end);  out: +		inode_dio_done(inode);  		if (is_async)  			aio_complete(iocb, ret, 0); -		inode_dio_done(inode);  		return;  	} @@ -2977,65 +2978,6 @@ out:  	ext4_add_complete_io(io_end);  } -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -{ -	ext4_io_end_t *io_end = bh->b_private; -	struct inode *inode; - -	if (!test_clear_buffer_uninit(bh) || !io_end) -		goto out; - -	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { -		ext4_msg(io_end->inode->i_sb, KERN_INFO, -			 "sb umounted, discard end_io request for inode %lu", -			 io_end->inode->i_ino); -		ext4_free_io_end(io_end); -		goto out; -	} - -	/* -	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, -	 * but being more careful is always safe for the future change. -	 */ -	inode = io_end->inode; -	ext4_set_io_unwritten_flag(inode, io_end); -	ext4_add_complete_io(io_end); -out: -	bh->b_private = NULL; -	bh->b_end_io = NULL; -	clear_buffer_uninit(bh); -	end_buffer_async_write(bh, uptodate); -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) -{ -	ext4_io_end_t *io_end; -	struct page *page = bh->b_page; -	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; -	size_t size = bh->b_size; - -retry: -	io_end = ext4_init_io_end(inode, GFP_ATOMIC); -	if (!io_end) { -		pr_warn_ratelimited("%s: allocation fail\n", __func__); -		schedule(); -		goto retry; -	} -	io_end->offset = offset; -	io_end->size = size; -	/* -	 * We need to hold a reference to the page to make sure it -	 * doesn't get evicted before ext4_end_io_work() has a chance -	 * to convert the extent from written to unwritten. 
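This is the first of many conversions in this patch from iocb->ki_filp->f_path.dentry->d_inode (or filp->f_dentry->d_inode) to file_inode(). As far as I can tell, that helper was introduced in the same development cycle and is essentially the following (paraphrased, not part of this patch); it reads an inode pointer cached in struct file instead of going through the dentry:

/* Assumed shape of the helper, paraphrased from include/linux/fs.h: */
static inline struct inode *file_inode(const struct file *f)
{
	return f->f_inode;	/* cached at open time */
}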
-	 */ -	io_end->page = page; -	get_page(io_end->page); - -	bh->b_private = io_end; -	bh->b_end_io = ext4_end_io_buffer_write; -	return 0; -} -  /*   * For ext4 extent files, ext4 will do direct-io write to holes,   * preallocated extents, and those write extend the file, no need to @@ -3264,7 +3206,7 @@ static const struct address_space_operations ext4_journalled_aops = {  	.write_end		= ext4_journalled_write_end,  	.set_page_dirty		= ext4_journalled_set_page_dirty,  	.bmap			= ext4_bmap, -	.invalidatepage		= ext4_invalidatepage, +	.invalidatepage		= ext4_journalled_invalidatepage,  	.releasepage		= ext4_releasepage,  	.direct_IO		= ext4_direct_IO,  	.is_partially_uptodate  = block_is_partially_uptodate, @@ -3537,20 +3479,20 @@ int ext4_can_truncate(struct inode *inode)  int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	if (!S_ISREG(inode->i_mode))  		return -EOPNOTSUPP; -	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { -		/* TODO: Add support for non extent hole punching */ -		return -EOPNOTSUPP; -	} +	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) +		return ext4_ind_punch_hole(file, offset, length);  	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {  		/* TODO: Add support for bigalloc file systems */  		return -EOPNOTSUPP;  	} +	trace_ext4_punch_hole(inode, offset, length); +  	return ext4_ext_punch_hole(file, offset, length);  } @@ -3644,11 +3586,8 @@ static int __ext4_get_inode_loc(struct inode *inode,  	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);  	bh = sb_getblk(sb, block); -	if (!bh) { -		EXT4_ERROR_INODE_BLOCK(inode, block, -				       "unable to read itable block"); -		return -EIO; -	} +	if (unlikely(!bh)) +		return -ENOMEM;  	if (!buffer_uptodate(bh)) {  		lock_buffer(bh); @@ -3680,7 +3619,7 @@ static int __ext4_get_inode_loc(struct inode *inode,  			/* Is the inode bitmap in cache? */  			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); -			if (!bitmap_bh) +			if (unlikely(!bitmap_bh))  				goto make_io;  			/* @@ -4305,6 +4244,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)  }  /* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page straddling i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ +	struct page *page; +	unsigned offset; +	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; +	tid_t commit_tid = 0; +	int ret; + +	offset = inode->i_size & (PAGE_CACHE_SIZE - 1); +	/* +	 * All buffers in the last page remain valid? Then there's nothing to +	 * do. 
We do the check mainly to optimize the common PAGE_CACHE_SIZE == +	 * blocksize case +	 */ +	if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) +		return; +	while (1) { +		page = find_lock_page(inode->i_mapping, +				      inode->i_size >> PAGE_CACHE_SHIFT); +		if (!page) +			return; +		ret = __ext4_journalled_invalidatepage(page, offset); +		unlock_page(page); +		page_cache_release(page); +		if (ret != -EBUSY) +			return; +		commit_tid = 0; +		read_lock(&journal->j_state_lock); +		if (journal->j_committing_transaction) +			commit_tid = journal->j_committing_transaction->t_tid; +		read_unlock(&journal->j_state_lock); +		if (commit_tid) +			jbd2_log_wait_commit(journal, commit_tid); +	} +} + +/*   * ext4_setattr()   *   * Called from notify_change. @@ -4347,8 +4327,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  		/* (user+group)*(old+new) structure, inode write (sb,  		 * inode block, ? - but truncate inode update has it) */ -		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ -					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); +		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, +			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + +			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);  		if (IS_ERR(handle)) {  			error = PTR_ERR(handle);  			goto err_out; @@ -4383,7 +4364,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  	    (attr->ia_size < inode->i_size)) {  		handle_t *handle; -		handle = ext4_journal_start(inode, 3); +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);  		if (IS_ERR(handle)) {  			error = PTR_ERR(handle);  			goto err_out; @@ -4403,7 +4384,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  							    attr->ia_size);  			if (error) {  				/* Do as much error cleanup as possible */ -				handle = ext4_journal_start(inode, 3); +				handle = ext4_journal_start(inode, +							    EXT4_HT_INODE, 3);  				if (IS_ERR(handle)) {  					ext4_orphan_del(NULL, inode);  					goto err_out; @@ -4417,16 +4399,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  	}  	if (attr->ia_valid & ATTR_SIZE) { -		if (attr->ia_size != i_size_read(inode)) { -			truncate_setsize(inode, attr->ia_size); -			/* Inode size will be reduced, wait for dio in flight. -			 * Temporarily disable dioread_nolock to prevent -			 * livelock. */ +		if (attr->ia_size != inode->i_size) { +			loff_t oldsize = inode->i_size; + +			i_size_write(inode, attr->ia_size); +			/* +			 * Blocks are going to be removed from the inode. Wait +			 * for dio in flight.  Temporarily disable +			 * dioread_nolock to prevent livelock. +			 */  			if (orphan) { -				ext4_inode_block_unlocked_dio(inode); -				inode_dio_wait(inode); -				ext4_inode_resume_unlocked_dio(inode); +				if (!ext4_should_journal_data(inode)) { +					ext4_inode_block_unlocked_dio(inode); +					inode_dio_wait(inode); +					ext4_inode_resume_unlocked_dio(inode); +				} else +					ext4_wait_for_tail_page_commit(inode);  			} +			/* +			 * Truncate pagecache after we've waited for commit +			 * in data=journal mode to make pages freeable. 
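The retry loop in ext4_wait_for_tail_page_commit() above uses a standard jbd2 idiom: snapshot the tid of the currently committing transaction under j_state_lock, drop the lock, then sleep on that tid. Isolated as a sketch, with a hypothetical helper name:

#include <linux/jbd2.h>

/* Wait for whatever transaction is committing right now, if any. */
static void demo_wait_for_running_commit(journal_t *journal)
{
	tid_t tid = 0;

	read_lock(&journal->j_state_lock);
	if (journal->j_committing_transaction)
		tid = journal->j_committing_transaction->t_tid;
	read_unlock(&journal->j_state_lock);
	if (tid)
		jbd2_log_wait_commit(journal, tid);
}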
+			 */ +			truncate_pagecache(inode, oldsize, inode->i_size);  		}  		ext4_truncate(inode);  	} @@ -4732,7 +4726,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)  {  	handle_t *handle; -	handle = ext4_journal_start(inode, 2); +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);  	if (IS_ERR(handle))  		goto out; @@ -4833,7 +4827,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)  	/* Finally we can mark the inode as dirty. */ -	handle = ext4_journal_start(inode, 1); +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -4857,7 +4851,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	unsigned long len;  	int ret;  	struct file *file = vma->vm_file; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct address_space *mapping = inode->i_mapping;  	handle_t *handle;  	get_block_t *get_block; @@ -4899,7 +4893,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  					    0, len, NULL,  					    ext4_bh_unmapped)) {  			/* Wait so that we don't change page under IO */ -			wait_on_page_writeback(page); +			wait_for_stable_page(page);  			ret = VM_FAULT_LOCKED;  			goto out;  		} @@ -4911,7 +4905,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	else  		get_block = ext4_get_block;  retry_alloc: -	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, +				    ext4_writepage_trans_blocks(inode));  	if (IS_ERR(handle)) {  		ret = VM_FAULT_SIGBUS;  		goto out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5747f52f7c7..721f4d33e14 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -22,7 +22,7 @@  long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	struct ext4_inode_info *ei = EXT4_I(inode);  	unsigned int flags; @@ -104,7 +104,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		} else if (oldflags & EXT4_EOFBLOCKS_FL)  			ext4_truncate(inode); -		handle = ext4_journal_start(inode, 1); +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  		if (IS_ERR(handle)) {  			err = PTR_ERR(handle);  			goto flags_out; @@ -173,7 +173,7 @@ flags_out:  		}  		mutex_lock(&inode->i_mutex); -		handle = ext4_journal_start(inode, 1); +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  		if (IS_ERR(handle)) {  			err = PTR_ERR(handle);  			goto unlock_out; @@ -313,6 +313,9 @@ mext_out:  		if (err == 0)  			err = err2;  		mnt_drop_write_file(filp); +		if (!err && ext4_has_group_desc_csum(sb) && +		    test_opt(sb, INIT_INODE_TABLE)) +			err = ext4_register_li_request(sb, input.group);  group_add_out:  		ext4_resize_end(sb);  		return err; @@ -358,6 +361,7 @@ group_add_out:  		ext4_fsblk_t n_blocks_count;  		struct super_block *sb = inode->i_sb;  		int err = 0, err2 = 0; +		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;  		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,  			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { @@ -388,6 +392,11 @@ group_add_out:  		if (err == 0)  			err = err2;  		mnt_drop_write_file(filp); +		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && +		    ext4_has_group_desc_csum(sb) && +		    test_opt(sb, INIT_INODE_TABLE)) +			err = ext4_register_li_request(sb, o_group); +  resizefs_out:  		ext4_resize_end(sb);  		
return err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1bf6fe785c4..7bb713a46fe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -23,11 +23,18 @@  #include "ext4_jbd2.h"  #include "mballoc.h" -#include <linux/debugfs.h>  #include <linux/log2.h> +#include <linux/module.h>  #include <linux/slab.h>  #include <trace/events/ext4.h> +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif +  /*   * MUSTDO:   *   - test ext4_ext_search_left() and ext4_ext_search_right() @@ -1884,15 +1891,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,  	case 0:  		BUG_ON(ac->ac_2order == 0); -		if (grp->bb_largest_free_order < ac->ac_2order) -			return 0; -  		/* Avoid using the first bg of a flexgroup for data files */  		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&  		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&  		    ((group % flex_size) == 0))  			return 0; +		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || +		    (free / fragments) >= ac->ac_g_ex.fe_len) +			return 1; + +		if (grp->bb_largest_free_order < ac->ac_2order) +			return 0; +  		return 1;  	case 1:  		if ((free / fragments) >= ac->ac_g_ex.fe_len) @@ -2007,7 +2018,7 @@ repeat:  			}  			ac->ac_groups_scanned++; -			if (cr == 0) +			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)  				ext4_mb_simple_scan_group(ac, &e4b);  			else if (cr == 1 && sbi->s_stripe &&  					!(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -2656,40 +2667,6 @@ static void ext4_free_data_callback(struct super_block *sb,  	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);  } -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; - -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; - -static void __init ext4_create_debugfs_entry(void) -{ -	debugfs_dir = debugfs_create_dir("ext4", NULL); -	if (debugfs_dir) -		debugfs_debug = debugfs_create_u8("mballoc-debug", -						  S_IRUGO | S_IWUSR, -						  debugfs_dir, -						  &mb_enable_debug); -} - -static void ext4_remove_debugfs_entry(void) -{ -	debugfs_remove(debugfs_debug); -	debugfs_remove(debugfs_dir); -} - -#else - -static void __init ext4_create_debugfs_entry(void) -{ -} - -static void ext4_remove_debugfs_entry(void) -{ -} - -#endif -  int __init ext4_init_mballoc(void)  {  	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -2711,7 +2688,6 @@ int __init ext4_init_mballoc(void)  		kmem_cache_destroy(ext4_ac_cachep);  		return -ENOMEM;  	} -	ext4_create_debugfs_entry();  	return 0;  } @@ -2726,7 +2702,6 @@ void ext4_exit_mballoc(void)  	kmem_cache_destroy(ext4_ac_cachep);  	kmem_cache_destroy(ext4_free_data_cachep);  	ext4_groupinfo_destroy_slabs(); -	ext4_remove_debugfs_entry();  } @@ -3444,7 +3419,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)  			win = offs;  		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - -			EXT4_B2C(sbi, win); +			EXT4_NUM_B2C(sbi, win);  		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);  		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);  	} @@ -3872,7 +3847,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)  	struct super_block *sb = ac->ac_sb;  	ext4_group_t ngroups, i; -	if (!mb_enable_debug || +	if (!ext4_mballoc_debug ||  	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))  		return; @@ -4005,8 +3980,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,  	len = ar->len;  
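The mballoc.c hunk above trades the old debugfs u8 knob for a module parameter. For reference, module_param_named() exposes a variable under /sys/module/<module>/parameters/<name> and also accepts a value at load time. A minimal stand-alone sketch (hypothetical demo module, not ext4 code):

#include <linux/module.h>
#include <linux/moduleparam.h>

static ushort demo_debug_level;	/* plays the role of ext4_mballoc_debug */

/*
 * Mode 0644: shows up as /sys/module/<module>/parameters/debug,
 * world-readable and root-writable; also settable at load time,
 * e.g. "modprobe demo debug=2".
 */
module_param_named(debug, demo_debug_level, ushort, 0644);
MODULE_PARM_DESC(debug, "Debugging level for the demo module");

MODULE_LICENSE("GPL");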
	/* just a dirty hack to filter too big requests  */ -	if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) -		len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; +	if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) +		len = EXT4_CLUSTERS_PER_GROUP(sb);  	/* start searching from the goal */  	goal = ar->goal; @@ -4136,7 +4111,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)  		/* The max size of hash table is PREALLOC_TB_SIZE */  		order = PREALLOC_TB_SIZE - 1;  	/* Add the prealloc space to lg */ -	rcu_read_lock(); +	spin_lock(&lg->lg_prealloc_lock);  	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],  						pa_inode_list) {  		spin_lock(&tmp_pa->pa_lock); @@ -4160,12 +4135,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)  	if (!added)  		list_add_tail_rcu(&pa->pa_inode_list,  					&lg->lg_prealloc_list[order]); -	rcu_read_unlock(); +	spin_unlock(&lg->lg_prealloc_lock);  	/* Now trim the list to be not more than 8 elements */  	if (lg_prealloc_count > 8) {  		ext4_mb_discard_lg_preallocations(sb, lg, -						order, lg_prealloc_count); +						  order, lg_prealloc_count);  		return;  	}  	return ; @@ -4590,7 +4565,7 @@ do_more:  			EXT4_BLOCKS_PER_GROUP(sb);  		count -= overflow;  	} -	count_clusters = EXT4_B2C(sbi, count); +	count_clusters = EXT4_NUM_B2C(sbi, count);  	bitmap_bh = ext4_read_block_bitmap(sb, block_group);  	if (!bitmap_bh) {  		err = -EIO; @@ -4832,11 +4807,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,  	ext4_group_desc_csum_set(sb, block_group, desc);  	ext4_unlock_group(sb, block_group);  	percpu_counter_add(&sbi->s_freeclusters_counter, -			   EXT4_B2C(sbi, blocks_freed)); +			   EXT4_NUM_B2C(sbi, blocks_freed));  	if (sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group = ext4_flex_group(sbi, block_group); -		atomic_add(EXT4_B2C(sbi, blocks_freed), +		atomic_add(EXT4_NUM_B2C(sbi, blocks_freed),  			   &sbi->s_flex_groups[flex_group].free_clusters);  	} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ccd889ba95..08481ee84cd 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,11 +37,11 @@  /*   */  #ifdef CONFIG_EXT4_DEBUG -extern u8 mb_enable_debug; +extern ushort ext4_mballoc_debug;  #define mb_debug(n, fmt, a...)	                                        \  	do {								\ -		if ((n) <= mb_enable_debug) {		        	\ +		if ((n) <= ext4_mballoc_debug) {		        \  			printk(KERN_DEBUG "(%s, %d): %s: ",		\  			       __FILE__, __LINE__, __func__);		\  			printk(fmt, ## a);				\ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index db8226d595f..480acf4a085 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -456,11 +456,14 @@ int ext4_ext_migrate(struct inode *inode)  		 */  		return retval; -	handle = ext4_journal_start(inode, -					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -					EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) -					+ 1); +	/* +	 * Worst case we can touch the allocation bitmaps, a bgd +	 * block, and a block to link in the orphan list.  We do need +	 * to worry about credits for modifying the quota inode. 
+	 */ +	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, +		4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); +  	if (IS_ERR(handle)) {  		retval = PTR_ERR(handle);  		return retval; @@ -507,7 +510,7 @@ int ext4_ext_migrate(struct inode *inode)  	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);  	up_read((&EXT4_I(inode)->i_data_sem)); -	handle = ext4_journal_start(inode, 1); +	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);  	if (IS_ERR(handle)) {  		/*  		 * It is impossible to update on-disk structures without diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index fe7c63f4717..f9b551561d2 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,  	 * is not blocked in the elevator. */  	if (!*bh)  		*bh = sb_getblk(sb, mmp_block); +	if (!*bh) +		return -ENOMEM;  	if (*bh) {  		get_bh(*bh);  		lock_buffer(*bh); @@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,  			*bh = NULL;  		}  	} -	if (!*bh) { +	if (unlikely(!*bh)) {  		ext4_warning(sb, "Error while reading MMP block %llu",  			     mmp_block);  		return -EIO; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d9cc5ee42f5..4e81d47aa8c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -681,6 +681,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,  	depth = ext_depth(donor_inode);  	dext = donor_path[depth].p_ext; +	if (unlikely(!dext)) +		goto missing_donor_extent;  	tmp_dext = *dext;  	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, @@ -691,7 +693,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,  	/* Loop for the donor extents */  	while (1) {  		/* The extent for donor must be found. 
*/ -		if (!dext) { +		if (unlikely(!dext)) { +		missing_donor_extent:  			EXT4_ERROR_INODE(donor_inode,  				   "The extent for donor must be found");  			*err = -EIO; @@ -761,9 +764,6 @@ out:  		kfree(donor_path);  	} -	ext4_ext_invalidate_cache(orig_inode); -	ext4_ext_invalidate_cache(donor_inode); -  	return replaced_count;  } @@ -900,7 +900,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,  		  pgoff_t orig_page_offset, int data_offset_in_page,  		  int block_len_in_page, int uninit, int *err)  { -	struct inode *orig_inode = o_filp->f_dentry->d_inode; +	struct inode *orig_inode = file_inode(o_filp);  	struct page *pagep[2] = {NULL, NULL};  	handle_t *handle;  	ext4_lblk_t orig_blk_offset; @@ -920,7 +920,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,  again:  	*err = 0;  	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; -	handle = ext4_journal_start(orig_inode, jblocks); +	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);  	if (IS_ERR(handle)) {  		*err = PTR_ERR(handle);  		return 0; @@ -1279,8 +1279,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,  		 __u64 orig_start, __u64 donor_start, __u64 len,  		 __u64 *moved_len)  { -	struct inode *orig_inode = o_filp->f_dentry->d_inode; -	struct inode *donor_inode = d_filp->f_dentry->d_inode; +	struct inode *orig_inode = file_inode(o_filp); +	struct inode *donor_inode = file_inode(d_filp);  	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;  	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;  	ext4_lblk_t block_start = orig_start; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cac44828233..3825d6aa833 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -47,38 +47,111 @@  #define NAMEI_RA_CHUNKS  2  #define NAMEI_RA_BLOCKS  4  #define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))  static struct buffer_head *ext4_append(handle_t *handle,  					struct inode *inode, -					ext4_lblk_t *block, int *err) +					ext4_lblk_t *block)  {  	struct buffer_head *bh; +	int err = 0;  	if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&  		     ((inode->i_size >> 10) >= -		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { -		*err = -ENOSPC; -		return NULL; -	} +		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) +		return ERR_PTR(-ENOSPC);  	*block = inode->i_size >> inode->i_sb->s_blocksize_bits; -	bh = ext4_bread(handle, inode, *block, 1, err); -	if (bh) { -		inode->i_size += inode->i_sb->s_blocksize; -		EXT4_I(inode)->i_disksize = inode->i_size; -		*err = ext4_journal_get_write_access(handle, bh); -		if (*err) { +	bh = ext4_bread(handle, inode, *block, 1, &err); +	if (!bh) +		return ERR_PTR(err); +	inode->i_size += inode->i_sb->s_blocksize; +	EXT4_I(inode)->i_disksize = inode->i_size; +	err = ext4_journal_get_write_access(handle, bh); +	if (err) { +		brelse(bh); +		ext4_std_error(inode->i_sb, err); +		return ERR_PTR(err); +	} +	return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, +			       struct ext4_dir_entry *dirent); + +typedef enum { +	EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ +	__ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, +					      ext4_lblk_t block, +					      dirblock_type_t type, +					      unsigned int line) +{ +	struct buffer_head *bh; +	struct ext4_dir_entry *dirent; +	int err = 0, is_dx_block = 0; + +	
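Both the reworked ext4_append() above and the new __ext4_read_dirblock() below adopt the kernel's ERR_PTR convention in place of the old NULL-plus-int-out-parameter style, so callers collapse to a single IS_ERR() test. The convention in miniature (hypothetical helper):

#include <linux/err.h>
#include <linux/buffer_head.h>

/* Encode the errno in the returned pointer rather than in *err. */
static struct buffer_head *demo_getblk(struct super_block *sb, sector_t blk)
{
	struct buffer_head *bh = sb_getblk(sb, blk);

	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);
	return bh;
}

A caller then writes: bh = demo_getblk(sb, blk); if (IS_ERR(bh)) return PTR_ERR(bh); which is exactly the shape the namei.c call sites below take.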
bh = ext4_bread(NULL, inode, block, 0, &err); +	if (!bh) { +		if (err == 0) { +			ext4_error_inode(inode, __func__, line, block, +					       "Directory hole found"); +			return ERR_PTR(-EIO); +		} +		__ext4_warning(inode->i_sb, __func__, line, +			       "error reading directory block " +			       "(ino %lu, block %lu)", inode->i_ino, +			       (unsigned long) block); +		return ERR_PTR(err); +	} +	dirent = (struct ext4_dir_entry *) bh->b_data; +	/* Determine whether or not we have an index block */ +	if (is_dx(inode)) { +		if (block == 0) +			is_dx_block = 1; +		else if (ext4_rec_len_from_disk(dirent->rec_len, +						inode->i_sb->s_blocksize) == +			 inode->i_sb->s_blocksize) +			is_dx_block = 1; +	} +	if (!is_dx_block && type == INDEX) { +		ext4_error_inode(inode, __func__, line, block, +		       "directory leaf block found instead of index block"); +		return ERR_PTR(-EIO); +	} +	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, +					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || +	    buffer_verified(bh)) +		return bh; + +	/* +	 * An empty leaf block can get mistaken for an index block; for +	 * this reason, we can only check the index checksum when the +	 * caller is sure it should be an index block. +	 */ +	if (is_dx_block && type == INDEX) { +		if (ext4_dx_csum_verify(inode, dirent)) +			set_buffer_verified(bh); +		else { +			ext4_error_inode(inode, __func__, line, block, +				"Directory index failed checksum");  			brelse(bh); -			bh = NULL; +			return ERR_PTR(-EIO);  		}  	} -	if (!bh && !(*err)) { -		*err = -EIO; -		ext4_error(inode->i_sb, -			   "Directory hole detected on inode %lu\n", -			   inode->i_ino); +	if (!is_dx_block) { +		if (ext4_dirent_csum_verify(inode, dirent)) +			set_buffer_verified(bh); +		else { +			ext4_error_inode(inode, __func__, line, block, +				"Directory block failed checksum"); +			brelse(bh); +			return ERR_PTR(-EIO); +		}  	}  	return bh;  } @@ -604,9 +677,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir,  	u32 hash;  	frame->bh = NULL; -	if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { -		if (*err == 0) -			*err = ERR_BAD_DX_DIR; +	bh = ext4_read_dirblock(dir, 0, INDEX); +	if (IS_ERR(bh)) { +		*err = PTR_ERR(bh);  		goto fail;  	}  	root = (struct dx_root *) bh->b_data; @@ -643,15 +716,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir,  		goto fail;  	} -	if (!buffer_verified(bh) && -	    !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { -		ext4_warning(dir->i_sb, "Root failed checksum"); -		brelse(bh); -		*err = ERR_BAD_DX_DIR; -		goto fail; -	} -	set_buffer_verified(bh); -  	entries = (struct dx_entry *) (((char *)&root->info) +  				       root->info.info_length); @@ -709,22 +773,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,  		frame->entries = entries;  		frame->at = at;  		if (!indirect--) return frame; -		if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { -			if (!(*err)) -				*err = ERR_BAD_DX_DIR; +		bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); +		if (IS_ERR(bh)) { +			*err = PTR_ERR(bh);  			goto fail2;  		} -		at = entries = ((struct dx_node *) bh->b_data)->entries; - -		if (!buffer_verified(bh) && -		    !ext4_dx_csum_verify(dir, -					 (struct ext4_dir_entry *)bh->b_data)) { -			ext4_warning(dir->i_sb, "Node failed checksum"); -			brelse(bh); -			*err = ERR_BAD_DX_DIR; -			goto fail; -		} -		set_buffer_verified(bh); +		entries = ((struct dx_node *) bh->b_data)->entries;  		if (dx_get_limit(entries) != dx_node_limit (dir)) {  			ext4_warning(dir->i_sb, @@ -783,7 +837,7 @@ static 
int ext4_htree_next_block(struct inode *dir, __u32 hash,  {  	struct dx_frame *p;  	struct buffer_head *bh; -	int err, num_frames = 0; +	int num_frames = 0;  	__u32 bhash;  	p = frame; @@ -822,25 +876,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,  	 * block so no check is necessary  	 */  	while (num_frames--) { -		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), -				      0, &err))) { -			if (!err) { -				ext4_error(dir->i_sb, -					   "Directory hole detected on inode %lu\n", -					   dir->i_ino); -				return -EIO; -			} -			return err; /* Failure */ -		} - -		if (!buffer_verified(bh) && -		    !ext4_dx_csum_verify(dir, -					 (struct ext4_dir_entry *)bh->b_data)) { -			ext4_warning(dir->i_sb, "Node failed checksum"); -			return -EIO; -		} -		set_buffer_verified(bh); - +		bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); +		if (IS_ERR(bh)) +			return PTR_ERR(bh);  		p++;  		brelse(p->bh);  		p->bh = bh; @@ -866,20 +904,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,  	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",  							(unsigned long)block)); -	if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { -		if (!err) { -			err = -EIO; -			ext4_error(dir->i_sb, -				   "Directory hole detected on inode %lu\n", -				   dir->i_ino); -		} -		return err; -	} - -	if (!buffer_verified(bh) && -	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) -		return -EIO; -	set_buffer_verified(bh); +	bh = ext4_read_dirblock(dir, block, DIRENT); +	if (IS_ERR(bh)) +		return PTR_ERR(bh);  	de = (struct ext4_dir_entry_2 *) bh->b_data;  	top = (struct ext4_dir_entry_2 *) ((char *) de + @@ -937,7 +964,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,  	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",  		       start_hash, start_minor_hash)); -	dir = dir_file->f_path.dentry->d_inode; +	dir = file_inode(dir_file);  	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {  		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;  		if (hinfo.hash_version <= DX_HASH_TEA) @@ -1333,26 +1360,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q  		return NULL;  	do {  		block = dx_get_block(frame->at); -		if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { -			if (!(*err)) { -				*err = -EIO; -				ext4_error(dir->i_sb, -					   "Directory hole detected on inode %lu\n", -					   dir->i_ino); -			} +		bh = ext4_read_dirblock(dir, block, DIRENT); +		if (IS_ERR(bh)) { +			*err = PTR_ERR(bh);  			goto errout;  		} - -		if (!buffer_verified(bh) && -		    !ext4_dirent_csum_verify(dir, -				(struct ext4_dir_entry *)bh->b_data)) { -			EXT4_ERROR_INODE(dir, "checksumming directory " -					 "block %lu", (unsigned long)block); -			brelse(bh); -			*err = -EIO; -			goto errout; -		} -		set_buffer_verified(bh);  		retval = search_dirblock(bh, dir, d_name,  					 block << EXT4_BLOCK_SIZE_BITS(sb),  					 res_dir); @@ -1536,11 +1548,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,  				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))  		csum_size = sizeof(struct ext4_dir_entry_tail); -	bh2 = ext4_append (handle, dir, &newblock, &err); -	if (!(bh2)) { +	bh2 = ext4_append(handle, dir, &newblock); +	if (IS_ERR(bh2)) {  		brelse(*bh);  		*bh = NULL; -		goto errout; +		*error = PTR_ERR(bh2); +		return NULL;  	}  	BUFFER_TRACE(*bh, "get_write_access"); @@ -1621,7 +1634,6 @@ journal_error:  	brelse(bh2);  	*bh = NULL;  	ext4_std_error(dir->i_sb, 
err); -errout:  	*error = err;  	return NULL;  } @@ -1699,7 +1711,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,  	const char	*name = dentry->d_name.name;  	int		namelen = dentry->d_name.len;  	unsigned int	blocksize = dir->i_sb->s_blocksize; -	unsigned short	reclen;  	int		csum_size = 0;  	int		err; @@ -1707,7 +1718,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,  				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))  		csum_size = sizeof(struct ext4_dir_entry_tail); -	reclen = EXT4_DIR_REC_LEN(namelen);  	if (!de) {  		err = ext4_find_dest_de(dir, inode,  					bh, bh->b_data, blocksize - csum_size, @@ -1798,10 +1808,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,  	len = ((char *) root) + (blocksize - csum_size) - (char *) de;  	/* Allocate new block for the 0th block's dirents */ -	bh2 = ext4_append(handle, dir, &block, &retval); -	if (!(bh2)) { +	bh2 = ext4_append(handle, dir, &block); +	if (IS_ERR(bh2)) {  		brelse(bh); -		return retval; +		return PTR_ERR(bh2);  	}  	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);  	data1 = bh2->b_data; @@ -1918,20 +1928,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  	}  	blocks = dir->i_size >> sb->s_blocksize_bits;  	for (block = 0; block < blocks; block++) { -		if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { -			if (!retval) { -				retval = -EIO; -				ext4_error(inode->i_sb, -					   "Directory hole detected on inode %lu\n", -					   inode->i_ino); -			} -			return retval; -		} -		if (!buffer_verified(bh) && -		    !ext4_dirent_csum_verify(dir, -				(struct ext4_dir_entry *)bh->b_data)) -			return -EIO; -		set_buffer_verified(bh); +		bh = ext4_read_dirblock(dir, block, DIRENT); +		if (IS_ERR(bh)) +			return PTR_ERR(bh); +  		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);  		if (retval != -ENOSPC) {  			brelse(bh); @@ -1943,9 +1943,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  			return make_indexed_dir(handle, dentry, inode, bh);  		brelse(bh);  	} -	bh = ext4_append(handle, dir, &block, &retval); -	if (!bh) -		return retval; +	bh = ext4_append(handle, dir, &block); +	if (IS_ERR(bh)) +		return PTR_ERR(bh);  	de = (struct ext4_dir_entry_2 *) bh->b_data;  	de->inode = 0;  	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -1982,22 +1982,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,  		return err;  	entries = frame->entries;  	at = frame->at; - -	if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { -		if (!err) { -			err = -EIO; -			ext4_error(dir->i_sb, -				   "Directory hole detected on inode %lu\n", -				   dir->i_ino); -		} +	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); +	if (IS_ERR(bh)) { +		err = PTR_ERR(bh); +		bh = NULL;  		goto cleanup;  	} -	if (!buffer_verified(bh) && -	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) -		goto journal_error; -	set_buffer_verified(bh); -  	BUFFER_TRACE(bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, bh);  	if (err) @@ -2025,9 +2016,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,  			err = -ENOSPC;  			goto cleanup;  		} -		bh2 = ext4_append (handle, dir, &newblock, &err); -		if (!(bh2)) +		bh2 = ext4_append(handle, dir, &newblock); +		if (IS_ERR(bh2)) { +			err = PTR_ERR(bh2);  			goto cleanup; +		}  		node2 = (struct dx_node *)(bh2->b_data);  		entries2 = node2->entries;  		memset(&node2->fake, 
0, sizeof(struct fake_dirent)); @@ -2106,8 +2099,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,  journal_error:  	ext4_std_error(dir->i_sb, err);  cleanup: -	if (bh) -		brelse(bh); +	brelse(bh);  	dx_release(frames);  	return err;  } @@ -2254,29 +2246,28 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,  {  	handle_t *handle;  	struct inode *inode; -	int err, retries = 0; +	int err, credits, retries = 0;  	dquot_initialize(dir); +	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));  retry: -	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); - -	if (IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); - -	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); +	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, +					    NULL, EXT4_HT_DIR, credits); +	handle = ext4_journal_current_handle();  	err = PTR_ERR(inode);  	if (!IS_ERR(inode)) {  		inode->i_op = &ext4_file_inode_operations;  		inode->i_fop = &ext4_file_operations;  		ext4_set_aops(inode);  		err = ext4_add_nondir(handle, dentry, inode); +		if (!err && IS_DIRSYNC(dir)) +			ext4_handle_sync(handle);  	} -	ext4_journal_stop(handle); +	if (handle) +		ext4_journal_stop(handle);  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))  		goto retry;  	return err; @@ -2287,31 +2278,30 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,  {  	handle_t *handle;  	struct inode *inode; -	int err, retries = 0; +	int err, credits, retries = 0;  	if (!new_valid_dev(rdev))  		return -EINVAL;  	dquot_initialize(dir); +	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));  retry: -	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); - -	if (IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); - -	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); +	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, +					    NULL, EXT4_HT_DIR, credits); +	handle = ext4_journal_current_handle();  	err = PTR_ERR(inode);  	if (!IS_ERR(inode)) {  		init_special_inode(inode, inode->i_mode, rdev);  		inode->i_op = &ext4_special_inode_operations;  		err = ext4_add_nondir(handle, dentry, inode); +		if (!err && IS_DIRSYNC(dir)) +			ext4_handle_sync(handle);  	} -	ext4_journal_stop(handle); +	if (handle) +		ext4_journal_stop(handle);  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))  		goto retry;  	return err; @@ -2351,6 +2341,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,  	struct buffer_head *dir_block = NULL;  	struct ext4_dir_entry_2 *de;  	struct ext4_dir_entry_tail *t; +	ext4_lblk_t block = 0;  	unsigned int blocksize = dir->i_sb->s_blocksize;  	int csum_size = 0;  	int err; @@ -2367,17 +2358,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,  			goto out;  	} -	inode->i_size = EXT4_I(inode)->i_disksize = blocksize; -	dir_block = ext4_bread(handle, inode, 0, 1, &err); -	if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { -		if (!err) { -			err = -EIO; -			ext4_error(inode->i_sb, -				   
"Directory hole detected on inode %lu\n", -				   inode->i_ino); -		} -		goto out; -	} +	inode->i_size = 0; +	dir_block = ext4_append(handle, inode, &block); +	if (IS_ERR(dir_block)) +		return PTR_ERR(dir_block);  	BUFFER_TRACE(dir_block, "get_write_access");  	err = ext4_journal_get_write_access(handle, dir_block);  	if (err) @@ -2404,25 +2388,21 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	handle_t *handle;  	struct inode *inode; -	int err, retries = 0; +	int err, credits, retries = 0;  	if (EXT4_DIR_LINK_MAX(dir))  		return -EMLINK;  	dquot_initialize(dir); +	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));  retry: -	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); - -	if (IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); - -	inode = ext4_new_inode(handle, dir, S_IFDIR | mode, -			       &dentry->d_name, 0, NULL); +	inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, +					    &dentry->d_name, +					    0, NULL, EXT4_HT_DIR, credits); +	handle = ext4_journal_current_handle();  	err = PTR_ERR(inode);  	if (IS_ERR(inode))  		goto out_stop; @@ -2450,8 +2430,12 @@ out_clear_inode:  		goto out_clear_inode;  	unlock_new_inode(inode);  	d_instantiate(dentry, inode); +	if (IS_DIRSYNC(dir)) +		ext4_handle_sync(handle); +  out_stop: -	ext4_journal_stop(handle); +	if (handle) +		ext4_journal_stop(handle);  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))  		goto retry;  	return err; @@ -2477,25 +2461,14 @@ static int empty_dir(struct inode *inode)  	}  	sb = inode->i_sb; -	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || -	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { -		if (err) -			EXT4_ERROR_INODE(inode, -				"error %d reading directory lblock 0", err); -		else -			ext4_warning(inode->i_sb, -				     "bad directory (dir #%lu) - no data block", -				     inode->i_ino); +	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { +		EXT4_ERROR_INODE(inode, "invalid size");  		return 1;  	} -	if (!buffer_verified(bh) && -	    !ext4_dirent_csum_verify(inode, -			(struct ext4_dir_entry *)bh->b_data)) { -		EXT4_ERROR_INODE(inode, "checksum error reading directory " -				 "lblock 0"); -		return -EIO; -	} -	set_buffer_verified(bh); +	bh = ext4_read_dirblock(inode, 0, EITHER); +	if (IS_ERR(bh)) +		return 1; +  	de = (struct ext4_dir_entry_2 *) bh->b_data;  	de1 = ext4_next_entry(de, sb->s_blocksize);  	if (le32_to_cpu(de->inode) != inode->i_ino || @@ -2518,28 +2491,9 @@ static int empty_dir(struct inode *inode)  			err = 0;  			brelse(bh);  			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); -			bh = ext4_bread(NULL, inode, lblock, 0, &err); -			if (!bh) { -				if (err) -					EXT4_ERROR_INODE(inode, -						"error %d reading directory " -						"lblock %u", err, lblock); -				else -					ext4_warning(inode->i_sb, -						"bad directory (dir #%lu) - no data block", -						inode->i_ino); - -				offset += sb->s_blocksize; -				continue; -			} -			if (!buffer_verified(bh) && -			    !ext4_dirent_csum_verify(inode, -					(struct ext4_dir_entry *)bh->b_data)) { -				EXT4_ERROR_INODE(inode, "checksum error " -						 "reading directory lblock 0"); -				return -EIO; -			} -			set_buffer_verified(bh); +			bh = ext4_read_dirblock(inode, lblock, EITHER); +			if (IS_ERR(bh)) +				return 1;  			de = 
(struct ext4_dir_entry_2 *) bh->b_data;  		}  		if (ext4_check_dir_entry(inode, NULL, de, bh, @@ -2648,7 +2602,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)  	struct ext4_iloc iloc;  	int err = 0; -	if (!EXT4_SB(inode->i_sb)->s_journal) +	if ((!EXT4_SB(inode->i_sb)->s_journal) && +	    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))  		return 0;  	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); @@ -2717,25 +2672,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)  	struct inode *inode;  	struct buffer_head *bh;  	struct ext4_dir_entry_2 *de; -	handle_t *handle; +	handle_t *handle = NULL;  	/* Initialize quotas before so that eventual writes go in  	 * separate transaction */  	dquot_initialize(dir);  	dquot_initialize(dentry->d_inode); -	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); -  	retval = -ENOENT;  	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);  	if (!bh)  		goto end_rmdir; -	if (IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); -  	inode = dentry->d_inode;  	retval = -EIO; @@ -2746,6 +2694,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)  	if (!empty_dir(inode))  		goto end_rmdir; +	handle = ext4_journal_start(dir, EXT4_HT_DIR, +				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); +	if (IS_ERR(handle)) { +		retval = PTR_ERR(handle); +		handle = NULL; +		goto end_rmdir; +	} + +	if (IS_DIRSYNC(dir)) +		ext4_handle_sync(handle); +  	retval = ext4_delete_entry(handle, dir, de, bh);  	if (retval)  		goto end_rmdir; @@ -2767,8 +2726,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)  	ext4_mark_inode_dirty(handle, dir);  end_rmdir: -	ext4_journal_stop(handle);  	brelse(bh); +	if (handle) +		ext4_journal_stop(handle);  	return retval;  } @@ -2778,7 +2738,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	struct inode *inode;  	struct buffer_head *bh;  	struct ext4_dir_entry_2 *de; -	handle_t *handle; +	handle_t *handle = NULL;  	trace_ext4_unlink_enter(dir, dentry);  	/* Initialize quotas before so that eventual writes go @@ -2786,13 +2746,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	dquot_initialize(dir);  	dquot_initialize(dentry->d_inode); -	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); - -	if (IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); -  	retval = -ENOENT;  	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);  	if (!bh) @@ -2804,6 +2757,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	if (le32_to_cpu(de->inode) != inode->i_ino)  		goto end_unlink; +	handle = ext4_journal_start(dir, EXT4_HT_DIR, +				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); +	if (IS_ERR(handle)) { +		retval = PTR_ERR(handle); +		handle = NULL; +		goto end_unlink; +	} + +	if (IS_DIRSYNC(dir)) +		ext4_handle_sync(handle); +  	if (!inode->i_nlink) {  		ext4_warning(inode->i_sb,  			     "Deleting nonexistent file (%lu), %d", @@ -2824,8 +2788,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	retval = 0;  end_unlink: -	ext4_journal_stop(handle);  	brelse(bh); +	if (handle) +		ext4_journal_stop(handle);  	trace_ext4_unlink_exit(dentry, retval);  	return retval;  } @@ -2865,15 +2830,10 @@ static int ext4_symlink(struct inode *dir,  			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);  	}  retry: -	handle = ext4_journal_start(dir, credits); -	if (IS_ERR(handle)) -		return PTR_ERR(handle); - -	if 
(IS_DIRSYNC(dir)) -		ext4_handle_sync(handle); - -	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, -			       &dentry->d_name, 0, NULL); +	inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, +					    &dentry->d_name, 0, NULL, +					    EXT4_HT_DIR, credits); +	handle = ext4_journal_current_handle();  	err = PTR_ERR(inode);  	if (IS_ERR(inode))  		goto out_stop; @@ -2903,7 +2863,7 @@ retry:  		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS  		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified  		 */ -		handle = ext4_journal_start(dir, +		handle = ext4_journal_start(dir, EXT4_HT_DIR,  				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +  				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);  		if (IS_ERR(handle)) { @@ -2926,8 +2886,12 @@ retry:  	}  	EXT4_I(inode)->i_disksize = inode->i_size;  	err = ext4_add_nondir(handle, dentry, inode); +	if (!err && IS_DIRSYNC(dir)) +		ext4_handle_sync(handle); +  out_stop: -	ext4_journal_stop(handle); +	if (handle) +		ext4_journal_stop(handle);  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))  		goto retry;  	return err; @@ -2950,8 +2914,9 @@ static int ext4_link(struct dentry *old_dentry,  	dquot_initialize(dir);  retry: -	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS); +	handle = ext4_journal_start(dir, EXT4_HT_DIR, +		(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +		 EXT4_INDEX_EXTRA_TRANS_BLOCKS));  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -2991,13 +2956,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,  	struct buffer_head *bh;  	if (!ext4_has_inline_data(inode)) { -		if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { -			if (!*retval) { -				*retval = -EIO; -				ext4_error(inode->i_sb, -					   "Directory hole detected on inode %lu\n", -					   inode->i_ino); -			} +		bh = ext4_read_dirblock(inode, 0, EITHER); +		if (IS_ERR(bh)) { +			*retval = PTR_ERR(bh);  			return NULL;  		}  		*parent_de = ext4_next_entry( @@ -3034,9 +2995,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,  	 * in separate transaction */  	if (new_dentry->d_inode)  		dquot_initialize(new_dentry->d_inode); -	handle = ext4_journal_start(old_dir, 2 * -					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + -					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); +	handle = ext4_journal_start(old_dir, EXT4_HT_DIR, +		(2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + +		 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -3076,11 +3037,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,  						  &inlined);  		if (!dir_bh)  			goto end_rename; -		if (!inlined && !buffer_verified(dir_bh) && -		    !ext4_dirent_csum_verify(old_inode, -				(struct ext4_dir_entry *)dir_bh->b_data)) -			goto end_rename; -		set_buffer_verified(dir_bh);  		if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)  			goto end_rename;  		retval = -EMLINK; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0016fbca2a4..809b31003ec 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -23,6 +23,7 @@  #include <linux/workqueue.h>  #include <linux/kernel.h>  #include <linux/slab.h> +#include <linux/mm.h>  #include "ext4_jbd2.h"  #include "xattr.h" @@ -73,8 +74,6 @@ void ext4_free_io_end(ext4_io_end_t *io)  	BUG_ON(!list_empty(&io->list));  	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); -	if (io->page) -		put_page(io->page);  	for (i = 0; i < io->num_io_pages; i++)  		put_io_page(io->pages[i]);  	io->num_io_pages = 0; @@ -103,14 
+102,13 @@ static int ext4_end_io(ext4_io_end_t *io)  			 "(inode %lu, offset %llu, size %zd, error %d)",  			 inode->i_ino, offset, size, ret);  	} -	if (io->iocb) -		aio_complete(io->iocb, io->result, 0); - -	if (io->flag & EXT4_IO_END_DIRECT) -		inode_dio_done(inode);  	/* Wake up anyone waiting on unwritten extent conversion */  	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))  		wake_up_all(ext4_ioend_wq(inode)); +	if (io->flag & EXT4_IO_END_DIRECT) +		inode_dio_done(inode); +	if (io->iocb) +		aio_complete(io->iocb, io->result, 0);  	return ret;  } @@ -119,7 +117,6 @@ static void dump_completed_IO(struct inode *inode)  #ifdef	EXT4FS_DEBUG  	struct list_head *cur, *before, *after;  	ext4_io_end_t *io, *io0, *io1; -	unsigned long flags;  	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {  		ext4_debug("inode %lu completed_io list is empty\n", @@ -152,26 +149,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)  	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;  	spin_lock_irqsave(&ei->i_completed_io_lock, flags); -	if (list_empty(&ei->i_completed_io_list)) { -		io_end->flag |= EXT4_IO_END_QUEUED; -		queue_work(wq, &io_end->work); -	} +	if (list_empty(&ei->i_completed_io_list)) +		queue_work(wq, &ei->i_unwritten_work);  	list_add_tail(&io_end->list, &ei->i_completed_io_list);  	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  } -static int ext4_do_flush_completed_IO(struct inode *inode, -				      ext4_io_end_t *work_io) +static int ext4_do_flush_completed_IO(struct inode *inode)  {  	ext4_io_end_t *io; -	struct list_head unwritten, complete, to_free; +	struct list_head unwritten;  	unsigned long flags;  	struct ext4_inode_info *ei = EXT4_I(inode);  	int err, ret = 0; -	INIT_LIST_HEAD(&complete); -	INIT_LIST_HEAD(&to_free); -  	spin_lock_irqsave(&ei->i_completed_io_lock, flags);  	dump_completed_IO(inode);  	list_replace_init(&ei->i_completed_io_list, &unwritten); @@ -185,32 +176,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,  		err = ext4_end_io(io);  		if (unlikely(!ret && err))  			ret = err; - -		list_add_tail(&io->list, &complete); -	} -	spin_lock_irqsave(&ei->i_completed_io_lock, flags); -	while (!list_empty(&complete)) { -		io = list_entry(complete.next, ext4_io_end_t, list);  		io->flag &= ~EXT4_IO_END_UNWRITTEN; -		/* end_io context can not be destroyed now because it still -		 * used by queued worker. 
Worker thread will destroy it later */ -		if (io->flag & EXT4_IO_END_QUEUED) -			list_del_init(&io->list); -		else -			list_move(&io->list, &to_free); -	} -	/* If we are called from worker context, it is time to clear queued -	 * flag, and destroy it's end_io if it was converted already */ -	if (work_io) { -		work_io->flag &= ~EXT4_IO_END_QUEUED; -		if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) -			list_add_tail(&work_io->list, &to_free); -	} -	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - -	while (!list_empty(&to_free)) { -		io = list_entry(to_free.next, ext4_io_end_t, list); -		list_del_init(&io->list);  		ext4_free_io_end(io);  	}  	return ret; @@ -219,10 +185,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,  /*   * work on completed aio dio IO, to convert unwritten extents to extents   */ -static void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_work(struct work_struct *work)  { -	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); -	ext4_do_flush_completed_IO(io->inode, io); +	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, +						  i_unwritten_work); +	ext4_do_flush_completed_IO(&ei->vfs_inode);  }  int ext4_flush_unwritten_io(struct inode *inode) @@ -230,7 +197,7 @@ int ext4_flush_unwritten_io(struct inode *inode)  	int ret;  	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&  		     !(inode->i_state & I_FREEING)); -	ret = ext4_do_flush_completed_IO(inode, NULL); +	ret = ext4_do_flush_completed_IO(inode);  	ext4_unwritten_wait(inode);  	return ret;  } @@ -241,7 +208,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)  	if (io) {  		atomic_inc(&EXT4_I(inode)->i_ioend_count);  		io->inode = inode; -		INIT_WORK(&io->work, ext4_end_io_work);  		INIT_LIST_HEAD(&io->list);  	}  	return io; @@ -382,14 +348,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,  		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);  	} -	if (!buffer_mapped(bh) || buffer_delay(bh)) { -		if (!buffer_mapped(bh)) -			clear_buffer_dirty(bh); -		if (io->io_bio) -			ext4_io_submit(io); -		return 0; -	} -  	if (io->io_bio && bh->b_blocknr != io->io_next_block) {  submit_and_retry:  		ext4_io_submit(io); @@ -436,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);  	if (!io_page) { -		set_page_dirty(page); +		redirty_page_for_writepage(wbc, page);  		unlock_page(page);  		return -ENOMEM;  	} @@ -468,7 +426,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  			set_buffer_uptodate(bh);  			continue;  		} -		clear_buffer_dirty(bh); +		if (!buffer_dirty(bh) || buffer_delay(bh) || +		    !buffer_mapped(bh) || buffer_unwritten(bh)) { +			/* A hole? We can safely clear the dirty bit */ +			if (!buffer_mapped(bh)) +				clear_buffer_dirty(bh); +			if (io->io_bio) +				ext4_io_submit(io); +			continue; +		}  		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);  		if (ret) {  			/* @@ -476,9 +442,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  			 * we can do but mark the page as dirty, and  			 * better luck next time.  			 
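The page-io.c changes above replace the per-io_end work item with a single work item embedded in the inode, recovered in the handler via container_of(). Reduced to a skeleton with hypothetical names:

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_inode_info {		/* stand-in for ext4_inode_info */
	struct list_head completed_io;	/* protected by a lock, as above */
	struct work_struct unwritten_work;
};

static void demo_end_io_work(struct work_struct *work)
{
	struct demo_inode_info *ei =
		container_of(work, struct demo_inode_info, unwritten_work);

	/* drain ei->completed_io here */
}

/*
 * Once per inode:  INIT_WORK(&ei->unwritten_work, demo_end_io_work);
 * Producers call queue_work() only when the list goes from empty to
 * non-empty, exactly as ext4_add_complete_io() above does.
 */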
*/ -			set_page_dirty(page); +			redirty_page_for_writepage(wbc, page);  			break;  		} +		clear_buffer_dirty(bh);  	}  	unlock_page(page);  	/* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d99387b89ed..b2c8ee56eb9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,  	int err;  	bh = sb_getblk(sb, blk); -	if (!bh) -		return ERR_PTR(-EIO); +	if (unlikely(!bh)) +		return ERR_PTR(-ENOMEM);  	if ((err = ext4_journal_get_write_access(handle, bh))) {  		brelse(bh);  		bh = ERR_PTR(err); @@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,  			return err;  		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); -		if (!bh) -			return -EIO; +		if (unlikely(!bh)) +			return -ENOMEM;  		err = ext4_journal_get_write_access(handle, bh);  		if (err) @@ -466,7 +466,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,  	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);  	/* This transaction may be extended/restarted along the way */ -	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); +	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,  				goto out;  			gdb = sb_getblk(sb, block); -			if (!gdb) { -				err = -EIO; +			if (unlikely(!gdb)) { +				err = -ENOMEM;  				goto out;  			} @@ -1031,7 +1031,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,  	handle_t *handle;  	int err = 0, err2; -	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); +	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);  	if (IS_ERR(handle)) {  		group = 1;  		err = PTR_ERR(handle); @@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,  					ext4_bg_has_super(sb, group));  		bh = sb_getblk(sb, backup_block); -		if (!bh) { -			err = -EIO; +		if (unlikely(!bh)) { +			err = -ENOMEM;  			break;  		}  		ext4_debug("update metadata backup %llu(+%llu)\n", @@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,  static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)  {  	struct buffer_head *bh = sb_getblk(sb, block); -	if (!bh) +	if (unlikely(!bh))  		return NULL;  	if (!bh_uptodate_or_lock(bh)) {  		if (bh_submit_read(bh) < 0) { @@ -1247,7 +1247,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,  		ext4_inode_table_set(sb, gdp, group_data->inode_table);  		ext4_free_group_clusters_set(sb, gdp, -					     EXT4_B2C(sbi, group_data->free_blocks_count)); +			EXT4_NUM_B2C(sbi, group_data->free_blocks_count));  		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));  		if (ext4_has_group_desc_csum(sb))  			ext4_itable_unused_set(sb, gdp, @@ -1349,7 +1349,7 @@ static void ext4_update_super(struct super_block *sb,  	/* Update the free space counts */  	percpu_counter_add(&sbi->s_freeclusters_counter, -			   EXT4_B2C(sbi, free_blocks)); +			   EXT4_NUM_B2C(sbi, free_blocks));  	percpu_counter_add(&sbi->s_freeinodes_counter,  			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count); @@ -1360,7 +1360,7 @@ static void ext4_update_super(struct super_block *sb,  	    sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group;  		flex_group = ext4_flex_group(sbi, group_data[0].group); -		atomic_add(EXT4_B2C(sbi, free_blocks), 
+		atomic_add(EXT4_NUM_B2C(sbi, free_blocks),  			   &sbi->s_flex_groups[flex_group].free_clusters);  		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,  			   &sbi->s_flex_groups[flex_group].free_inodes); @@ -1412,7 +1412,7 @@ static int ext4_flex_group_add(struct super_block *sb,  	 * modify each of the reserved GDT dindirect blocks.  	 */  	credit = flex_gd->count * 4 + reserved_gdb; -	handle = ext4_journal_start_sb(sb, credit); +	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);  	if (IS_ERR(handle)) {  		err = PTR_ERR(handle);  		goto exit; @@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,  		group_data[i].blocks_count = blocks_per_group;  		overhead = ext4_group_overhead_blocks(sb, group + i);  		group_data[i].free_blocks_count = blocks_per_group - overhead; -		if (ext4_has_group_desc_csum(sb)) +		if (ext4_has_group_desc_csum(sb)) {  			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |  					       EXT4_BG_INODE_UNINIT; -		else +			if (!test_opt(sb, INIT_INODE_TABLE)) +				flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; +		} else  			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;  	} @@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)  	err = ext4_alloc_flex_bg_array(sb, input->group + 1);  	if (err) -		return err; +		goto out;  	err = ext4_mb_alloc_groupinfo(sb, input->group + 1);  	if (err) @@ -1622,7 +1624,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,  	/* We will update the superblock, one block bitmap, and  	 * one group descriptor via ext4_group_add_blocks().  	 */ -	handle = ext4_journal_start_sb(sb, 3); +	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);  	if (IS_ERR(handle)) {  		err = PTR_ERR(handle);  		ext4_warning(sb, "error %d on journal start", err); @@ -1786,7 +1788,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)  		credits += 3;	/* block bitmap, bg descriptor, resize inode */  	} -	handle = ext4_journal_start_sb(sb, credits); +	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);  	if (IS_ERR(handle))  		return PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3cdb0a2fc64..5e6c8783619 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,8 +69,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,  static void ext4_clear_journal_err(struct super_block *sb,  				   struct ext4_super_block *es);  static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block *sb, int errno, -				     char nbuf[16]);  static int ext4_remount(struct super_block *sb, int *flags, char *data);  static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);  static int ext4_unfreeze(struct super_block *sb); @@ -296,107 +294,6 @@ void ext4_itable_unused_set(struct super_block *sb,  } -/* Just increment the non-pointer handle value */ -static handle_t *ext4_get_nojournal(void) -{ -	handle_t *handle = current->journal_info; -	unsigned long ref_cnt = (unsigned long)handle; - -	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); - -	ref_cnt++; -	handle = (handle_t *)ref_cnt; - -	current->journal_info = handle; -	return handle; -} - - -/* Decrement the non-pointer handle value */ -static void ext4_put_nojournal(handle_t *handle) -{ -	unsigned long ref_cnt = (unsigned long)handle; - -	BUG_ON(ref_cnt == 0); - -	ref_cnt--; -	handle = (handle_t *)ref_cnt; - -	current->journal_info = handle; -} - -/* - * Wrappers for jbd2_journal_start/end. 
- */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) -{ -	journal_t *journal; - -	trace_ext4_journal_start(sb, nblocks, _RET_IP_); -	if (sb->s_flags & MS_RDONLY) -		return ERR_PTR(-EROFS); - -	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); -	journal = EXT4_SB(sb)->s_journal; -	if (!journal) -		return ext4_get_nojournal(); -	/* -	 * Special case here: if the journal has aborted behind our -	 * backs (eg. EIO in the commit thread), then we still need to -	 * take the FS itself readonly cleanly. -	 */ -	if (is_journal_aborted(journal)) { -		ext4_abort(sb, "Detected aborted journal"); -		return ERR_PTR(-EROFS); -	} -	return jbd2_journal_start(journal, nblocks); -} - -int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) -{ -	struct super_block *sb; -	int err; -	int rc; - -	if (!ext4_handle_valid(handle)) { -		ext4_put_nojournal(handle); -		return 0; -	} -	sb = handle->h_transaction->t_journal->j_private; -	err = handle->h_err; -	rc = jbd2_journal_stop(handle); - -	if (!err) -		err = rc; -	if (err) -		__ext4_std_error(sb, where, line, err); -	return err; -} - -void ext4_journal_abort_handle(const char *caller, unsigned int line, -			       const char *err_fn, struct buffer_head *bh, -			       handle_t *handle, int err) -{ -	char nbuf[16]; -	const char *errstr = ext4_decode_error(NULL, err, nbuf); - -	BUG_ON(!ext4_handle_valid(handle)); - -	if (bh) -		BUFFER_TRACE(bh, "abort"); - -	if (!handle->h_err) -		handle->h_err = err; - -	if (is_handle_aborted(handle)) -		return; - -	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", -	       caller, line, errstr, err_fn); - -	jbd2_journal_abort_handle(handle); -} -  static void __save_error_info(struct super_block *sb, const char *func,  			    unsigned int line)  { @@ -553,7 +450,7 @@ void ext4_error_file(struct file *file, const char *function,  	va_list args;  	struct va_format vaf;  	struct ext4_super_block *es; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	char pathname[80], *path;  	es = EXT4_SB(inode->i_sb)->s_es; @@ -582,8 +479,8 @@ void ext4_error_file(struct file *file, const char *function,  	ext4_handle_error(inode->i_sb);  } -static const char *ext4_decode_error(struct super_block *sb, int errno, -				     char nbuf[16]) +const char *ext4_decode_error(struct super_block *sb, int errno, +			      char nbuf[16])  {  	char *errstr = NULL; @@ -858,6 +755,7 @@ static void ext4_put_super(struct super_block *sb)  			ext4_abort(sb, "Couldn't clean up the journal");  	} +	ext4_es_unregister_shrinker(sb);  	del_timer(&sbi->s_err_report);  	ext4_release_system_zone(sb);  	ext4_mb_release(sb); @@ -885,6 +783,7 @@ static void ext4_put_super(struct super_block *sb)  	percpu_counter_destroy(&sbi->s_freeinodes_counter);  	percpu_counter_destroy(&sbi->s_dirs_counter);  	percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +	percpu_counter_destroy(&sbi->s_extent_cache_cnt);  	brelse(sbi->s_sbh);  #ifdef CONFIG_QUOTA  	for (i = 0; i < MAXQUOTAS; i++) @@ -939,11 +838,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  		return NULL;  	ei->vfs_inode.i_version = 1; -	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));  	INIT_LIST_HEAD(&ei->i_prealloc_list);  	spin_lock_init(&ei->i_prealloc_lock);  	ext4_es_init_tree(&ei->i_es_tree);  	rwlock_init(&ei->i_es_lock); +	INIT_LIST_HEAD(&ei->i_es_lru); +	ei->i_es_lru_nr = 0;  	ei->i_reserved_data_blocks = 0;  	ei->i_reserved_meta_blocks = 0;  	ei->i_allocated_meta_blocks = 
0; @@ -960,6 +860,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  	ei->i_datasync_tid = 0;  	atomic_set(&ei->i_ioend_count, 0);  	atomic_set(&ei->i_unwritten, 0); +	INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);  	return &ei->vfs_inode;  } @@ -1031,6 +932,7 @@ void ext4_clear_inode(struct inode *inode)  	dquot_drop(inode);  	ext4_discard_preallocations(inode);  	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); +	ext4_es_lru_del(inode);  	if (EXT4_I(inode)->jinode) {  		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),  					       EXT4_I(inode)->jinode); @@ -1280,8 +1182,8 @@ static const match_table_t tokens = {  	{Opt_stripe, "stripe=%u"},  	{Opt_delalloc, "delalloc"},  	{Opt_nodelalloc, "nodelalloc"}, -	{Opt_mblk_io_submit, "mblk_io_submit"}, -	{Opt_nomblk_io_submit, "nomblk_io_submit"}, +	{Opt_removed, "mblk_io_submit"}, +	{Opt_removed, "nomblk_io_submit"},  	{Opt_block_validity, "block_validity"},  	{Opt_noblock_validity, "noblock_validity"},  	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1337,6 +1239,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)  {  	struct ext4_sb_info *sbi = EXT4_SB(sb);  	char *qname; +	int ret = -1;  	if (sb_any_quota_loaded(sb) &&  		!sbi->s_qf_names[qtype]) { @@ -1345,29 +1248,37 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)  			"quota options when quota turned on");  		return -1;  	} +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { +		ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " +			 "when QUOTA feature is enabled"); +		return -1; +	}  	qname = match_strdup(args);  	if (!qname) {  		ext4_msg(sb, KERN_ERR,  			"Not enough memory for storing quotafile name");  		return -1;  	} -	if (sbi->s_qf_names[qtype] && -		strcmp(sbi->s_qf_names[qtype], qname)) { -		ext4_msg(sb, KERN_ERR, -			"%s quota file already specified", QTYPE2NAME(qtype)); -		kfree(qname); -		return -1; +	if (sbi->s_qf_names[qtype]) { +		if (strcmp(sbi->s_qf_names[qtype], qname) == 0) +			ret = 1; +		else +			ext4_msg(sb, KERN_ERR, +				 "%s quota file already specified", +				 QTYPE2NAME(qtype)); +		goto errout;  	} -	sbi->s_qf_names[qtype] = qname; -	if (strchr(sbi->s_qf_names[qtype], '/')) { +	if (strchr(qname, '/')) {  		ext4_msg(sb, KERN_ERR,  			"quotafile must be on filesystem root"); -		kfree(sbi->s_qf_names[qtype]); -		sbi->s_qf_names[qtype] = NULL; -		return -1; +		goto errout;  	} +	sbi->s_qf_names[qtype] = qname;  	set_opt(sb, QUOTA);  	return 1; +errout: +	kfree(qname); +	return ret;  }  static int clear_qf_name(struct super_block *sb, int qtype) @@ -1381,10 +1292,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)  			" when quota turned on");  		return -1;  	} -	/* -	 * The space will be released later when all options are confirmed -	 * to be correct -	 */ +	kfree(sbi->s_qf_names[qtype]);  	sbi->s_qf_names[qtype] = NULL;  	return 1;  } @@ -1404,6 +1312,9 @@ static int clear_qf_name(struct super_block *sb, int qtype)  #define MOPT_QFMT	MOPT_NOSUPPORT  #endif  #define MOPT_DATAJ	0x0080 +#define MOPT_NO_EXT2	0x0100 +#define MOPT_NO_EXT3	0x0200 +#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)  static const struct mount_opts {  	int	token; @@ -1414,25 +1325,31 @@ static const struct mount_opts {  	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},  	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},  	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, -	{Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, -	{Opt_nomblk_io_submit, 
EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},  	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},  	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, -	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, -	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, +	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, +	 MOPT_EXT4_ONLY | MOPT_SET}, +	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, +	 MOPT_EXT4_ONLY | MOPT_CLEAR},  	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},  	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, -	{Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, -	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, -	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, +	{Opt_delalloc, EXT4_MOUNT_DELALLOC, +	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, +	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, +	 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, +	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, +	 MOPT_EXT4_ONLY | MOPT_SET},  	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | -				    EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, -	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, +				    EXT4_MOUNT_JOURNAL_CHECKSUM), +	 MOPT_EXT4_ONLY | MOPT_SET}, +	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},  	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},  	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},  	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, -	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, -	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, +	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, +	 MOPT_NO_EXT2 | MOPT_SET}, +	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, +	 MOPT_NO_EXT2 | MOPT_CLEAR},  	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},  	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},  	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1444,9 +1361,14 @@ static const struct mount_opts {  	{Opt_inode_readahead_blks, 0, MOPT_GTE0},  	{Opt_init_itable, 0, MOPT_GTE0},  	{Opt_stripe, 0, MOPT_GTE0}, -	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, -	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, -	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, +	{Opt_resuid, 0, MOPT_GTE0}, +	{Opt_resgid, 0, MOPT_GTE0}, +	{Opt_journal_dev, 0, MOPT_GTE0}, +	{Opt_journal_ioprio, 0, MOPT_GTE0}, +	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, +	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, +	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, +	 MOPT_NO_EXT2 | MOPT_DATAJ},  	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},  	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},  #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1496,8 +1418,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,  	else if (token == Opt_offgrpjquota)  		return clear_qf_name(sb, GRPQUOTA);  #endif -	if (args->from && match_int(args, &arg)) -		return -1;  	switch (token) {  	case Opt_noacl:  	case Opt_nouser_xattr: @@ -1506,138 +1426,156 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,  	case Opt_sb:  		return 1;	/* handled by get_sb_block() */  	case Opt_removed: -		ext4_msg(sb, KERN_WARNING, -			 "Ignoring removed %s option", opt); +		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);  		return 1; -	case Opt_resuid: +	case Opt_abort: +		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; +		return 1; +	case Opt_i_version: +		
sb->s_flags |= MS_I_VERSION; +		return 1; +	} + +	for (m = ext4_mount_opts; m->token != Opt_err; m++) +		if (token == m->token) +			break; + +	if (m->token == Opt_err) { +		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " +			 "or missing value", opt); +		return -1; +	} + +	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { +		ext4_msg(sb, KERN_ERR, +			 "Mount option \"%s\" incompatible with ext2", opt); +		return -1; +	} +	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { +		ext4_msg(sb, KERN_ERR, +			 "Mount option \"%s\" incompatible with ext3", opt); +		return -1; +	} + +	if (args->from && match_int(args, &arg)) +		return -1; +	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) +		return -1; +	if (m->flags & MOPT_EXPLICIT) +		set_opt2(sb, EXPLICIT_DELALLOC); +	if (m->flags & MOPT_CLEAR_ERR) +		clear_opt(sb, ERRORS_MASK); +	if (token == Opt_noquota && sb_any_quota_loaded(sb)) { +		ext4_msg(sb, KERN_ERR, "Cannot change quota " +			 "options when quota turned on"); +		return -1; +	} + +	if (m->flags & MOPT_NOSUPPORT) { +		ext4_msg(sb, KERN_ERR, "%s option not supported", opt); +	} else if (token == Opt_commit) { +		if (arg == 0) +			arg = JBD2_DEFAULT_MAX_COMMIT_AGE; +		sbi->s_commit_interval = HZ * arg; +	} else if (token == Opt_max_batch_time) { +		if (arg == 0) +			arg = EXT4_DEF_MAX_BATCH_TIME; +		sbi->s_max_batch_time = arg; +	} else if (token == Opt_min_batch_time) { +		sbi->s_min_batch_time = arg; +	} else if (token == Opt_inode_readahead_blks) { +		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { +			ext4_msg(sb, KERN_ERR, +				 "EXT4-fs: inode_readahead_blks must be " +				 "0 or a power of 2 smaller than 2^31"); +			return -1; +		} +		sbi->s_inode_readahead_blks = arg; +	} else if (token == Opt_init_itable) { +		set_opt(sb, INIT_INODE_TABLE); +		if (!args->from) +			arg = EXT4_DEF_LI_WAIT_MULT; +		sbi->s_li_wait_mult = arg; +	} else if (token == Opt_max_dir_size_kb) { +		sbi->s_max_dir_size_kb = arg; +	} else if (token == Opt_stripe) { +		sbi->s_stripe = arg; +	} else if (token == Opt_resuid) {  		uid = make_kuid(current_user_ns(), arg);  		if (!uid_valid(uid)) {  			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);  			return -1;  		}  		sbi->s_resuid = uid; -		return 1; -	case Opt_resgid: +	} else if (token == Opt_resgid) {  		gid = make_kgid(current_user_ns(), arg);  		if (!gid_valid(gid)) {  			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);  			return -1;  		}  		sbi->s_resgid = gid; -		return 1; -	case Opt_abort: -		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; -		return 1; -	case Opt_i_version: -		sb->s_flags |= MS_I_VERSION; -		return 1; -	case Opt_journal_dev: +	} else if (token == Opt_journal_dev) {  		if (is_remount) {  			ext4_msg(sb, KERN_ERR,  				 "Cannot specify journal on remount");  			return -1;  		}  		*journal_devnum = arg; -		return 1; -	case Opt_journal_ioprio: -		if (arg < 0 || arg > 7) -			return -1; -		*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); -		return 1; -	} - -	for (m = ext4_mount_opts; m->token != Opt_err; m++) { -		if (token != m->token) -			continue; -		if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) -			return -1; -		if (m->flags & MOPT_EXPLICIT) -			set_opt2(sb, EXPLICIT_DELALLOC); -		if (m->flags & MOPT_CLEAR_ERR) -			clear_opt(sb, ERRORS_MASK); -		if (token == Opt_noquota && sb_any_quota_loaded(sb)) { -			ext4_msg(sb, KERN_ERR, "Cannot change quota " -				 "options when quota turned on"); +	} else if (token == Opt_journal_ioprio) { +		if (arg > 7) { +			ext4_msg(sb, KERN_ERR, "Invalid journal 
IO priority" +				 " (must be 0-7)");  			return -1;  		} - -		if (m->flags & MOPT_NOSUPPORT) { -			ext4_msg(sb, KERN_ERR, "%s option not supported", opt); -		} else if (token == Opt_commit) { -			if (arg == 0) -				arg = JBD2_DEFAULT_MAX_COMMIT_AGE; -			sbi->s_commit_interval = HZ * arg; -		} else if (token == Opt_max_batch_time) { -			if (arg == 0) -				arg = EXT4_DEF_MAX_BATCH_TIME; -			sbi->s_max_batch_time = arg; -		} else if (token == Opt_min_batch_time) { -			sbi->s_min_batch_time = arg; -		} else if (token == Opt_inode_readahead_blks) { -			if (arg > (1 << 30)) -				return -1; -			if (arg && !is_power_of_2(arg)) { +		*journal_ioprio = +			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); +	} else if (m->flags & MOPT_DATAJ) { +		if (is_remount) { +			if (!sbi->s_journal) +				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); +			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {  				ext4_msg(sb, KERN_ERR, -					 "EXT4-fs: inode_readahead_blks" -					 " must be a power of 2"); -				return -1; -			} -			sbi->s_inode_readahead_blks = arg; -		} else if (token == Opt_init_itable) { -			set_opt(sb, INIT_INODE_TABLE); -			if (!args->from) -				arg = EXT4_DEF_LI_WAIT_MULT; -			sbi->s_li_wait_mult = arg; -		} else if (token == Opt_max_dir_size_kb) { -			sbi->s_max_dir_size_kb = arg; -		} else if (token == Opt_stripe) { -			sbi->s_stripe = arg; -		} else if (m->flags & MOPT_DATAJ) { -			if (is_remount) { -				if (!sbi->s_journal) -					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); -				else if (test_opt(sb, DATA_FLAGS) != -					 m->mount_opt) { -					ext4_msg(sb, KERN_ERR,  					 "Cannot change data mode on remount"); -					return -1; -				} -			} else { -				clear_opt(sb, DATA_FLAGS); -				sbi->s_mount_opt |= m->mount_opt; -			} -#ifdef CONFIG_QUOTA -		} else if (m->flags & MOPT_QFMT) { -			if (sb_any_quota_loaded(sb) && -			    sbi->s_jquota_fmt != m->mount_opt) { -				ext4_msg(sb, KERN_ERR, "Cannot " -					 "change journaled quota options " -					 "when quota turned on");  				return -1;  			} -			sbi->s_jquota_fmt = m->mount_opt; -#endif  		} else { -			if (!args->from) -				arg = 1; -			if (m->flags & MOPT_CLEAR) -				arg = !arg; -			else if (unlikely(!(m->flags & MOPT_SET))) { -				ext4_msg(sb, KERN_WARNING, -					 "buggy handling of option %s", opt); -				WARN_ON(1); -				return -1; -			} -			if (arg != 0) -				sbi->s_mount_opt |= m->mount_opt; -			else -				sbi->s_mount_opt &= ~m->mount_opt; +			clear_opt(sb, DATA_FLAGS); +			sbi->s_mount_opt |= m->mount_opt;  		} -		return 1; +#ifdef CONFIG_QUOTA +	} else if (m->flags & MOPT_QFMT) { +		if (sb_any_quota_loaded(sb) && +		    sbi->s_jquota_fmt != m->mount_opt) { +			ext4_msg(sb, KERN_ERR, "Cannot change journaled " +				 "quota options when quota turned on"); +			return -1; +		} +		if (EXT4_HAS_RO_COMPAT_FEATURE(sb, +					       EXT4_FEATURE_RO_COMPAT_QUOTA)) { +			ext4_msg(sb, KERN_ERR, +				 "Cannot set journaled quota options " +				 "when QUOTA feature is enabled"); +			return -1; +		} +		sbi->s_jquota_fmt = m->mount_opt; +#endif +	} else { +		if (!args->from) +			arg = 1; +		if (m->flags & MOPT_CLEAR) +			arg = !arg; +		else if (unlikely(!(m->flags & MOPT_SET))) { +			ext4_msg(sb, KERN_WARNING, +				 "buggy handling of option %s", opt); +			WARN_ON(1); +			return -1; +		} +		if (arg != 0) +			sbi->s_mount_opt |= m->mount_opt; +		else +			sbi->s_mount_opt &= ~m->mount_opt;  	} -	ext4_msg(sb, KERN_ERR, "Unrecognized mount 
option \"%s\" " -		 "or missing value", opt); -	return -1; +	return 1;  }  static int parse_options(char *options, struct super_block *sb, @@ -1645,9 +1583,7 @@ static int parse_options(char *options, struct super_block *sb,  			 unsigned int *journal_ioprio,  			 int is_remount)  { -#ifdef CONFIG_QUOTA  	struct ext4_sb_info *sbi = EXT4_SB(sb); -#endif  	char *p;  	substring_t args[MAX_OPT_ARGS];  	int token; @@ -1669,6 +1605,12 @@ static int parse_options(char *options, struct super_block *sb,  			return 0;  	}  #ifdef CONFIG_QUOTA +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && +	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { +		ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " +			 "feature is enabled"); +		return 0; +	}  	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {  		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])  			clear_opt(sb, USRQUOTA); @@ -1696,6 +1638,16 @@ static int parse_options(char *options, struct super_block *sb,  		}  	}  #endif +	if (test_opt(sb, DIOREAD_NOLOCK)) { +		int blocksize = +			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + +		if (blocksize < PAGE_CACHE_SIZE) { +			ext4_msg(sb, KERN_ERR, "can't mount with " +				 "dioread_nolock if block size != PAGE_SIZE"); +			return 0; +		} +	}  	return 1;  } @@ -2212,7 +2164,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,  				__func__, inode->i_ino, inode->i_size);  			jbd_debug(2, "truncating inode %lu to %lld bytes\n",  				  inode->i_ino, inode->i_size); +			mutex_lock(&inode->i_mutex);  			ext4_truncate(inode); +			mutex_unlock(&inode->i_mutex);  			nr_truncates++;  		} else {  			ext4_msg(sb, KERN_DEBUG, @@ -2766,7 +2720,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)  			break;  	} -	if (group == ngroups) +	if (group >= ngroups)  		ret = 1;  	if (!ret) { @@ -3006,33 +2960,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,  	return elr;  } -static int ext4_register_li_request(struct super_block *sb, -				    ext4_group_t first_not_zeroed) +int ext4_register_li_request(struct super_block *sb, +			     ext4_group_t first_not_zeroed)  {  	struct ext4_sb_info *sbi = EXT4_SB(sb); -	struct ext4_li_request *elr; +	struct ext4_li_request *elr = NULL;  	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;  	int ret = 0; +	mutex_lock(&ext4_li_mtx);  	if (sbi->s_li_request != NULL) {  		/*  		 * Reset timeout so it can be computed again, because  		 * s_li_wait_mult might have changed.  		 
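		 * (Inferred behaviour, not stated by the patch itself: with
		 * lr_timeout reset to zero, the lazyinit thread derives a
		 * fresh timeout from the updated multiplier on its next run.)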
*/  		sbi->s_li_request->lr_timeout = 0; -		return 0; +		goto out;  	}  	if (first_not_zeroed == ngroups ||  	    (sb->s_flags & MS_RDONLY) ||  	    !test_opt(sb, INIT_INODE_TABLE)) -		return 0; +		goto out;  	elr = ext4_li_request_new(sb, first_not_zeroed); -	if (!elr) -		return -ENOMEM; - -	mutex_lock(&ext4_li_mtx); +	if (!elr) { +		ret = -ENOMEM; +		goto out; +	}  	if (NULL == ext4_li_info) {  		ret = ext4_li_info_new(); @@ -3223,6 +3178,10 @@ int ext4_calculate_overhead(struct super_block *sb)  			memset(buf, 0, PAGE_SIZE);  		cond_resched();  	} +	/* Add the journal blocks as well */ +	if (sbi->s_journal) +		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); +  	sbi->s_overhead = overhead;  	smp_wmb();  	free_page((unsigned long) buf); @@ -3365,7 +3324,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  #ifdef CONFIG_EXT4_FS_POSIX_ACL  	set_opt(sb, POSIX_ACL);  #endif -	set_opt(sb, MBLK_IO_SUBMIT);  	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)  		set_opt(sb, JOURNAL_DATA);  	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3436,15 +3394,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  			clear_opt(sb, DELALLOC);  	} -	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); -	if (test_opt(sb, DIOREAD_NOLOCK)) { -		if (blocksize < PAGE_SIZE) { -			ext4_msg(sb, KERN_ERR, "can't mount with " -				 "dioread_nolock if block size != PAGE_SIZE"); -			goto failed_mount; -		} -	} -  	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |  		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3486,6 +3435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))  		goto failed_mount; +	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);  	if (blocksize < EXT4_MIN_BLOCK_SIZE ||  	    blocksize > EXT4_MAX_BLOCK_SIZE) {  		ext4_msg(sb, KERN_ERR, @@ -3757,6 +3707,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	if (!err) {  		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);  	} +	if (!err) { +		err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); +	}  	if (err) {  		ext4_msg(sb, KERN_ERR, "insufficient memory");  		goto failed_mount3; @@ -3766,6 +3719,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	sbi->s_max_writeback_mb_bump = 128;  	sbi->s_extent_max_zeroout_kb = 32; +	/* Register extent status tree shrinker */ +	ext4_es_register_shrinker(sb); +  	/*  	 * set up enough so that it can read an inode  	 */ @@ -3777,13 +3733,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_export_op = &ext4_export_ops;  	sb->s_xattr = ext4_xattr_handlers;  #ifdef CONFIG_QUOTA -	sb->s_qcop = &ext4_qctl_operations;  	sb->dq_op = &ext4_quota_operations; - -	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { -		/* Use qctl operations for hidden quota files. */ +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))  		sb->s_qcop = &ext4_qctl_sysfile_operations; -	} +	else +		sb->s_qcop = &ext4_qctl_operations;  #endif  	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -3979,6 +3933,16 @@ no_journal:  	if (err)  		goto failed_mount7; +#ifdef CONFIG_QUOTA +	/* Enable quota usage during mount. 
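+	 * (As the hunks below show, this block now sits before the orphan
+	 * cleanup, so the block and inode updates made while processing
+	 * orphans are charged against the quota files.)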
*/ +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && +	    !(sb->s_flags & MS_RDONLY)) { +		err = ext4_enable_quotas(sb); +		if (err) +			goto failed_mount8; +	} +#endif  /* CONFIG_QUOTA */ +  	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;  	ext4_orphan_cleanup(sb, es);  	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -3996,16 +3960,6 @@ no_journal:  	} else  		descr = "out journal"; -#ifdef CONFIG_QUOTA -	/* Enable quota usage during mount. */ -	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && -	    !(sb->s_flags & MS_RDONLY)) { -		err = ext4_enable_quotas(sb); -		if (err) -			goto failed_mount7; -	} -#endif  /* CONFIG_QUOTA */ -  	if (test_opt(sb, DISCARD)) {  		struct request_queue *q = bdev_get_queue(sb->s_bdev);  		if (!blk_queue_discard(q)) @@ -4029,6 +3983,10 @@ cantfind_ext4:  		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");  	goto failed_mount; +#ifdef CONFIG_QUOTA +failed_mount8: +	kobject_del(&sbi->s_kobj); +#endif  failed_mount7:  	ext4_unregister_li_request(sb);  failed_mount6: @@ -4055,6 +4013,7 @@ failed_mount3:  	percpu_counter_destroy(&sbi->s_freeinodes_counter);  	percpu_counter_destroy(&sbi->s_dirs_counter);  	percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +	percpu_counter_destroy(&sbi->s_extent_cache_cnt);  	if (sbi->s_mmp_tsk)  		kthread_stop(sbi->s_mmp_tsk);  failed_mount2: @@ -4470,16 +4429,12 @@ static void ext4_clear_journal_err(struct super_block *sb,  int ext4_force_commit(struct super_block *sb)  {  	journal_t *journal; -	int ret = 0;  	if (sb->s_flags & MS_RDONLY)  		return 0;  	journal = EXT4_SB(sb)->s_journal; -	if (journal) -		ret = ext4_journal_force_commit(journal); - -	return ret; +	return ext4_journal_force_commit(journal);  }  static int ext4_sync_fs(struct super_block *sb, int wait) @@ -4582,7 +4537,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;  	int err = 0;  #ifdef CONFIG_QUOTA -	int i; +	int i, j;  #endif  	char *orig_data = kstrdup(data, GFP_KERNEL); @@ -4598,7 +4553,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  #ifdef CONFIG_QUOTA  	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;  	for (i = 0; i < MAXQUOTAS; i++) -		old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +		if (sbi->s_qf_names[i]) { +			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], +							 GFP_KERNEL); +			if (!old_opts.s_qf_names[i]) { +				for (j = 0; j < i; j++) +					kfree(old_opts.s_qf_names[j]); +				kfree(orig_data); +				return -ENOMEM; +			} +		} else +			old_opts.s_qf_names[i] = NULL;  #endif  	if (sbi->s_journal && sbi->s_journal->j_task->io_context)  		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; @@ -4725,15 +4690,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  	}  	ext4_setup_system_zone(sb); -	if (sbi->s_journal == NULL) +	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))  		ext4_commit_super(sb, 1);  #ifdef CONFIG_QUOTA  	/* Release old quota file names */  	for (i = 0; i < MAXQUOTAS; i++) -		if (old_opts.s_qf_names[i] && -		    old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -			kfree(old_opts.s_qf_names[i]); +		kfree(old_opts.s_qf_names[i]);  	if (enable_quota) {  		if (sb_any_quota_suspended(sb))  			dquot_resume(sb, -1); @@ -4762,9 +4725,7 @@ restore_opts:  #ifdef CONFIG_QUOTA  	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;  	for (i = 0; i < MAXQUOTAS; i++) { -		if (sbi->s_qf_names[i] && -		    old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -	
		kfree(sbi->s_qf_names[i]); +		kfree(sbi->s_qf_names[i]);  		sbi->s_qf_names[i] = old_opts.s_qf_names[i];  	}  #endif @@ -4829,7 +4790,7 @@ static int ext4_write_dquot(struct dquot *dquot)  	struct inode *inode;  	inode = dquot_to_inode(dquot); -	handle = ext4_journal_start(inode, +	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,  				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -4845,7 +4806,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)  	int ret, err;  	handle_t *handle; -	handle = ext4_journal_start(dquot_to_inode(dquot), +	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,  				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -4861,7 +4822,7 @@ static int ext4_release_dquot(struct dquot *dquot)  	int ret, err;  	handle_t *handle; -	handle = ext4_journal_start(dquot_to_inode(dquot), +	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,  				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));  	if (IS_ERR(handle)) {  		/* Release dquot anyway to avoid endless cycle in dqput() */ @@ -4877,9 +4838,12 @@ static int ext4_release_dquot(struct dquot *dquot)  static int ext4_mark_dquot_dirty(struct dquot *dquot)  { +	struct super_block *sb = dquot->dq_sb; +	struct ext4_sb_info *sbi = EXT4_SB(sb); +  	/* Are we journaling quotas? */ -	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || -	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || +	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {  		dquot_mark_dquot_dirty(dquot);  		return ext4_write_dquot(dquot);  	} else { @@ -4893,7 +4857,7 @@ static int ext4_write_info(struct super_block *sb, int type)  	handle_t *handle;  	/* Data block + inode block */ -	handle = ext4_journal_start(sb->s_root->d_inode, 2); +	handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);  	if (IS_ERR(handle))  		return PTR_ERR(handle);  	ret = dquot_commit_info(sb, type); @@ -4999,9 +4963,9 @@ static int ext4_enable_quotas(struct super_block *sb)  						DQUOT_USAGE_ENABLED);  			if (err) {  				ext4_warning(sb, -					"Failed to enable quota (type=%d) " -					"tracking. Please run e2fsck to fix.", -					type); +					"Failed to enable quota tracking " +					"(type=%d, err=%d). Please run " +					"e2fsck to fix.", type, err);  				return err;  			}  		} @@ -5039,7 +5003,7 @@ static int ext4_quota_off(struct super_block *sb, int type)  	/* Update modification times of quota files when userspace can  	 * start looking at them */ -	handle = ext4_journal_start(inode, 1); +	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);  	if (IS_ERR(handle))  		goto out;  	inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a91ebc2b66..3a120b27724 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -549,7 +549,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,  		error = ext4_handle_dirty_xattr_block(handle, inode, bh);  		if (IS_SYNC(inode))  			ext4_handle_sync(handle); -		dquot_free_block(inode, 1); +		dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));  		ea_bdebug(bh, "refcount now=%d; releasing",  			  le32_to_cpu(BHDR(bh)->h_refcount));  	} @@ -832,7 +832,8 @@ inserted:  			else {  				/* The old block is released after updating  				   the inode. 
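				   (The dquot charge below is one cluster
				   converted to blocks with EXT4_C2B(): on
				   bigalloc file systems an xattr block
				   consumes a whole cluster.)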
*/ -				error = dquot_alloc_block(inode, 1); +				error = dquot_alloc_block(inode, +						EXT4_C2B(EXT4_SB(sb), 1));  				if (error)  					goto cleanup;  				error = ext4_journal_get_write_access(handle, @@ -886,17 +887,18 @@ inserted:  				  (unsigned long long)block);  			new_bh = sb_getblk(sb, block); -			if (!new_bh) { +			if (unlikely(!new_bh)) { +				error = -ENOMEM;  getblk_failed:  				ext4_free_blocks(handle, inode, NULL, block, 1,  						 EXT4_FREE_BLOCKS_METADATA); -				error = -EIO;  				goto cleanup;  			}  			lock_buffer(new_bh);  			error = ext4_journal_get_create_access(handle, new_bh);  			if (error) {  				unlock_buffer(new_bh); +				error = -EIO;  				goto getblk_failed;  			}  			memcpy(new_bh->b_data, s->base, new_bh->b_size); @@ -928,7 +930,7 @@ cleanup:  	return error;  cleanup_dquot: -	dquot_free_block(inode, 1); +	dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));  	goto cleanup;  bad_block: @@ -1164,17 +1166,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,  {  	handle_t *handle;  	int error, retries = 0; -	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); +	int credits = ext4_jbd2_credits_xattr(inode);  retry: -	/* -	 * In case of inline data, we may push out the data to a block, -	 * So reserve the journal space first. -	 */ -	if (ext4_has_inline_data(inode)) -		credits += ext4_writepage_trans_blocks(inode) + 1; - -	handle = ext4_journal_start(inode, credits); +	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);  	if (IS_ERR(handle)) {  		error = PTR_ERR(handle);  	} else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69eda787a96..aa25deb5c6c 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -125,74 +125,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,  				       struct ext4_xattr_info *i,  				       struct ext4_xattr_ibody_find *is); -extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); -extern int ext4_get_max_inline_size(struct inode *inode); -extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, -				   struct ext4_iloc *iloc, -				   void *buffer, loff_t pos, -				   unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, -				    unsigned int len); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, -				 unsigned int len); -extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); - -extern int ext4_readpage_inline(struct inode *inode, struct page *page); -extern int ext4_try_to_write_inline_data(struct address_space *mapping, -					 struct inode *inode, -					 loff_t pos, unsigned len, -					 unsigned flags, -					 struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, -				      loff_t pos, unsigned len, -				      unsigned copied, -				      struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, -				  unsigned len, -				  struct page *page); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, -					   struct inode *inode, -					   loff_t pos, unsigned len, -					   unsigned flags, -					   struct page **pagep, -					   void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, -					 unsigned len, unsigned copied, -					 struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, -				     struct inode 
*inode); -extern int ext4_try_create_inline_dir(handle_t *handle, -				      struct inode *parent, -				      struct inode *inode); -extern int ext4_read_inline_dir(struct file *filp, -				void *dirent, filldir_t filldir, -				int *has_inline_data); -extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, -					const struct qstr *d_name, -					struct ext4_dir_entry_2 **res_dir, -					int *has_inline_data); -extern int ext4_delete_inline_entry(handle_t *handle, -				    struct inode *dir, -				    struct ext4_dir_entry_2 *de_del, -				    struct buffer_head *bh, -				    int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); -extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, -					struct ext4_dir_entry_2 **parent_de, -					int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, -				   struct fiemap_extent_info *fieinfo, -				   int *has_inline); -extern int ext4_try_to_evict_inline_data(handle_t *handle, -					 struct inode *inode, -					 int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); - -extern int ext4_convert_inline_data(struct inode *inode); -  #ifdef CONFIG_EXT4_FS_SECURITY  extern int ext4_init_security(handle_t *handle, struct inode *inode,  			      struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig new file mode 100644 index 00000000000..fd27e7e6326 --- /dev/null +++ b/fs/f2fs/Kconfig @@ -0,0 +1,53 @@ +config F2FS_FS +	tristate "F2FS filesystem support (EXPERIMENTAL)" +	depends on BLOCK +	help +	  F2FS is based on Log-structured File System (LFS), which supports +	  versatile "flash-friendly" features. The design has been focused on +	  addressing the fundamental issues in LFS, which are the snowball effect +	  of the wandering tree and high cleaning overhead. + +	  Since flash-based storages show different characteristics according to +	  the internal geometry or flash memory management schemes aka FTL, F2FS +	  and tools support various parameters not only for configuring on-disk +	  layout, but also for selecting allocation and cleaning algorithms. + +	  If unsure, say N. + +config F2FS_STAT_FS +	bool "F2FS Status Information" +	depends on F2FS_FS && DEBUG_FS +	default y +	help +	  /sys/kernel/debug/f2fs/ contains information about all the partitions +	  mounted as f2fs. Each file shows the whole f2fs information. + +	  /sys/kernel/debug/f2fs/status includes: +	    - major file system information managed by f2fs currently +	    - average SIT information about whole segments +	    - current memory footprint consumed by f2fs. + +config F2FS_FS_XATTR +	bool "F2FS extended attributes" +	depends on F2FS_FS +	default y +	help +	  Extended attributes are name:value pairs associated with inodes by +	  the kernel or by users (see the attr(5) manual page, or visit +	  <http://acl.bestbits.at/> for details). + +	  If unsure, say N. + +config F2FS_FS_POSIX_ACL +	bool "F2FS Access Control Lists" +	depends on F2FS_FS_XATTR +	select FS_POSIX_ACL +	default y +	help +	  Posix Access Control Lists (ACLs) support permissions for users and +	  groups beyond the owner/group/world scheme. + +	  To learn more about Access Control Lists, visit the POSIX ACLs for +	  Linux website <http://acl.bestbits.at/>.
+ +	  If you don't know what Access Control Lists are, say N diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile new file mode 100644 index 00000000000..27a0820340b --- /dev/null +++ b/fs/f2fs/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_F2FS_FS) += f2fs.o + +f2fs-y		:= dir.o file.o inode.o namei.o hash.o super.o +f2fs-y		+= checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o +f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o +f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c new file mode 100644 index 00000000000..137af4255da --- /dev/null +++ b/fs/f2fs/acl.c @@ -0,0 +1,412 @@ +/* + * fs/f2fs/acl.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "xattr.h" +#include "acl.h" + +#define get_inode_mode(i)	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ +					(F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +static inline size_t f2fs_acl_size(int count) +{ +	if (count <= 4) { +		return sizeof(struct f2fs_acl_header) + +			count * sizeof(struct f2fs_acl_entry_short); +	} else { +		return sizeof(struct f2fs_acl_header) + +			4 * sizeof(struct f2fs_acl_entry_short) + +			(count - 4) * sizeof(struct f2fs_acl_entry); +	} +} + +static inline int f2fs_acl_count(size_t size) +{ +	ssize_t s; +	size -= sizeof(struct f2fs_acl_header); +	s = size - 4 * sizeof(struct f2fs_acl_entry_short); +	if (s < 0) { +		if (size % sizeof(struct f2fs_acl_entry_short)) +			return -1; +		return size / sizeof(struct f2fs_acl_entry_short); +	} else { +		if (s % sizeof(struct f2fs_acl_entry)) +			return -1; +		return s / sizeof(struct f2fs_acl_entry) + 4; +	} +} + +static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) +{ +	int i, count; +	struct posix_acl *acl; +	struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; +	struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); +	const char *end = value + size; + +	if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) +		return ERR_PTR(-EINVAL); + +	count = f2fs_acl_count(size); +	if (count < 0) +		return ERR_PTR(-EINVAL); +	if (count == 0) +		return NULL; + +	acl = posix_acl_alloc(count, GFP_KERNEL); +	if (!acl) +		return ERR_PTR(-ENOMEM); + +	for (i = 0; i < count; i++) { + +		if ((char *)entry > end) +			goto fail; + +		acl->a_entries[i].e_tag  = le16_to_cpu(entry->e_tag); +		acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); + +		switch (acl->a_entries[i].e_tag) { +		case ACL_USER_OBJ: +		case ACL_GROUP_OBJ: +		case ACL_MASK: +		case ACL_OTHER: +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry_short)); +			break; + +		case ACL_USER: +			acl->a_entries[i].e_uid = +				make_kuid(&init_user_ns, +						le32_to_cpu(entry->e_id)); +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry)); +			break; +		case ACL_GROUP: +			acl->a_entries[i].e_gid = +				make_kgid(&init_user_ns, +						le32_to_cpu(entry->e_id)); +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry)); +			break; +		default: +			goto fail; +		} +	} +	if ((char *)entry != end) +		goto fail; +	
return acl; +fail: +	posix_acl_release(acl); +	return ERR_PTR(-EINVAL); +} + +static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ +	struct f2fs_acl_header *f2fs_acl; +	struct f2fs_acl_entry *entry; +	int i; + +	f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * +			sizeof(struct f2fs_acl_entry), GFP_KERNEL); +	if (!f2fs_acl) +		return ERR_PTR(-ENOMEM); + +	f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); +	entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); + +	for (i = 0; i < acl->a_count; i++) { + +		entry->e_tag  = cpu_to_le16(acl->a_entries[i].e_tag); +		entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); + +		switch (acl->a_entries[i].e_tag) { +		case ACL_USER: +			entry->e_id = cpu_to_le32( +					from_kuid(&init_user_ns, +						acl->a_entries[i].e_uid)); +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry)); +			break; +		case ACL_GROUP: +			entry->e_id = cpu_to_le32( +					from_kgid(&init_user_ns, +						acl->a_entries[i].e_gid)); +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry)); +			break; +		case ACL_USER_OBJ: +		case ACL_GROUP_OBJ: +		case ACL_MASK: +		case ACL_OTHER: +			entry = (struct f2fs_acl_entry *)((char *)entry + +					sizeof(struct f2fs_acl_entry_short)); +			break; +		default: +			goto fail; +		} +	} +	*size = f2fs_acl_size(acl->a_count); +	return (void *)f2fs_acl; + +fail: +	kfree(f2fs_acl); +	return ERR_PTR(-EINVAL); +} + +struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; +	void *value = NULL; +	struct posix_acl *acl; +	int retval; + +	if (!test_opt(sbi, POSIX_ACL)) +		return NULL; + +	acl = get_cached_acl(inode, type); +	if (acl != ACL_NOT_CACHED) +		return acl; + +	if (type == ACL_TYPE_ACCESS) +		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + +	retval = f2fs_getxattr(inode, name_index, "", NULL, 0); +	if (retval > 0) { +		value = kmalloc(retval, GFP_KERNEL); +		if (!value) +			return ERR_PTR(-ENOMEM); +		retval = f2fs_getxattr(inode, name_index, "", value, retval); +	} + +	if (retval > 0) +		acl = f2fs_acl_from_disk(value, retval); +	else if (retval == -ENODATA) +		acl = NULL; +	else +		acl = ERR_PTR(retval); +	kfree(value); + +	if (!IS_ERR(acl)) +		set_cached_acl(inode, type, acl); + +	return acl; +} + +static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct f2fs_inode_info *fi = F2FS_I(inode); +	int name_index; +	void *value = NULL; +	size_t size = 0; +	int error; + +	if (!test_opt(sbi, POSIX_ACL)) +		return 0; +	if (S_ISLNK(inode->i_mode)) +		return -EOPNOTSUPP; + +	switch (type) { +	case ACL_TYPE_ACCESS: +		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; +		if (acl) { +			error = posix_acl_equiv_mode(acl, &inode->i_mode); +			if (error < 0) +				return error; +			set_acl_inode(fi, inode->i_mode); +			if (error == 0) +				acl = NULL; +		} +		break; + +	case ACL_TYPE_DEFAULT: +		name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; +		if (!S_ISDIR(inode->i_mode)) +			return acl ? 
-EACCES : 0; +		break; + +	default: +		return -EINVAL; +	} + +	if (acl) { +		value = f2fs_acl_to_disk(acl, &size); +		if (IS_ERR(value)) { +			cond_clear_inode_flag(fi, FI_ACL_MODE); +			return (int)PTR_ERR(value); +		} +	} + +	error = f2fs_setxattr(inode, name_index, "", value, size); + +	kfree(value); +	if (!error) +		set_cached_acl(inode, type, acl); + +	cond_clear_inode_flag(fi, FI_ACL_MODE); +	return error; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir) +{ +	struct posix_acl *acl = NULL; +	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); +	int error = 0; + +	if (!S_ISLNK(inode->i_mode)) { +		if (test_opt(sbi, POSIX_ACL)) { +			acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); +			if (IS_ERR(acl)) +				return PTR_ERR(acl); +		} +		if (!acl) +			inode->i_mode &= ~current_umask(); +	} + +	if (test_opt(sbi, POSIX_ACL) && acl) { + +		if (S_ISDIR(inode->i_mode)) { +			error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); +			if (error) +				goto cleanup; +		} +		error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); +		if (error < 0) +			return error; +		if (error > 0) +			error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); +	} +cleanup: +	posix_acl_release(acl); +	return error; +} + +int f2fs_acl_chmod(struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct posix_acl *acl; +	int error; +	mode_t mode = get_inode_mode(inode); + +	if (!test_opt(sbi, POSIX_ACL)) +		return 0; +	if (S_ISLNK(mode)) +		return -EOPNOTSUPP; + +	acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); +	if (IS_ERR(acl) || !acl) +		return PTR_ERR(acl); + +	error = posix_acl_chmod(&acl, GFP_KERNEL, mode); +	if (error) +		return error; +	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); +	posix_acl_release(acl); +	return error; +} + +static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); +	const char *xname = POSIX_ACL_XATTR_DEFAULT; +	size_t size; + +	if (!test_opt(sbi, POSIX_ACL)) +		return 0; + +	if (type == ACL_TYPE_ACCESS) +		xname = POSIX_ACL_XATTR_ACCESS; + +	size = strlen(xname) + 1; +	if (list && size <= list_size) +		memcpy(list, xname, size); +	return size; +} + +static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, +		void *buffer, size_t size, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); +	struct posix_acl *acl; +	int error; + +	if (strcmp(name, "") != 0) +		return -EINVAL; +	if (!test_opt(sbi, POSIX_ACL)) +		return -EOPNOTSUPP; + +	acl = f2fs_get_acl(dentry->d_inode, type); +	if (IS_ERR(acl)) +		return PTR_ERR(acl); +	if (!acl) +		return -ENODATA; +	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); +	posix_acl_release(acl); + +	return error; +} + +static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, +		const void *value, size_t size, int flags, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); +	struct inode *inode = dentry->d_inode; +	struct posix_acl *acl = NULL; +	int error; + +	if (strcmp(name, "") != 0) +		return -EINVAL; +	if (!test_opt(sbi, POSIX_ACL)) +		return -EOPNOTSUPP; +	if (!inode_owner_or_capable(inode)) +		return -EPERM; + +	if (value) { +		acl = posix_acl_from_xattr(&init_user_ns, value, size); +		if (IS_ERR(acl)) +			return PTR_ERR(acl); +		if (acl) { +			error = posix_acl_valid(acl); +			if (error) +				goto release_and_out; +		} +	} else { +		acl = NULL; +	} + +	error = f2fs_set_acl(inode, type, acl); + +release_and_out: +	posix_acl_release(acl); +	
return error; +} + +const struct xattr_handler f2fs_xattr_acl_default_handler = { +	.prefix = POSIX_ACL_XATTR_DEFAULT, +	.flags = ACL_TYPE_DEFAULT, +	.list = f2fs_xattr_list_acl, +	.get = f2fs_xattr_get_acl, +	.set = f2fs_xattr_set_acl, +}; + +const struct xattr_handler f2fs_xattr_acl_access_handler = { +	.prefix = POSIX_ACL_XATTR_ACCESS, +	.flags = ACL_TYPE_ACCESS, +	.list = f2fs_xattr_list_acl, +	.get = f2fs_xattr_get_acl, +	.set = f2fs_xattr_set_acl, +}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h new file mode 100644 index 00000000000..80f43067441 --- /dev/null +++ b/fs/f2fs/acl.h @@ -0,0 +1,57 @@ +/* + * fs/f2fs/acl.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.h + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_ACL_H__ +#define __F2FS_ACL_H__ + +#include <linux/posix_acl_xattr.h> + +#define F2FS_ACL_VERSION	0x0001 + +struct f2fs_acl_entry { +	__le16 e_tag; +	__le16 e_perm; +	__le32 e_id; +}; + +struct f2fs_acl_entry_short { +	__le16 e_tag; +	__le16 e_perm; +}; + +struct f2fs_acl_header { +	__le32 a_version; +}; + +#ifdef CONFIG_F2FS_FS_POSIX_ACL + +extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type); +extern int f2fs_acl_chmod(struct inode *inode); +extern int f2fs_init_acl(struct inode *inode, struct inode *dir); +#else +#define f2fs_check_acl	NULL +#define f2fs_get_acl	NULL +#define f2fs_set_acl	NULL + +static inline int f2fs_acl_chmod(struct inode *inode) +{ +	return 0; +} + +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir) +{ +	return 0; +} +#endif +#endif /* __F2FS_ACL_H__ */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c new file mode 100644 index 00000000000..2b6fc131e2c --- /dev/null +++ b/fs/f2fs/checkpoint.c @@ -0,0 +1,784 @@ +/* + * fs/f2fs/checkpoint.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/f2fs_fs.h> +#include <linux/pagevec.h> +#include <linux/swap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +static struct kmem_cache *orphan_entry_slab; +static struct kmem_cache *inode_entry_slab; + +/* + * We guarantee no failure on the returned page. + */ +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ +	struct address_space *mapping = sbi->meta_inode->i_mapping; +	struct page *page = NULL; +repeat: +	page = grab_cache_page(mapping, index); +	if (!page) { +		cond_resched(); +		goto repeat; +	} + +	/* We wait writeback only inside grab_meta_page() */ +	wait_on_page_writeback(page); +	SetPageUptodate(page); +	return page; +} + +/* + * We guarantee no failure on the returned page. 
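+ * (Allocation and read failures are treated as transient: the helper
+ * loops back to the repeat label instead of returning an error to its
+ * callers.)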
+ */ +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ +	struct address_space *mapping = sbi->meta_inode->i_mapping; +	struct page *page; +repeat: +	page = grab_cache_page(mapping, index); +	if (!page) { +		cond_resched(); +		goto repeat; +	} +	if (f2fs_readpage(sbi, page, index, READ_SYNC)) { +		f2fs_put_page(page, 1); +		goto repeat; +	} +	mark_page_accessed(page); + +	/* We do not allow returning an erroneous page */ +	return page; +} + +static int f2fs_write_meta_page(struct page *page, +				struct writeback_control *wbc) +{ +	struct inode *inode = page->mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + +	/* Should not write any meta pages, if any IO error occurred */ +	if (wbc->for_reclaim || +			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { +		dec_page_count(sbi, F2FS_DIRTY_META); +		wbc->pages_skipped++; +		set_page_dirty(page); +		return AOP_WRITEPAGE_ACTIVATE; +	} + +	wait_on_page_writeback(page); + +	write_meta_page(sbi, page); +	dec_page_count(sbi, F2FS_DIRTY_META); +	unlock_page(page); +	return 0; +} + +static int f2fs_write_meta_pages(struct address_space *mapping, +				struct writeback_control *wbc) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); +	struct block_device *bdev = sbi->sb->s_bdev; +	long written; + +	if (wbc->for_kupdate) +		return 0; + +	if (get_pages(sbi, F2FS_DIRTY_META) == 0) +		return 0; + +	/* if mounting failed, skip writing node pages */ +	mutex_lock(&sbi->cp_mutex); +	written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); +	mutex_unlock(&sbi->cp_mutex); +	wbc->nr_to_write -= written; +	return 0; +} + +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +						long nr_to_write) +{ +	struct address_space *mapping = sbi->meta_inode->i_mapping; +	pgoff_t index = 0, end = LONG_MAX; +	struct pagevec pvec; +	long nwritten = 0; +	struct writeback_control wbc = { +		.for_reclaim = 0, +	}; + +	pagevec_init(&pvec, 0); + +	while (index <= end) { +		int i, nr_pages; +		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, +				PAGECACHE_TAG_DIRTY, +				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); +		if (nr_pages == 0) +			break; + +		for (i = 0; i < nr_pages; i++) { +			struct page *page = pvec.pages[i]; +			lock_page(page); +			BUG_ON(page->mapping != mapping); +			BUG_ON(!PageDirty(page)); +			clear_page_dirty_for_io(page); +			if (f2fs_write_meta_page(page, &wbc)) { +				unlock_page(page); +				break; +			} +			if (nwritten++ >= nr_to_write) +				break; +		} +		pagevec_release(&pvec); +		cond_resched(); +	} + +	if (nwritten) +		f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + +	return nwritten; +} + +static int f2fs_set_meta_page_dirty(struct page *page) +{ +	struct address_space *mapping = page->mapping; +	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + +	SetPageUptodate(page); +	if (!PageDirty(page)) { +		__set_page_dirty_nobuffers(page); +		inc_page_count(sbi, F2FS_DIRTY_META); +		return 1; +	} +	return 0; +} + +const struct address_space_operations f2fs_meta_aops = { +	.writepage	= f2fs_write_meta_page, +	.writepages	= f2fs_write_meta_pages, +	.set_page_dirty	= f2fs_set_meta_page_dirty, +}; + +int check_orphan_space(struct f2fs_sb_info *sbi) +{ +	unsigned int max_orphans; +	int err = 0; + +	/* +	 * considering 512 blocks in a segment, 5 blocks are needed for cp +	 * and log segment summaries.
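+	 * (512 - 5 = 507 blocks remain.)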
+	 * Remaining blocks are used to keep orphan entries. With the
+	 * limitation of one reserved segment for the cp pack, we can
+	 * have at most 1020*507 orphan entries.
+	 */
+	max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
+	mutex_lock(&sbi->orphan_inode_mutex);
+	if (sbi->n_orphans >= max_orphans)
+		err = -ENOSPC;
+	mutex_unlock(&sbi->orphan_inode_mutex);
+	return err;
+}
+
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *head, *this;
+	struct orphan_inode_entry *new = NULL, *orphan = NULL;
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+	list_for_each(this, head) {
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+		if (orphan->ino == ino)
+			goto out;
+		if (orphan->ino > ino)
+			break;
+		orphan = NULL;
+	}
+retry:
+	new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->ino = ino;
+
+	/* add the new entry into the list, which is sorted by inode number */
+	if (orphan)
+		list_add(&new->list, this->prev);
+	else
+		list_add_tail(&new->list, head);
+
+	sbi->n_orphans++;
+out:
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *this, *next, *head;
+	struct orphan_inode_entry *orphan;
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+	list_for_each_safe(this, next, head) {
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+		if (orphan->ino == ino) {
+			list_del(&orphan->list);
+			kmem_cache_free(orphan_entry_slab, orphan);
+			sbi->n_orphans--;
+			break;
+		}
+	}
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct inode *inode = f2fs_iget(sbi->sb, ino);
+	BUG_ON(IS_ERR(inode));
+	clear_nlink(inode);
+
+	/* truncate all the data during iput */
+	iput(inode);
+}
+
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+	block_t start_blk, orphan_blkaddr, i, j;
+
+	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+		return 0;
+
+	sbi->por_doing = 1;
+	start_blk = __start_cp_addr(sbi) + 1;
+	orphan_blkaddr = __start_sum_addr(sbi) - 1;
+
+	for (i = 0; i < orphan_blkaddr; i++) {
+		struct page *page = get_meta_page(sbi, start_blk + i);
+		struct f2fs_orphan_block *orphan_blk;
+
+		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
+			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+			recover_orphan_inode(sbi, ino);
+		}
+		f2fs_put_page(page, 1);
+	}
+	/* clear Orphan Flag */
+	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+	sbi->por_doing = 0;
+	return 0;
+}
+
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	struct list_head *head, *this, *next;
+	struct f2fs_orphan_block *orphan_blk = NULL;
+	struct page *page = NULL;
+	unsigned int nentries = 0;
+	unsigned short index = 1;
+	unsigned short orphan_blocks;
+
+	orphan_blocks = (unsigned short)((sbi->n_orphans +
+		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+
+	/* loop for each orphan inode entry and write them in journal block */
+	list_for_each_safe(this, next, head) {
+		struct orphan_inode_entry *orphan;
+
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+
+		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+			/*
+			 * once an orphan block is full of 1020 entries,
+			 * we need to flush current orphan blocks
+			 * and bring another one in memory
+			 */
+			orphan_blk->blk_addr = cpu_to_le16(index);
+			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+			orphan_blk->entry_count = cpu_to_le32(nentries);
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+			index++;
+			start_blk++;
+			nentries = 0;
+			page = NULL;
+		}
+		if (page)
+			goto page_exist;
+
+		page = grab_meta_page(sbi, start_blk);
+		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+		memset(orphan_blk, 0, sizeof(*orphan_blk));
+page_exist:
+		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+	}
+	if (!page)
+		goto end;
+
+	orphan_blk->blk_addr = cpu_to_le16(index);
+	orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+	orphan_blk->entry_count = cpu_to_le32(nentries);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+end:
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+				block_t cp_addr, unsigned long long *version)
+{
+	struct page *cp_page_1, *cp_page_2 = NULL;
+	unsigned long blk_size = sbi->blocksize;
+	struct f2fs_checkpoint *cp_block;
+	unsigned long long cur_version = 0, pre_version = 0;
+	unsigned int crc = 0;
+	size_t crc_offset;
+
+	/* Read the 1st cp block in this CP pack */
+	cp_page_1 = get_meta_page(sbi, cp_addr);
+
+	/* get the version number */
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp1;
+
+	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp1;
+
+	pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+	/* Read the 2nd cp block in this CP pack */
+	cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+	cp_page_2 = get_meta_page(sbi, cp_addr);
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp2;
+
+	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp2;
+
+	cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+	if (cur_version == pre_version) {
+		*version = cur_version;
+		f2fs_put_page(cp_page_2, 1);
+		return cp_page_1;
+	}
+invalid_cp2:
+	f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+	f2fs_put_page(cp_page_1, 1);
+	return NULL;
+}
+
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *cp_block;
+	struct f2fs_super_block *fsb = sbi->raw_super;
+	struct page *cp1, *cp2, *cur_page;
+	unsigned long blk_size = sbi->blocksize;
+	unsigned long long cp1_version = 0, cp2_version = 0;
+	unsigned long long cp_start_blk_no;
+
+	sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+	if (!sbi->ckpt)
+		return -ENOMEM;
+	/*
+	 * Finding the valid cp block involves reading both
+	 * sets (cp pack 1 and cp pack 2)
+	 */
+	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+
+	/* The second checkpoint pack should start at the next segment */
+	cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+
+	if (cp1 && cp2) {
+		if (ver_after(cp2_version, cp1_version))
+			cur_page = cp2;
+		else
+			cur_page = cp1;
+	} else if (cp1) {
+		cur_page = cp1;
+	} else if (cp2) {
+		cur_page = cp2;
+	} else {
+		goto fail_no_cp;
+	}
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+	memcpy(sbi->ckpt, cp_block, blk_size);
+
+	f2fs_put_page(cp1, 1);
+	f2fs_put_page(cp2, 1);
+	return 0;
+
+fail_no_cp:
+	kfree(sbi->ckpt);
+	return -EINVAL;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct list_head *head = &sbi->dir_inode_list;
+	struct dir_inode_entry *new;
+	struct list_head *this;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+retry:
+	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode) {
+			kmem_cache_free(inode_entry_slab, new);
+			goto out;
+		}
+	}
+	list_add_tail(&new->list, head);
+	sbi->n_dirty_dirs++;
+
+	BUG_ON(!S_ISDIR(inode->i_mode));
+out:
+	inc_page_count(sbi, F2FS_DIRTY_DENTS);
+	inode_inc_dirty_dents(inode);
+	SetPagePrivate(page);
+
+	spin_unlock(&sbi->dir_inode_lock);
+}
+
+void remove_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *this;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (atomic_read(&F2FS_I(inode)->dirty_dents))
+		goto out;
+
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode) {
+			list_del(&entry->list);
+			kmem_cache_free(inode_entry_slab, entry);
+			sbi->n_dirty_dirs--;
+			break;
+		}
+	}
+out:
+	spin_unlock(&sbi->dir_inode_lock);
+}
+
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &sbi->dir_inode_list;
+	struct dir_inode_entry *entry;
+	struct inode *inode;
+retry:
+	spin_lock(&sbi->dir_inode_lock);
+	if (list_empty(head)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
+	entry = list_entry(head->next, struct dir_inode_entry, list);
+	inode = igrab(entry->inode);
+	spin_unlock(&sbi->dir_inode_lock);
+	if (inode) {
+		filemap_flush(inode->i_mapping);
+		iput(inode);
+	} else {
+		/*
+		 * We should submit the bio, since there exist several
+		 * dentry pages under writeback in the freeing inode.
+		 */
+		f2fs_submit_bio(sbi, DATA, true);
+	}
+	goto retry;
+}
+
+/*
+ * Freeze all the FS-operations for checkpoint.
+ */
+static void block_operations(struct f2fs_sb_info *sbi)
+{
+	int t;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+
+	/* Stop renaming operation */
+	mutex_lock_op(sbi, RENAME);
+	mutex_lock_op(sbi, DENTRY_OPS);
+
+retry_dents:
+	/* write all the dirty dentry pages */
+	sync_dirty_dir_inodes(sbi);
+
+	mutex_lock_op(sbi, DATA_WRITE);
+	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+		mutex_unlock_op(sbi, DATA_WRITE);
+		goto retry_dents;
+	}
+
+	/* block all the operations */
+	for (t = DATA_NEW; t <= NODE_TRUNC; t++)
+		mutex_lock_op(sbi, t);
+
+	mutex_lock(&sbi->write_inode);
+
+	/*
+	 * POR: we should ensure that there are no dirty node pages
+	 * until finishing nat/sit flush.
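+	 *
+	 * The retry loop below realizes this guarantee: flush with
+	 * sync_node_pages(), take the NODE_WRITE lock, and if dirty
+	 * node pages have appeared in the meantime, drop the lock and
+	 * flush again; in sketch form:
+	 *
+	 *	sync_node_pages(sbi, 0, &wbc);
+	 *	mutex_lock_op(sbi, NODE_WRITE);
+	 *	if (get_pages(sbi, F2FS_DIRTY_NODES))
+	 *		unlock and retry;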
+	 */ +retry: +	sync_node_pages(sbi, 0, &wbc); + +	mutex_lock_op(sbi, NODE_WRITE); + +	if (get_pages(sbi, F2FS_DIRTY_NODES)) { +		mutex_unlock_op(sbi, NODE_WRITE); +		goto retry; +	} +	mutex_unlock(&sbi->write_inode); +} + +static void unblock_operations(struct f2fs_sb_info *sbi) +{ +	int t; +	for (t = NODE_WRITE; t >= RENAME; t--) +		mutex_unlock_op(sbi, t); +} + +static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +{ +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	nid_t last_nid = 0; +	block_t start_blk; +	struct page *cp_page; +	unsigned int data_sum_blocks, orphan_blocks; +	unsigned int crc32 = 0; +	void *kaddr; +	int i; + +	/* Flush all the NAT/SIT pages */ +	while (get_pages(sbi, F2FS_DIRTY_META)) +		sync_meta_pages(sbi, META, LONG_MAX); + +	next_free_nid(sbi, &last_nid); + +	/* +	 * modify checkpoint +	 * version number is already updated +	 */ +	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); +	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); +	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); +	for (i = 0; i < 3; i++) { +		ckpt->cur_node_segno[i] = +			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); +		ckpt->cur_node_blkoff[i] = +			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); +		ckpt->alloc_type[i + CURSEG_HOT_NODE] = +				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); +	} +	for (i = 0; i < 3; i++) { +		ckpt->cur_data_segno[i] = +			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); +		ckpt->cur_data_blkoff[i] = +			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); +		ckpt->alloc_type[i + CURSEG_HOT_DATA] = +				curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); +	} + +	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); +	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); +	ckpt->next_free_nid = cpu_to_le32(last_nid); + +	/* 2 cp  + n data seg summary + orphan inode blocks */ +	data_sum_blocks = npages_for_summary_flush(sbi); +	if (data_sum_blocks < 3) +		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); +	else +		clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + +	orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) +					/ F2FS_ORPHANS_PER_BLOCK; +	ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + +	if (is_umount) { +		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); +		ckpt->cp_pack_total_block_count = cpu_to_le32(2 + +			data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); +	} else { +		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); +		ckpt->cp_pack_total_block_count = cpu_to_le32(2 + +			data_sum_blocks + orphan_blocks); +	} + +	if (sbi->n_orphans) +		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); +	else +		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + +	/* update SIT/NAT bitmap */ +	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); +	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); + +	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); +	*(__le32 *)((unsigned char *)ckpt + +				le32_to_cpu(ckpt->checksum_offset)) +				= cpu_to_le32(crc32); + +	start_blk = __start_cp_addr(sbi); + +	/* write out checkpoint buffer at block 0 */ +	cp_page = grab_meta_page(sbi, start_blk++); +	kaddr = page_address(cp_page); +	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); +	set_page_dirty(cp_page); +	f2fs_put_page(cp_page, 1); + +	if (sbi->n_orphans) { +		write_orphan_inodes(sbi, start_blk); +		start_blk += orphan_blocks; +	} + +	write_data_summaries(sbi, start_blk); +	start_blk += data_sum_blocks; +	if (is_umount) { +		write_node_summaries(sbi, start_blk); +		start_blk += NR_CURSEG_NODE_TYPE; +	} + +	/* 
write out the checkpoint block */
+	cp_page = grab_meta_page(sbi, start_blk);
+	kaddr = page_address(cp_page);
+	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	set_page_dirty(cp_page);
+	f2fs_put_page(cp_page, 1);
+
+	/* wait for writeback of previously submitted node/meta pages */
+	while (get_pages(sbi, F2FS_WRITEBACK))
+		congestion_wait(BLK_RW_ASYNC, HZ / 50);
+
+	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
+	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+
+	/* update user_block_counts */
+	sbi->last_valid_block_count = sbi->total_valid_block_count;
+	sbi->alloc_valid_block_count = 0;
+
+	/* Here, we have only one bio containing the CP pack */
+	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+
+	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+		clear_prefree_segments(sbi);
+		F2FS_RESET_SB_DIRT(sbi);
+	}
+}
+
+/*
+ * We guarantee that this checkpoint procedure cannot fail.
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long ckpt_ver;
+
+	mutex_lock(&sbi->cp_mutex);
+	block_operations(sbi);
+
+	f2fs_submit_bio(sbi, DATA, true);
+	f2fs_submit_bio(sbi, NODE, true);
+	f2fs_submit_bio(sbi, META, true);
+
+	/*
+	 * update checkpoint pack index
+	 * Increase the version number so that
+	 * SIT entries and seg summaries are written in the correct place
+	 */
+	ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+
+	/* write cached NAT/SIT entries to NAT/SIT area */
+	flush_nat_entries(sbi);
+	flush_sit_entries(sbi);
+
+	reset_victim_segmap(sbi);
+
+	/* unlock all the fs_lock[] in do_checkpoint() */
+	do_checkpoint(sbi, is_umount);
+
+	unblock_operations(sbi);
+	mutex_unlock(&sbi->cp_mutex);
+}
+
+void init_orphan_info(struct f2fs_sb_info *sbi)
+{
+	mutex_init(&sbi->orphan_inode_mutex);
+	INIT_LIST_HEAD(&sbi->orphan_inode_list);
+	sbi->n_orphans = 0;
+}
+
+int __init create_checkpoint_caches(void)
+{
+	orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
+			sizeof(struct orphan_inode_entry), NULL);
+	if (unlikely(!orphan_entry_slab))
+		return -ENOMEM;
+	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
+			sizeof(struct dir_inode_entry), NULL);
+	if (unlikely(!inode_entry_slab)) {
+		kmem_cache_destroy(orphan_entry_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_checkpoint_caches(void)
+{
+	kmem_cache_destroy(orphan_entry_slab);
+	kmem_cache_destroy(inode_entry_slab);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c new file mode 100644 index 00000000000..7bd22a20112 --- /dev/null +++ b/fs/f2fs/data.c @@ -0,0 +1,718 @@
+/*
+ * fs/f2fs/data.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/prefetch.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+/*
+ * Lock ordering for the change of data block address:
+ * ->data_page
+ *  ->node_page
+ *    update block addresses in the node page
+ */
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+{
+	struct f2fs_node *rn;
+	__le32 *addr_array;
+	struct page *node_page = dn->node_page;
+	unsigned int ofs_in_node = dn->ofs_in_node;
+
+	wait_on_page_writeback(node_page);
+
+	rn = (struct f2fs_node *)page_address(node_page);
+
+	/* Get physical address of data block */
+	addr_array = blkaddr_in_node(rn);
+	addr_array[ofs_in_node] = cpu_to_le32(new_addr);
+	set_page_dirty(node_page);
+}
+
+int reserve_new_block(struct dnode_of_data *dn)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+
+	if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+		return -EPERM;
+	if (!inc_valid_block_count(sbi, dn->inode, 1))
+		return -ENOSPC;
+
+	__set_data_blkaddr(dn, NEW_ADDR);
+	dn->data_blkaddr = NEW_ADDR;
+	sync_inode_page(dn);
+	return 0;
+}
+
+static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
+					struct buffer_head *bh_result)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	pgoff_t start_fofs, end_fofs;
+	block_t start_blkaddr;
+
+	read_lock(&fi->ext.ext_lock);
+	if (fi->ext.len == 0) {
+		read_unlock(&fi->ext.ext_lock);
+		return 0;
+	}
+
+	sbi->total_hit_ext++;
+	start_fofs = fi->ext.fofs;
+	end_fofs = fi->ext.fofs + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk_addr;
+
+	if (pgofs >= start_fofs && pgofs <= end_fofs) {
+		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+		size_t count;
+
+		clear_buffer_new(bh_result);
+		map_bh(bh_result, inode->i_sb,
+				start_blkaddr + pgofs - start_fofs);
+		count = end_fofs - pgofs + 1;
+		if (count < (UINT_MAX >> blkbits))
+			bh_result->b_size = (count << blkbits);
+		else
+			bh_result->b_size = UINT_MAX;
+
+		sbi->read_hit_ext++;
+		read_unlock(&fi->ext.ext_lock);
+		return 1;
+	}
+	read_unlock(&fi->ext.ext_lock);
+	return 0;
+}
+
+void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs, start_fofs, end_fofs;
+	block_t start_blkaddr, end_blkaddr;
+
+	BUG_ON(blk_addr == NEW_ADDR);
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
+
+	/* Update the page address in the parent node */
+	__set_data_blkaddr(dn, blk_addr);
+
+	write_lock(&fi->ext.ext_lock);
+
+	start_fofs = fi->ext.fofs;
+	end_fofs = fi->ext.fofs + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk_addr;
+	end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+
+	/* Drop and initialize the matched extent */
+	if (fi->ext.len == 1 && fofs == start_fofs)
+		fi->ext.len = 0;
+
+	/* Initial extent */
+	if (fi->ext.len == 0) {
+		if (blk_addr != NULL_ADDR) {
+			fi->ext.fofs = fofs;
+			fi->ext.blk_addr = blk_addr;
+			fi->ext.len = 1;
+		}
+		goto end_update;
+	}
+
+	/* Front merge */
+	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+		fi->ext.fofs--;
+		fi->ext.blk_addr--;
+		fi->ext.len++;
+		goto end_update;
+	}
+
+	/* Back merge */
+	if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+		fi->ext.len++;
+		goto end_update;
+	}
+
+	/* Split the existing extent */
+	if (fi->ext.len > 1 &&
+		fofs >= start_fofs && fofs <= end_fofs) {
+		if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
+			fi->ext.len = fofs - start_fofs;
+		} else {
+			fi->ext.fofs = fofs + 1;
+			fi->ext.blk_addr = start_blkaddr +
+					fofs - start_fofs + 1;
+			fi->ext.len -= fofs - start_fofs + 1;
+		}
+		goto end_update;
+	}
+	write_unlock(&fi->ext.ext_lock);
+	return;
+
+end_update:
+	write_unlock(&fi->ext.ext_lock);
+	sync_inode_page(dn);
+	return;
+}
+
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct address_space *mapping = inode->i_mapping;
+	struct dnode_of_data dn;
+	struct page *page;
+	int err;
+
+	page = find_get_page(mapping, index);
+	if (page && PageUptodate(page))
+		return page;
+	f2fs_put_page(page, 0);
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+	if (err)
+		return ERR_PTR(err);
+	f2fs_put_dnode(&dn);
+
+	if (dn.data_blkaddr == NULL_ADDR)
+		return ERR_PTR(-ENOENT);
+
+	/* After fallocate(), there is no cached page, but the address is NEW_ADDR */
+	if (dn.data_blkaddr == NEW_ADDR)
+		return ERR_PTR(-EINVAL);
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+	unlock_page(page);
+	return page;
+}
+
+/*
+ * If it tries to access a hole, return an error, because the callers
+ * (functions in dir.c and GC) should be able to know
+ * whether this page exists or not.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct address_space *mapping = inode->i_mapping;
+	struct dnode_of_data dn;
+	struct page *page;
+	int err;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+	if (err)
+		return ERR_PTR(err);
+	f2fs_put_dnode(&dn);
+
+	if (dn.data_blkaddr == NULL_ADDR)
+		return ERR_PTR(-ENOENT);
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	if (PageUptodate(page))
+		return page;
+
+	BUG_ON(dn.data_blkaddr == NEW_ADDR);
+	BUG_ON(dn.data_blkaddr == NULL_ADDR);
+
+	err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+	return page;
+}
+
+/*
+ * Caller ensures that this data page is never allocated.
+ * A new zero-filled data page is allocated in the page cache.
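+ *
+ * A representative caller is f2fs_make_empty() in dir.c, which
+ * materializes block 0 of a new directory:
+ *
+ *	dentry_page = get_new_data_page(inode, 0, true);
+ *	if (IS_ERR(dentry_page))
+ *		return PTR_ERR(dentry_page);
+ *
+ * Passing true for new_i_size extends i_size to cover the new page.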
+ */ +struct page *get_new_data_page(struct inode *inode, pgoff_t index, +						bool new_i_size) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct address_space *mapping = inode->i_mapping; +	struct page *page; +	struct dnode_of_data dn; +	int err; + +	set_new_dnode(&dn, inode, NULL, NULL, 0); +	err = get_dnode_of_data(&dn, index, 0); +	if (err) +		return ERR_PTR(err); + +	if (dn.data_blkaddr == NULL_ADDR) { +		if (reserve_new_block(&dn)) { +			f2fs_put_dnode(&dn); +			return ERR_PTR(-ENOSPC); +		} +	} +	f2fs_put_dnode(&dn); + +	page = grab_cache_page(mapping, index); +	if (!page) +		return ERR_PTR(-ENOMEM); + +	if (PageUptodate(page)) +		return page; + +	if (dn.data_blkaddr == NEW_ADDR) { +		zero_user_segment(page, 0, PAGE_CACHE_SIZE); +	} else { +		err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); +		if (err) { +			f2fs_put_page(page, 1); +			return ERR_PTR(err); +		} +	} +	SetPageUptodate(page); + +	if (new_i_size && +		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { +		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); +		mark_inode_dirty_sync(inode); +	} +	return page; +} + +static void read_end_io(struct bio *bio, int err) +{ +	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + +	do { +		struct page *page = bvec->bv_page; + +		if (--bvec >= bio->bi_io_vec) +			prefetchw(&bvec->bv_page->flags); + +		if (uptodate) { +			SetPageUptodate(page); +		} else { +			ClearPageUptodate(page); +			SetPageError(page); +		} +		unlock_page(page); +	} while (bvec >= bio->bi_io_vec); +	kfree(bio->bi_private); +	bio_put(bio); +} + +/* + * Fill the locked page with data located in the block address. + * Read operation is synchronous, and caller must unlock the page. + */ +int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, +					block_t blk_addr, int type) +{ +	struct block_device *bdev = sbi->sb->s_bdev; +	bool sync = (type == READ_SYNC); +	struct bio *bio; + +	/* This page can be already read by other threads */ +	if (PageUptodate(page)) { +		if (!sync) +			unlock_page(page); +		return 0; +	} + +	down_read(&sbi->bio_sem); + +	/* Allocate a new bio */ +	bio = f2fs_bio_alloc(bdev, 1); + +	/* Initialize the bio */ +	bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); +	bio->bi_end_io = read_end_io; + +	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { +		kfree(bio->bi_private); +		bio_put(bio); +		up_read(&sbi->bio_sem); +		return -EFAULT; +	} + +	submit_bio(type, bio); +	up_read(&sbi->bio_sem); + +	/* wait for read completion if sync */ +	if (sync) { +		lock_page(page); +		if (PageError(page)) +			return -EIO; +	} +	return 0; +} + +/* + * This function should be used by the data read flow only where it + * does not check the "create" flag that indicates block allocation. + * The reason for this special functionality is to exploit VFS readahead + * mechanism. 
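+ *
+ * It is plugged into the generic mpage helpers, as done below:
+ *
+ *	return mpage_readpage(page, get_data_block_ro);
+ *
+ * bh_result->b_size is widened to cover as many consecutive blocks as
+ * the node page provides, so that one bio can serve a whole readahead
+ * window.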
+ */
+static int get_data_block_ro(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+	unsigned maxblocks = bh_result->b_size >> blkbits;
+	struct dnode_of_data dn;
+	pgoff_t pgofs;
+	int err;
+
+	/* Get the page offset from the block offset (iblock) */
+	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+
+	if (check_extent_cache(inode, pgofs, bh_result))
+		return 0;
+
+	/* When reading holes, we need its node page */
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
+	if (err)
+		return (err == -ENOENT) ? 0 : err;
+
+	/* It does not support data allocation */
+	BUG_ON(create);
+
+	if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
+		int i;
+		unsigned int end_offset;
+
+		end_offset = IS_INODE(dn.node_page) ?
+				ADDRS_PER_INODE :
+				ADDRS_PER_BLOCK;
+
+		clear_buffer_new(bh_result);
+
+		/* Give more consecutive addresses for the read ahead */
+		for (i = 0; i < end_offset - dn.ofs_in_node; i++)
+			if (((datablock_addr(dn.node_page,
+							dn.ofs_in_node + i))
+				!= (dn.data_blkaddr + i)) || maxblocks == i)
+				break;
+		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+		bh_result->b_size = (i << blkbits);
+	}
+	f2fs_put_dnode(&dn);
+	return 0;
+}
+
+static int f2fs_read_data_page(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, get_data_block_ro);
+}
+
+static int f2fs_read_data_pages(struct file *file,
+			struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+}
+
+int do_write_data_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	block_t old_blk_addr, new_blk_addr;
+	struct dnode_of_data dn;
+	int err = 0;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+	if (err)
+		return err;
+
+	old_blk_addr = dn.data_blkaddr;
+
+	/* This page is already truncated */
+	if (old_blk_addr == NULL_ADDR)
+		goto out_writepage;
+
+	set_page_writeback(page);
+
+	/*
+	 * If the current allocation needs SSR,
+	 * in-place writes are preferred for updated data.
+	 */
+	if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
+				need_inplace_update(inode)) {
+		rewrite_data_page(F2FS_SB(inode->i_sb), page,
+						old_blk_addr);
+	} else {
+		write_data_page(inode, page, &dn,
+				old_blk_addr, &new_blk_addr);
+		update_extent_cache(new_blk_addr, &dn);
+		F2FS_I(inode)->data_version =
+			le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+	}
+out_writepage:
+	f2fs_put_dnode(&dn);
+	return err;
+}
+
+static int f2fs_write_data_page(struct page *page,
+					struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = ((unsigned long long) i_size)
+							>> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	int err = 0;
+
+	if (page->index < end_index)
+		goto out;
+
+	/*
+	 * If the offset is out of range of the file size,
+	 * this page does not have to be written to disk.
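+	 *
+	 * A worked example, assuming 4KB pages: with i_size == 5000,
+	 * end_index is 1 and offset is 5000 & 4095 == 904, so the page
+	 * at index 1 is written with bytes 904..4095 zeroed, while a
+	 * page at index 2 or beyond is skipped entirely.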
+	 */ +	offset = i_size & (PAGE_CACHE_SIZE - 1); +	if ((page->index >= end_index + 1) || !offset) { +		if (S_ISDIR(inode->i_mode)) { +			dec_page_count(sbi, F2FS_DIRTY_DENTS); +			inode_dec_dirty_dents(inode); +		} +		goto unlock_out; +	} + +	zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: +	if (sbi->por_doing) +		goto redirty_out; + +	if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) +		goto redirty_out; + +	mutex_lock_op(sbi, DATA_WRITE); +	if (S_ISDIR(inode->i_mode)) { +		dec_page_count(sbi, F2FS_DIRTY_DENTS); +		inode_dec_dirty_dents(inode); +	} +	err = do_write_data_page(page); +	if (err && err != -ENOENT) { +		wbc->pages_skipped++; +		set_page_dirty(page); +	} +	mutex_unlock_op(sbi, DATA_WRITE); + +	if (wbc->for_reclaim) +		f2fs_submit_bio(sbi, DATA, true); + +	if (err == -ENOENT) +		goto unlock_out; + +	clear_cold_data(page); +	unlock_page(page); + +	if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) +		f2fs_balance_fs(sbi); +	return 0; + +unlock_out: +	unlock_page(page); +	return (err == -ENOENT) ? 0 : err; + +redirty_out: +	wbc->pages_skipped++; +	set_page_dirty(page); +	return AOP_WRITEPAGE_ACTIVATE; +} + +#define MAX_DESIRED_PAGES_WP	4096 + +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, +			void *data) +{ +	struct address_space *mapping = data; +	int ret = mapping->a_ops->writepage(page, wbc); +	mapping_set_error(mapping, ret); +	return ret; +} + +static int f2fs_write_data_pages(struct address_space *mapping, +			    struct writeback_control *wbc) +{ +	struct inode *inode = mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	int ret; +	long excess_nrtw = 0, desired_nrtw; + +	if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { +		desired_nrtw = MAX_DESIRED_PAGES_WP; +		excess_nrtw = desired_nrtw - wbc->nr_to_write; +		wbc->nr_to_write = desired_nrtw; +	} + +	if (!S_ISDIR(inode->i_mode)) +		mutex_lock(&sbi->writepages); +	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); +	if (!S_ISDIR(inode->i_mode)) +		mutex_unlock(&sbi->writepages); +	f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + +	remove_dirty_dir_inode(inode); + +	wbc->nr_to_write -= excess_nrtw; +	return ret; +} + +static int f2fs_write_begin(struct file *file, struct address_space *mapping, +		loff_t pos, unsigned len, unsigned flags, +		struct page **pagep, void **fsdata) +{ +	struct inode *inode = mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct page *page; +	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; +	struct dnode_of_data dn; +	int err = 0; + +	/* for nobh_write_end */ +	*fsdata = NULL; + +	f2fs_balance_fs(sbi); + +	page = grab_cache_page_write_begin(mapping, index, flags); +	if (!page) +		return -ENOMEM; +	*pagep = page; + +	mutex_lock_op(sbi, DATA_NEW); + +	set_new_dnode(&dn, inode, NULL, NULL, 0); +	err = get_dnode_of_data(&dn, index, 0); +	if (err) { +		mutex_unlock_op(sbi, DATA_NEW); +		f2fs_put_page(page, 1); +		return err; +	} + +	if (dn.data_blkaddr == NULL_ADDR) { +		err = reserve_new_block(&dn); +		if (err) { +			f2fs_put_dnode(&dn); +			mutex_unlock_op(sbi, DATA_NEW); +			f2fs_put_page(page, 1); +			return err; +		} +	} +	f2fs_put_dnode(&dn); + +	mutex_unlock_op(sbi, DATA_NEW); + +	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) +		return 0; + +	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { +		unsigned start = pos & (PAGE_CACHE_SIZE - 1); +		unsigned end = start + len; + +		/* Reading beyond i_size is simple: memset to zero */ +		
zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); +		return 0; +	} + +	if (dn.data_blkaddr == NEW_ADDR) { +		zero_user_segment(page, 0, PAGE_CACHE_SIZE); +	} else { +		err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); +		if (err) { +			f2fs_put_page(page, 1); +			return err; +		} +	} +	SetPageUptodate(page); +	clear_cold_data(page); +	return 0; +} + +static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, +		const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ +	struct file *file = iocb->ki_filp; +	struct inode *inode = file->f_mapping->host; + +	if (rw == WRITE) +		return 0; + +	/* Needs synchronization with the cleaner */ +	return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, +						  get_data_block_ro); +} + +static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) +{ +	struct inode *inode = page->mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	if (S_ISDIR(inode->i_mode) && PageDirty(page)) { +		dec_page_count(sbi, F2FS_DIRTY_DENTS); +		inode_dec_dirty_dents(inode); +	} +	ClearPagePrivate(page); +} + +static int f2fs_release_data_page(struct page *page, gfp_t wait) +{ +	ClearPagePrivate(page); +	return 0; +} + +static int f2fs_set_data_page_dirty(struct page *page) +{ +	struct address_space *mapping = page->mapping; +	struct inode *inode = mapping->host; + +	SetPageUptodate(page); +	if (!PageDirty(page)) { +		__set_page_dirty_nobuffers(page); +		set_dirty_dir_page(inode, page); +		return 1; +	} +	return 0; +} + +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ +	return generic_block_bmap(mapping, block, get_data_block_ro); +} + +const struct address_space_operations f2fs_dblock_aops = { +	.readpage	= f2fs_read_data_page, +	.readpages	= f2fs_read_data_pages, +	.writepage	= f2fs_write_data_page, +	.writepages	= f2fs_write_data_pages, +	.write_begin	= f2fs_write_begin, +	.write_end	= nobh_write_end, +	.set_page_dirty	= f2fs_set_data_page_dirty, +	.invalidatepage	= f2fs_invalidate_data_page, +	.releasepage	= f2fs_release_data_page, +	.direct_IO	= f2fs_direct_IO, +	.bmap		= f2fs_bmap, +}; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c new file mode 100644 index 00000000000..025b9e2f935 --- /dev/null +++ b/fs/f2fs/debug.c @@ -0,0 +1,355 @@ +/* + * f2fs debugging statistics + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * Copyright (c) 2012 Linux Foundation + * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+static LIST_HEAD(f2fs_stat_list);
+static struct dentry *debugfs_root;
+static DEFINE_MUTEX(f2fs_stat_mutex);
+
+static void update_general_status(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = sbi->stat_info;
+	int i;
+
+	/* validity check of the segment numbers */
+	si->hit_ext = sbi->read_hit_ext;
+	si->total_ext = sbi->total_hit_ext;
+	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
+	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
+	si->ndirty_dirs = sbi->n_dirty_dirs;
+	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+	si->rsvd_segs = reserved_segments(sbi);
+	si->overp_segs = overprovision_segments(sbi);
+	si->valid_count = valid_user_blocks(sbi);
+	si->valid_node_count = valid_node_count(sbi);
+	si->valid_inode_count = valid_inode_count(sbi);
+	si->utilization = utilization(sbi);
+
+	si->free_segs = free_segments(sbi);
+	si->free_secs = free_sections(sbi);
+	si->prefree_count = prefree_segments(sbi);
+	si->dirty_count = dirty_segments(sbi);
+	si->node_pages = sbi->node_inode->i_mapping->nrpages;
+	si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
+	si->nats = NM_I(sbi)->nat_cnt;
+	si->sits = SIT_I(sbi)->dirty_sentries;
+	si->fnids = NM_I(sbi)->fcnt;
+	si->bg_gc = sbi->bg_gc;
+	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+		/ 2;
+	si->util_valid = (int)(written_block_count(sbi) >>
+						sbi->log_blocks_per_seg)
+		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+		/ 2;
+	si->util_invalid = 50 - si->util_free - si->util_valid;
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+		struct curseg_info *curseg = CURSEG_I(sbi, i);
+		si->curseg[i] = curseg->segno;
+		si->cursec[i] = curseg->segno / sbi->segs_per_sec;
+		si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+	}
+
+	for (i = 0; i < 2; i++) {
+		si->segment_count[i] = sbi->segment_count[i];
+		si->block_count[i] = sbi->block_count[i];
+	}
+}
+
+/*
+ * This function calculates the BDF of every segment
+ */
+static void update_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = sbi->stat_info;
+	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int segno, vblocks;
+	int ndirty = 0;
+
+	bimodal = 0;
+	total_vblocks = 0;
+	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+	hblks_per_sec = blks_per_sec / 2;
+	mutex_lock(&sit_i->sentry_lock);
+	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+		dist = abs(vblocks - hblks_per_sec);
+		bimodal += dist * dist;
+
+		if (vblocks > 0 && vblocks < blks_per_sec) {
+			total_vblocks += vblocks;
+			ndirty++;
+		}
+	}
+	mutex_unlock(&sit_i->sentry_lock);
+	dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+	si->bimodal = bimodal / dist;
+	if (si->dirty_count)
+		si->avg_vblocks = total_vblocks / ndirty;
+	else
+		si->avg_vblocks = 0;
+}
+
+/*
+ * This function calculates the memory footprint.
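+ *
+ * base_mem (structures sized once at mount: sbi, SIT, free segmap,
+ * cursegs, dirty segmap, NM, GC) is computed only on the first call;
+ * cache_mem (free nids, NAT entries, cached node/meta pages, orphan
+ * and dirty-dir entries) is recomputed on every call.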
+ */
+static void update_mem_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = sbi->stat_info;
+	unsigned npages;
+
+	if (si->base_mem)
+		goto get_cache;
+
+	si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+	si->base_mem += 2 * sizeof(struct f2fs_inode_info);
+	si->base_mem += sizeof(*sbi->ckpt);
+
+	/* build sm */
+	si->base_mem += sizeof(struct f2fs_sm_info);
+
+	/* build sit */
+	si->base_mem += sizeof(struct sit_info);
+	si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
+	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+	if (sbi->segs_per_sec > 1)
+		si->base_mem += sbi->total_sections *
+			sizeof(struct sec_entry);
+	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
+
+	/* build free segmap */
+	si->base_mem += sizeof(struct free_segmap_info);
+	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+
+	/* build curseg */
+	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
+	si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+
+	/* build dirty segmap */
+	si->base_mem += sizeof(struct dirty_seglist_info);
+	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+	/* build nm */
+	si->base_mem += sizeof(struct f2fs_nm_info);
+	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+
+	/* build gc */
+	si->base_mem += sizeof(struct f2fs_gc_kthread);
+
+get_cache:
+	/* free nids */
+	si->cache_mem = NM_I(sbi)->fcnt;
+	si->cache_mem += NM_I(sbi)->nat_cnt;
+	npages = sbi->node_inode->i_mapping->nrpages;
+	si->cache_mem += npages << PAGE_CACHE_SHIFT;
+	npages = sbi->meta_inode->i_mapping->nrpages;
+	si->cache_mem += npages << PAGE_CACHE_SHIFT;
+	si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+}
+
+static int stat_show(struct seq_file *s, void *v)
+{
+	struct f2fs_stat_info *si, *next;
+	int i = 0;
+	int j;
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+		char devname[BDEVNAME_SIZE];
+
+		update_general_status(si->sbi);
+
+		seq_printf(s, "\n=====[ partition info(%s). 
#%d ]=====\n", +			bdevname(si->sbi->sb->s_bdev, devname), i++); +		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", +			   si->sit_area_segs, si->nat_area_segs); +		seq_printf(s, "[SSA: %d] [MAIN: %d", +			   si->ssa_area_segs, si->main_area_segs); +		seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", +			   si->overp_segs, si->rsvd_segs); +		seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", +			   si->utilization, si->valid_count); +		seq_printf(s, "  - Node: %u (Inode: %u, ", +			   si->valid_node_count, si->valid_inode_count); +		seq_printf(s, "Other: %u)\n  - Data: %u\n", +			   si->valid_node_count - si->valid_inode_count, +			   si->valid_count - si->valid_node_count); +		seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", +			   si->main_area_segs, si->main_area_sections, +			   si->main_area_zones); +		seq_printf(s, "  - COLD  data: %d, %d, %d\n", +			   si->curseg[CURSEG_COLD_DATA], +			   si->cursec[CURSEG_COLD_DATA], +			   si->curzone[CURSEG_COLD_DATA]); +		seq_printf(s, "  - WARM  data: %d, %d, %d\n", +			   si->curseg[CURSEG_WARM_DATA], +			   si->cursec[CURSEG_WARM_DATA], +			   si->curzone[CURSEG_WARM_DATA]); +		seq_printf(s, "  - HOT   data: %d, %d, %d\n", +			   si->curseg[CURSEG_HOT_DATA], +			   si->cursec[CURSEG_HOT_DATA], +			   si->curzone[CURSEG_HOT_DATA]); +		seq_printf(s, "  - Dir   dnode: %d, %d, %d\n", +			   si->curseg[CURSEG_HOT_NODE], +			   si->cursec[CURSEG_HOT_NODE], +			   si->curzone[CURSEG_HOT_NODE]); +		seq_printf(s, "  - File   dnode: %d, %d, %d\n", +			   si->curseg[CURSEG_WARM_NODE], +			   si->cursec[CURSEG_WARM_NODE], +			   si->curzone[CURSEG_WARM_NODE]); +		seq_printf(s, "  - Indir nodes: %d, %d, %d\n", +			   si->curseg[CURSEG_COLD_NODE], +			   si->cursec[CURSEG_COLD_NODE], +			   si->curzone[CURSEG_COLD_NODE]); +		seq_printf(s, "\n  - Valid: %d\n  - Dirty: %d\n", +			   si->main_area_segs - si->dirty_count - +			   si->prefree_count - si->free_segs, +			   si->dirty_count); +		seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n", +			   si->prefree_count, si->free_segs, si->free_secs); +		seq_printf(s, "GC calls: %d (BG: %d)\n", +			   si->call_count, si->bg_gc); +		seq_printf(s, "  - data segments : %d\n", si->data_segs); +		seq_printf(s, "  - node segments : %d\n", si->node_segs); +		seq_printf(s, "Try to move %d blocks\n", si->tot_blks); +		seq_printf(s, "  - data blocks : %d\n", si->data_blks); +		seq_printf(s, "  - node blocks : %d\n", si->node_blks); +		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", +			   si->hit_ext, si->total_ext); +		seq_printf(s, "\nBalancing F2FS Async:\n"); +		seq_printf(s, "  - nodes %4d in %4d\n", +			   si->ndirty_node, si->node_pages); +		seq_printf(s, "  - dents %4d in dirs:%4d\n", +			   si->ndirty_dent, si->ndirty_dirs); +		seq_printf(s, "  - meta %4d in %4d\n", +			   si->ndirty_meta, si->meta_pages); +		seq_printf(s, "  - NATs %5d > %lu\n", +			   si->nats, NM_WOUT_THRESHOLD); +		seq_printf(s, "  - SITs: %5d\n  - free_nids: %5d\n", +			   si->sits, si->fnids); +		seq_printf(s, "\nDistribution of User Blocks:"); +		seq_printf(s, " [ valid | invalid | free ]\n"); +		seq_printf(s, "  ["); + +		for (j = 0; j < si->util_valid; j++) +			seq_printf(s, "-"); +		seq_printf(s, "|"); + +		for (j = 0; j < si->util_invalid; j++) +			seq_printf(s, "-"); +		seq_printf(s, "|"); + +		for (j = 0; j < si->util_free; j++) +			seq_printf(s, "-"); +		seq_printf(s, "]\n\n"); +		seq_printf(s, "SSR: %u blocks in %u segments\n", +			   si->block_count[SSR], si->segment_count[SSR]); +		seq_printf(s, "LFS: %u blocks in 
%u segments\n", +			   si->block_count[LFS], si->segment_count[LFS]); + +		/* segment usage info */ +		update_sit_info(si->sbi); +		seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", +			   si->bimodal, si->avg_vblocks); + +		/* memory footprint */ +		update_mem_info(si->sbi); +		seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", +				(si->base_mem + si->cache_mem) >> 10, +				si->base_mem >> 10, si->cache_mem >> 10); +	} +	mutex_unlock(&f2fs_stat_mutex); +	return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ +	return single_open(file, stat_show, inode->i_private); +} + +static const struct file_operations stat_fops = { +	.open = stat_open, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release, +}; + +int f2fs_build_stats(struct f2fs_sb_info *sbi) +{ +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); +	struct f2fs_stat_info *si; + +	sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); +	if (!sbi->stat_info) +		return -ENOMEM; + +	si = sbi->stat_info; +	si->all_area_segs = le32_to_cpu(raw_super->segment_count); +	si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); +	si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); +	si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa); +	si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); +	si->main_area_sections = le32_to_cpu(raw_super->section_count); +	si->main_area_zones = si->main_area_sections / +				le32_to_cpu(raw_super->secs_per_zone); +	si->sbi = sbi; + +	mutex_lock(&f2fs_stat_mutex); +	list_add_tail(&si->stat_list, &f2fs_stat_list); +	mutex_unlock(&f2fs_stat_mutex); + +	return 0; +} + +void f2fs_destroy_stats(struct f2fs_sb_info *sbi) +{ +	struct f2fs_stat_info *si = sbi->stat_info; + +	mutex_lock(&f2fs_stat_mutex); +	list_del(&si->stat_list); +	mutex_unlock(&f2fs_stat_mutex); + +	kfree(sbi->stat_info); +} + +void __init f2fs_create_root_stats(void) +{ +	debugfs_root = debugfs_create_dir("f2fs", NULL); +	if (debugfs_root) +		debugfs_create_file("status", S_IRUGO, debugfs_root, +					 NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) +{ +	debugfs_remove_recursive(debugfs_root); +	debugfs_root = NULL; +} diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c new file mode 100644 index 00000000000..a1f38443ece --- /dev/null +++ b/fs/f2fs/dir.c @@ -0,0 +1,671 @@ +/* + * fs/f2fs/dir.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "node.h" +#include "acl.h" + +static unsigned long dir_blocks(struct inode *inode) +{ +	return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) +							>> PAGE_CACHE_SHIFT; +} + +static unsigned int dir_buckets(unsigned int level) +{ +	if (level < MAX_DIR_HASH_DEPTH / 2) +		return 1 << level; +	else +		return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); +} + +static unsigned int bucket_blocks(unsigned int level) +{ +	if (level < MAX_DIR_HASH_DEPTH / 2) +		return 2; +	else +		return 4; +} + +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +	[F2FS_FT_UNKNOWN]	= DT_UNKNOWN, +	[F2FS_FT_REG_FILE]	= DT_REG, +	[F2FS_FT_DIR]		= DT_DIR, +	[F2FS_FT_CHRDEV]	= DT_CHR, +	[F2FS_FT_BLKDEV]	= DT_BLK, +	[F2FS_FT_FIFO]		= DT_FIFO, +	[F2FS_FT_SOCK]		= DT_SOCK, +	[F2FS_FT_SYMLINK]	= DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { +	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE, +	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR, +	[S_IFCHR >> S_SHIFT]	= F2FS_FT_CHRDEV, +	[S_IFBLK >> S_SHIFT]	= F2FS_FT_BLKDEV, +	[S_IFIFO >> S_SHIFT]	= F2FS_FT_FIFO, +	[S_IFSOCK >> S_SHIFT]	= F2FS_FT_SOCK, +	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK, +}; + +static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) +{ +	mode_t mode = inode->i_mode; +	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static unsigned long dir_block_index(unsigned int level, unsigned int idx) +{ +	unsigned long i; +	unsigned long bidx = 0; + +	for (i = 0; i < level; i++) +		bidx += dir_buckets(i) * bucket_blocks(i); +	bidx += idx * bucket_blocks(level); +	return bidx; +} + +static bool early_match_name(const char *name, size_t namelen, +			f2fs_hash_t namehash, struct f2fs_dir_entry *de) +{ +	if (le16_to_cpu(de->name_len) != namelen) +		return false; + +	if (de->hash_code != namehash) +		return false; + +	return true; +} + +static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, +			const char *name, size_t namelen, int *max_slots, +			f2fs_hash_t namehash, struct page **res_page) +{ +	struct f2fs_dir_entry *de; +	unsigned long bit_pos, end_pos, next_pos; +	struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); +	int slots; + +	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, +					NR_DENTRY_IN_BLOCK, 0); +	while (bit_pos < NR_DENTRY_IN_BLOCK) { +		de = &dentry_blk->dentry[bit_pos]; +		slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + +		if (early_match_name(name, namelen, namehash, de)) { +			if (!memcmp(dentry_blk->filename[bit_pos], +							name, namelen)) { +				*res_page = dentry_page; +				goto found; +			} +		} +		next_pos = bit_pos + slots; +		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, +				NR_DENTRY_IN_BLOCK, next_pos); +		if (bit_pos >= NR_DENTRY_IN_BLOCK) +			end_pos = NR_DENTRY_IN_BLOCK; +		else +			end_pos = bit_pos; +		if (*max_slots < end_pos - next_pos) +			*max_slots = end_pos - next_pos; +	} + +	de = NULL; +	kunmap(dentry_page); +found: +	return de; +} + +static struct f2fs_dir_entry *find_in_level(struct inode *dir, +		unsigned int level, const char *name, size_t namelen, +			f2fs_hash_t namehash, struct page **res_page) +{ +	int s = GET_DENTRY_SLOTS(namelen); +	unsigned int nbucket, nblock; +	unsigned int bidx, end_block; +	struct page *dentry_page; +	struct f2fs_dir_entry *de = NULL; +	bool room = false; +	int max_slots = 0; + +	BUG_ON(level > MAX_DIR_HASH_DEPTH); + +	nbucket = dir_buckets(level); +	nblock = bucket_blocks(level); 
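+
+	/*
+	 * Worked example of the bucket layout (illustrative only):
+	 * level 0 has 1 bucket of 2 blocks and level 1 has 2 buckets
+	 * of 2 blocks, so at level 2 dir_block_index(2, idx) yields
+	 * block 1*2 + 2*2 + idx*2 = 6 + 2*idx within the directory.
+	 */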
+ +	bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); +	end_block = bidx + nblock; + +	for (; bidx < end_block; bidx++) { +		/* no need to allocate new dentry pages to all the indices */ +		dentry_page = find_data_page(dir, bidx); +		if (IS_ERR(dentry_page)) { +			room = true; +			continue; +		} + +		de = find_in_block(dentry_page, name, namelen, +					&max_slots, namehash, res_page); +		if (de) +			break; + +		if (max_slots >= s) +			room = true; +		f2fs_put_page(dentry_page, 0); +	} + +	if (!de && room && F2FS_I(dir)->chash != namehash) { +		F2FS_I(dir)->chash = namehash; +		F2FS_I(dir)->clevel = level; +	} + +	return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, +			struct qstr *child, struct page **res_page) +{ +	const char *name = child->name; +	size_t namelen = child->len; +	unsigned long npages = dir_blocks(dir); +	struct f2fs_dir_entry *de = NULL; +	f2fs_hash_t name_hash; +	unsigned int max_depth; +	unsigned int level; + +	if (npages == 0) +		return NULL; + +	*res_page = NULL; + +	name_hash = f2fs_dentry_hash(name, namelen); +	max_depth = F2FS_I(dir)->i_current_depth; + +	for (level = 0; level < max_depth; level++) { +		de = find_in_level(dir, level, name, +				namelen, name_hash, res_page); +		if (de) +			break; +	} +	if (!de && F2FS_I(dir)->chash != name_hash) { +		F2FS_I(dir)->chash = name_hash; +		F2FS_I(dir)->clevel = level - 1; +	} +	return de; +} + +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +{ +	struct page *page = NULL; +	struct f2fs_dir_entry *de = NULL; +	struct f2fs_dentry_block *dentry_blk = NULL; + +	page = get_lock_data_page(dir, 0); +	if (IS_ERR(page)) +		return NULL; + +	dentry_blk = kmap(page); +	de = &dentry_blk->dentry[1]; +	*p = page; +	unlock_page(page); +	return de; +} + +ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +{ +	ino_t res = 0; +	struct f2fs_dir_entry *de; +	struct page *page; + +	de = f2fs_find_entry(dir, qstr, &page); +	if (de) { +		res = le32_to_cpu(de->ino); +		kunmap(page); +		f2fs_put_page(page, 0); +	} + +	return res; +} + +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, +		struct page *page, struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + +	mutex_lock_op(sbi, DENTRY_OPS); +	lock_page(page); +	wait_on_page_writeback(page); +	de->ino = cpu_to_le32(inode->i_ino); +	set_de_type(de, inode); +	kunmap(page); +	set_page_dirty(page); +	dir->i_mtime = dir->i_ctime = CURRENT_TIME; +	mark_inode_dirty(dir); + +	/* update parent inode number before releasing dentry page */ +	F2FS_I(inode)->i_pino = dir->i_ino; + +	f2fs_put_page(page, 1); +	mutex_unlock_op(sbi, DENTRY_OPS); +} + +void init_dent_inode(const struct qstr *name, struct page *ipage) +{ +	struct f2fs_node *rn; + +	if (IS_ERR(ipage)) +		return; + +	wait_on_page_writeback(ipage); + +	/* copy name info. 
to this inode page */ +	rn = (struct f2fs_node *)page_address(ipage); +	rn->i.i_namelen = cpu_to_le32(name->len); +	memcpy(rn->i.i_name, name->name, name->len); +	set_page_dirty(ipage); +} + +static int init_inode_metadata(struct inode *inode, +		struct inode *dir, const struct qstr *name) +{ +	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { +		int err; +		err = new_inode_page(inode, name); +		if (err) +			return err; + +		if (S_ISDIR(inode->i_mode)) { +			err = f2fs_make_empty(inode, dir); +			if (err) { +				remove_inode_page(inode); +				return err; +			} +		} + +		err = f2fs_init_acl(inode, dir); +		if (err) { +			remove_inode_page(inode); +			return err; +		} +	} else { +		struct page *ipage; +		ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); +		if (IS_ERR(ipage)) +			return PTR_ERR(ipage); +		set_cold_node(inode, ipage); +		init_dent_inode(name, ipage); +		f2fs_put_page(ipage, 1); +	} +	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { +		inc_nlink(inode); +		f2fs_write_inode(inode, NULL); +	} +	return 0; +} + +static void update_parent_metadata(struct inode *dir, struct inode *inode, +						unsigned int current_depth) +{ +	bool need_dir_update = false; + +	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { +		if (S_ISDIR(inode->i_mode)) { +			inc_nlink(dir); +			need_dir_update = true; +		} +		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +	} +	dir->i_mtime = dir->i_ctime = CURRENT_TIME; +	if (F2FS_I(dir)->i_current_depth != current_depth) { +		F2FS_I(dir)->i_current_depth = current_depth; +		need_dir_update = true; +	} + +	if (need_dir_update) +		f2fs_write_inode(dir, NULL); +	else +		mark_inode_dirty(dir); + +	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) +		clear_inode_flag(F2FS_I(inode), FI_INC_LINK); +} + +static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots) +{ +	int bit_start = 0; +	int zero_start, zero_end; +next: +	zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap, +						NR_DENTRY_IN_BLOCK, +						bit_start); +	if (zero_start >= NR_DENTRY_IN_BLOCK) +		return NR_DENTRY_IN_BLOCK; + +	zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap, +						NR_DENTRY_IN_BLOCK, +						zero_start); +	if (zero_end - zero_start >= slots) +		return zero_start; + +	bit_start = zero_end + 1; + +	if (zero_end + 1 >= NR_DENTRY_IN_BLOCK) +		return NR_DENTRY_IN_BLOCK; +	goto next; +} + +int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +{ +	unsigned int bit_pos; +	unsigned int level; +	unsigned int current_depth; +	unsigned long bidx, block; +	f2fs_hash_t dentry_hash; +	struct f2fs_dir_entry *de; +	unsigned int nbucket, nblock; +	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); +	size_t namelen = name->len; +	struct page *dentry_page = NULL; +	struct f2fs_dentry_block *dentry_blk = NULL; +	int slots = GET_DENTRY_SLOTS(namelen); +	int err = 0; +	int i; + +	dentry_hash = f2fs_dentry_hash(name->name, name->len); +	level = 0; +	current_depth = F2FS_I(dir)->i_current_depth; +	if (F2FS_I(dir)->chash == dentry_hash) { +		level = F2FS_I(dir)->clevel; +		F2FS_I(dir)->chash = 0; +	} + +start: +	if (current_depth == MAX_DIR_HASH_DEPTH) +		return -ENOSPC; + +	/* Increase the depth, if required */ +	if (level == current_depth) +		++current_depth; + +	nbucket = dir_buckets(level); +	nblock = bucket_blocks(level); + +	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + +	for (block = bidx; block <= (bidx + nblock - 1); block++) { +		mutex_lock_op(sbi, DENTRY_OPS); +		dentry_page = 
get_new_data_page(dir, block, true);
+		if (IS_ERR(dentry_page)) {
+			mutex_unlock_op(sbi, DENTRY_OPS);
+			return PTR_ERR(dentry_page);
+		}
+
+		dentry_blk = kmap(dentry_page);
+		bit_pos = room_for_filename(dentry_blk, slots);
+		if (bit_pos < NR_DENTRY_IN_BLOCK)
+			goto add_dentry;
+
+		kunmap(dentry_page);
+		f2fs_put_page(dentry_page, 1);
+		mutex_unlock_op(sbi, DENTRY_OPS);
+	}
+
+	/* Move to next level to find the empty slot for new dentry */
+	++level;
+	goto start;
+add_dentry:
+	err = init_inode_metadata(inode, dir, name);
+	if (err)
+		goto fail;
+
+	wait_on_page_writeback(dentry_page);
+
+	de = &dentry_blk->dentry[bit_pos];
+	de->hash_code = dentry_hash;
+	de->name_len = cpu_to_le16(namelen);
+	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
+	de->ino = cpu_to_le32(inode->i_ino);
+	set_de_type(de, inode);
+	for (i = 0; i < slots; i++)
+		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+	set_page_dirty(dentry_page);
+
+	update_parent_metadata(dir, inode, current_depth);
+
+	/* update parent inode number before releasing dentry page */
+	F2FS_I(inode)->i_pino = dir->i_ino;
+fail:
+	kunmap(dentry_page);
+	f2fs_put_page(dentry_page, 1);
+	mutex_unlock_op(sbi, DENTRY_OPS);
+	return err;
+}
+
+/*
+ * It only removes the dentry from the dentry page; the corresponding name
+ * entry in the name page does not need to be touched during deletion.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+						struct inode *inode)
+{
+	struct	f2fs_dentry_block *dentry_blk;
+	unsigned int bit_pos;
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+	void *kaddr = page_address(page);
+	int i;
+
+	mutex_lock_op(sbi, DENTRY_OPS);
+
+	lock_page(page);
+	wait_on_page_writeback(page);
+
+	dentry_blk = (struct f2fs_dentry_block *)kaddr;
+	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
+	for (i = 0; i < slots; i++)
+		test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	/* Let's check and deallocate this dentry page */
+	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+			NR_DENTRY_IN_BLOCK,
+			0);
+	kunmap(page); /* kunmap - pair of f2fs_find_entry */
+	set_page_dirty(page);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	if (inode && S_ISDIR(inode->i_mode)) {
+		drop_nlink(dir);
+		f2fs_write_inode(dir, NULL);
+	} else {
+		mark_inode_dirty(dir);
+	}
+
+	if (inode) {
+		inode->i_ctime = CURRENT_TIME;
+		drop_nlink(inode);
+		if (S_ISDIR(inode->i_mode)) {
+			drop_nlink(inode);
+			i_size_write(inode, 0);
+		}
+		f2fs_write_inode(inode, NULL);
+		if (inode->i_nlink == 0)
+			add_orphan_inode(sbi, inode->i_ino);
+	}
+
+	if (bit_pos == NR_DENTRY_IN_BLOCK) {
+		truncate_hole(dir, page->index, page->index + 1);
+		clear_page_dirty_for_io(page);
+		ClearPageUptodate(page);
+		dec_page_count(sbi, F2FS_DIRTY_DENTS);
+		inode_dec_dirty_dents(dir);
+	}
+	f2fs_put_page(page, 1);
+
+	mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+int f2fs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct page *dentry_page;
+	struct f2fs_dentry_block *dentry_blk;
+	struct f2fs_dir_entry *de;
+	void *kaddr;
+
+	dentry_page = get_new_data_page(inode, 0, true);
+	if (IS_ERR(dentry_page))
+		return PTR_ERR(dentry_page);
+
+	kaddr = kmap_atomic(dentry_page);
+	dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+	de = &dentry_blk->dentry[0];
+	de->name_len = cpu_to_le16(1);
+	de->hash_code = 
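+	/*
+	 * Aside on the slot layout used by __f2fs_add_link() above (a
+	 * minimal sketch; it assumes the conventional 8-byte f2fs name
+	 * slot, which this hunk does not show):
+	 *
+	 *   #define SLOT_LEN 8
+	 *   #define SLOTS(namelen) (((namelen) + SLOT_LEN - 1) / SLOT_LEN)
+	 *
+	 * A 16-byte name thus occupies SLOTS(16) == 2 adjacent dentry
+	 * slots, and room_for_filename() must find that many consecutive
+	 * clear bits. A userspace equivalent of its scan:
+	 *
+	 *   int find_free_run(const unsigned char *map, int nbits, int slots)
+	 *   {
+	 *           int i, run = 0;
+	 *           for (i = 0; i < nbits; i++) {
+	 *                   int set = (map[i >> 3] >> (i & 7)) & 1; // LE order
+	 *                   run = set ? 0 : run + 1;
+	 *                   if (run == slots)
+	 *                           return i - slots + 1;
+	 *           }
+	 *           return nbits;  // caller treats nbits as "no room"
+	 *   }
+	 */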
f2fs_dentry_hash(".", 1); +	de->ino = cpu_to_le32(inode->i_ino); +	memcpy(dentry_blk->filename[0], ".", 1); +	set_de_type(de, inode); + +	de = &dentry_blk->dentry[1]; +	de->hash_code = f2fs_dentry_hash("..", 2); +	de->name_len = cpu_to_le16(2); +	de->ino = cpu_to_le32(parent->i_ino); +	memcpy(dentry_blk->filename[1], "..", 2); +	set_de_type(de, inode); + +	test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); +	test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); +	kunmap_atomic(kaddr); + +	set_page_dirty(dentry_page); +	f2fs_put_page(dentry_page, 1); +	return 0; +} + +bool f2fs_empty_dir(struct inode *dir) +{ +	unsigned long bidx; +	struct page *dentry_page; +	unsigned int bit_pos; +	struct	f2fs_dentry_block *dentry_blk; +	unsigned long nblock = dir_blocks(dir); + +	for (bidx = 0; bidx < nblock; bidx++) { +		void *kaddr; +		dentry_page = get_lock_data_page(dir, bidx); +		if (IS_ERR(dentry_page)) { +			if (PTR_ERR(dentry_page) == -ENOENT) +				continue; +			else +				return false; +		} + +		kaddr = kmap_atomic(dentry_page); +		dentry_blk = (struct f2fs_dentry_block *)kaddr; +		if (bidx == 0) +			bit_pos = 2; +		else +			bit_pos = 0; +		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, +						NR_DENTRY_IN_BLOCK, +						bit_pos); +		kunmap_atomic(kaddr); + +		f2fs_put_page(dentry_page, 1); + +		if (bit_pos < NR_DENTRY_IN_BLOCK) +			return false; +	} +	return true; +} + +static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ +	unsigned long pos = file->f_pos; +	struct inode *inode = file_inode(file); +	unsigned long npages = dir_blocks(inode); +	unsigned char *types = NULL; +	unsigned int bit_pos = 0, start_bit_pos = 0; +	int over = 0; +	struct f2fs_dentry_block *dentry_blk = NULL; +	struct f2fs_dir_entry *de = NULL; +	struct page *dentry_page = NULL; +	unsigned int n = 0; +	unsigned char d_type = DT_UNKNOWN; +	int slots; + +	types = f2fs_filetype_table; +	bit_pos = (pos % NR_DENTRY_IN_BLOCK); +	n = (pos / NR_DENTRY_IN_BLOCK); + +	for ( ; n < npages; n++) { +		dentry_page = get_lock_data_page(inode, n); +		if (IS_ERR(dentry_page)) +			continue; + +		start_bit_pos = bit_pos; +		dentry_blk = kmap(dentry_page); +		while (bit_pos < NR_DENTRY_IN_BLOCK) { +			d_type = DT_UNKNOWN; +			bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, +							NR_DENTRY_IN_BLOCK, +							bit_pos); +			if (bit_pos >= NR_DENTRY_IN_BLOCK) +				break; + +			de = &dentry_blk->dentry[bit_pos]; +			if (types && de->file_type < F2FS_FT_MAX) +				d_type = types[de->file_type]; + +			over = filldir(dirent, +					dentry_blk->filename[bit_pos], +					le16_to_cpu(de->name_len), +					(n * NR_DENTRY_IN_BLOCK) + bit_pos, +					le32_to_cpu(de->ino), d_type); +			if (over) { +				file->f_pos += bit_pos - start_bit_pos; +				goto success; +			} +			slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); +			bit_pos += slots; +		} +		bit_pos = 0; +		file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; +		kunmap(dentry_page); +		f2fs_put_page(dentry_page, 1); +		dentry_page = NULL; +	} +success: +	if (dentry_page && !IS_ERR(dentry_page)) { +		kunmap(dentry_page); +		f2fs_put_page(dentry_page, 1); +	} + +	return 0; +} + +const struct file_operations f2fs_dir_operations = { +	.llseek		= generic_file_llseek, +	.read		= generic_read_dir, +	.readdir	= f2fs_readdir, +	.fsync		= f2fs_sync_file, +	.unlocked_ioctl	= f2fs_ioctl, +}; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h new file mode 100644 index 00000000000..cc2213afdcc --- /dev/null +++ b/fs/f2fs/f2fs.h @@ -0,0 +1,1113 @@ +/* + * fs/f2fs/f2fs.h + * + * Copyright (c) 
2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_F2FS_H +#define _LINUX_F2FS_H + +#include <linux/types.h> +#include <linux/page-flags.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/crc32.h> +#include <linux/magic.h> + +/* + * For mount options + */ +#define F2FS_MOUNT_BG_GC		0x00000001 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD	0x00000002 +#define F2FS_MOUNT_DISCARD		0x00000004 +#define F2FS_MOUNT_NOHEAP		0x00000008 +#define F2FS_MOUNT_XATTR_USER		0x00000010 +#define F2FS_MOUNT_POSIX_ACL		0x00000020 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040 + +#define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option)	(sbi->mount_opt.opt & F2FS_MOUNT_##option) + +#define ver_after(a, b)	(typecheck(unsigned long long, a) &&		\ +		typecheck(unsigned long long, b) &&			\ +		((long long)((a) - (b)) > 0)) + +typedef u64 block_t; +typedef u32 nid_t; + +struct f2fs_mount_info { +	unsigned int	opt; +}; + +static inline __u32 f2fs_crc32(void *buff, size_t len) +{ +	return crc32_le(F2FS_SUPER_MAGIC, buff, len); +} + +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) +{ +	return f2fs_crc32(buff, buff_size) == blk_crc; +} + +/* + * For checkpoint manager + */ +enum { +	NAT_BITMAP, +	SIT_BITMAP +}; + +/* for the list of orphan inodes */ +struct orphan_inode_entry { +	struct list_head list;	/* list head */ +	nid_t ino;		/* inode number */ +}; + +/* for the list of directory inodes */ +struct dir_inode_entry { +	struct list_head list;	/* list head */ +	struct inode *inode;	/* vfs inode pointer */ +}; + +/* for the list of fsync inodes, used only during recovery */ +struct fsync_inode_entry { +	struct list_head list;	/* list head */ +	struct inode *inode;	/* vfs inode pointer */ +	block_t blkaddr;	/* block address locating the last inode */ +}; + +#define nats_in_cursum(sum)		(le16_to_cpu(sum->n_nats)) +#define sits_in_cursum(sum)		(le16_to_cpu(sum->n_sits)) + +#define nat_in_journal(sum, i)		(sum->nat_j.entries[i].ne) +#define nid_in_journal(sum, i)		(sum->nat_j.entries[i].nid) +#define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se) +#define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno) + +static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +{ +	int before = nats_in_cursum(rs); +	rs->n_nats = cpu_to_le16(before + i); +	return before; +} + +static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +{ +	int before = sits_in_cursum(rs); +	rs->n_sits = cpu_to_le16(before + i); +	return before; +} + +/* + * ioctl commands + */ +#define F2FS_IOC_GETFLAGS               FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS               FS_IOC_SETFLAGS + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define F2FS_IOC32_GETFLAGS             FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS +#endif + +/* + * For INODE and NODE manager + */ +#define XATTR_NODE_OFFSET	(-1)	/* +					 * store xattrs to one node block per +					 * file keeping -1 as its node offset to +					 * distinguish from index node blocks. 
+					 */
+#define RDONLY_NODE		1	/*
+					 * specify a read-only mode when getting
+					 * a node block. 0 is read-write mode.
+					 * used by get_dnode_of_data().
+					 */
+#define F2FS_LINK_MAX		32000	/* maximum link count per file */
+
+/* for in-memory extent cache entry */
+struct extent_info {
+	rwlock_t ext_lock;	/* rwlock for consistency */
+	unsigned int fofs;	/* start offset in a file */
+	u32 blk_addr;		/* start block address of the extent */
+	unsigned int len;	/* length of the extent */
+};
+
+/*
+ * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
+ */
+#define FADVISE_COLD_BIT	0x01
+
+struct f2fs_inode_info {
+	struct inode vfs_inode;		/* serve a vfs inode */
+	unsigned long i_flags;		/* keep inode flags for ioctl */
+	unsigned char i_advise;		/* used to give file attribute hints */
+	unsigned int i_current_depth;	/* used only in directory structure */
+	unsigned int i_pino;		/* parent inode number */
+	umode_t i_acl_mode;		/* keep file acl mode temporarily */
+
+	/* Used below internally in f2fs */
+	unsigned long flags;		/* used to pass per-file flags */
+	unsigned long long data_version;/* latest version of data for fsync */
+	atomic_t dirty_dents;		/* # of dirty dentry pages */
+	f2fs_hash_t chash;		/* hash value of given file name */
+	unsigned int clevel;		/* maximum level of given file name */
+	nid_t i_xattr_nid;		/* node id that contains xattrs */
+	struct extent_info ext;		/* in-memory extent cache entry */
+};
+
+static inline void get_extent_info(struct extent_info *ext,
+					struct f2fs_extent i_ext)
+{
+	write_lock(&ext->ext_lock);
+	ext->fofs = le32_to_cpu(i_ext.fofs);
+	ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+	ext->len = le32_to_cpu(i_ext.len);
+	write_unlock(&ext->ext_lock);
+}
+
+static inline void set_raw_extent(struct extent_info *ext,
+					struct f2fs_extent *i_ext)
+{
+	read_lock(&ext->ext_lock);
+	i_ext->fofs = cpu_to_le32(ext->fofs);
+	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+	i_ext->len = cpu_to_le32(ext->len);
+	read_unlock(&ext->ext_lock);
+}
+
+struct f2fs_nm_info {
+	block_t nat_blkaddr;		/* base disk address of NAT */
+	nid_t max_nid;			/* maximum possible node ids */
+	nid_t init_scan_nid;		/* the first nid to be scanned */
+	nid_t next_scan_nid;		/* the next nid to be scanned */
+
+	/* NAT cache management */
+	struct radix_tree_root nat_root;/* root of the nat entry cache */
+	rwlock_t nat_tree_lock;		/* protect the nat entry cache */
+	unsigned int nat_cnt;		/* the # of cached nat entries */
+	struct list_head nat_entries;	/* cached nat entry list (clean) */
+	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
+
+	/* free node ids management */
+	struct list_head free_nid_list;	/* a list for free nids */
+	spinlock_t free_nid_list_lock;	/* protect free nid list */
+	unsigned int fcnt;		/* the number of free node ids */
+	struct mutex build_lock;	/* lock for building free nids */
+
+	/* for checkpoint */
+	char *nat_bitmap;		/* NAT bitmap pointer */
+	int bitmap_size;		/* bitmap size */
+};
+
+/*
+ * This structure is used as one of the function parameters.
+ * All the information is dedicated to a given direct node block determined
+ * by the data offset in a file.
+ */
+struct dnode_of_data {
+	struct inode *inode;		/* vfs inode pointer */
+	struct page *inode_page;	/* its inode page, NULL is possible */
+	struct page *node_page;		/* cached direct node page */
+	nid_t nid;			/* node id of the direct node block */
+	unsigned int ofs_in_node;	/* data offset in the node page */
+	bool inode_page_locked;		/* inode page is locked or not */
+	block_t	data_blkaddr;		/* block address of the data block */
+};
+
+static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
+		struct page *ipage, struct page *npage, nid_t nid)
+{
+	memset(dn, 0, sizeof(*dn));
+	dn->inode = inode;
+	dn->inode_page = ipage;
+	dn->node_page = npage;
+	dn->nid = nid;
+}
+
+/*
+ * For SIT manager
+ *
+ * By default, there are 6 active log areas across the whole main area.
+ * When considering hot and cold data separation to reduce cleaning overhead,
+ * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
+ * respectively.
+ * In the current design, you should not change these numbers arbitrarily.
+ * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
+ * logs individually according to the underlying devices. (default: 6)
+ * Just in case, the on-disk layout covers a maximum of 16 logs, consisting
+ * of 8 for data and 8 for node logs.
+ */
+#define	NR_CURSEG_DATA_TYPE	(3)
+#define NR_CURSEG_NODE_TYPE	(3)
+#define NR_CURSEG_TYPE	(NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+
+enum {
+	CURSEG_HOT_DATA	= 0,	/* directory entry blocks */
+	CURSEG_WARM_DATA,	/* data blocks */
+	CURSEG_COLD_DATA,	/* multimedia or GCed data blocks */
+	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
+	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
+	CURSEG_COLD_NODE,	/* indirect node blocks */
+	NO_CHECK_TYPE
+};
+
+struct f2fs_sm_info {
+	struct sit_info *sit_info;		/* whole segment information */
+	struct free_segmap_info *free_info;	/* free segment information */
+	struct dirty_seglist_info *dirty_info;	/* dirty segment information */
+	struct curseg_info *curseg_array;	/* active segment information */
+
+	struct list_head wblist_head;	/* list of under-writeback pages */
+	spinlock_t wblist_lock;		/* lock for checkpoint */
+
+	block_t seg0_blkaddr;		/* block address of 0'th segment */
+	block_t main_blkaddr;		/* start block address of main area */
+	block_t ssa_blkaddr;		/* start block address of SSA area */
+
+	unsigned int segment_count;	/* total # of segments */
+	unsigned int main_segments;	/* # of segments in main area */
+	unsigned int reserved_segments;	/* # of reserved segments */
+	unsigned int ovp_segments;	/* # of overprovision segments */
+};
+
+/*
+ * For directory operation
+ */
+#define	NODE_DIR1_BLOCK		(ADDRS_PER_INODE + 1)
+#define	NODE_DIR2_BLOCK		(ADDRS_PER_INODE + 2)
+#define	NODE_IND1_BLOCK		(ADDRS_PER_INODE + 3)
+#define	NODE_IND2_BLOCK		(ADDRS_PER_INODE + 4)
+#define	NODE_DIND_BLOCK		(ADDRS_PER_INODE + 5)
+
+/*
+ * For superblock
+ */
+/*
+ * COUNT_TYPE for monitoring
+ *
+ * f2fs monitors the number of several block types, such as on-writeback
+ * pages, dirty dentry blocks, dirty node blocks, and dirty meta blocks.
+ */
+enum count_type {
+	F2FS_WRITEBACK,
+	F2FS_DIRTY_DENTS,
+	F2FS_DIRTY_NODES,
+	F2FS_DIRTY_META,
+	NR_COUNT_TYPE,
+};
+
+/*
+ * FS_LOCK nesting subclasses for the lock validator:
+ *
+ * The locking order between these classes is
+ * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
+ *    -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ */
+enum lock_type {
+	RENAME,		/* for renaming operations */
+	DENTRY_OPS,	/* for directory operations */
+	DATA_WRITE,	/* for data write */
+	DATA_NEW,	/* for data allocation */
+	DATA_TRUNC,	/* for data truncate */
+	NODE_NEW,	/* for node allocation */
+	NODE_TRUNC,	/* for node truncate */
+	NODE_WRITE,	/* for node write */
+	NR_LOCK_TYPE,
+};
+
+/*
+ * Below are the page types of bios used in submit_bio().
+ * The available types are:
+ * DATA			User data pages. It operates in async mode.
+ * NODE			Node pages. It operates in async mode.
+ * META			FS metadata pages such as SIT, NAT, CP.
+ * NR_PAGE_TYPE		The number of page types.
+ * META_FLUSH		Make sure the previous pages are written
+ *			while waiting for the bio's completion.
+ * ...			Can only be used with META.
+ */
+enum page_type {
+	DATA,
+	NODE,
+	META,
+	NR_PAGE_TYPE,
+	META_FLUSH,
+};
+
+struct f2fs_sb_info {
+	struct super_block *sb;			/* pointer to VFS super block */
+	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
+	struct f2fs_super_block *raw_super;	/* raw super block pointer */
+	int s_dirty;				/* dirty flag for checkpoint */
+
+	/* for node-related operations */
+	struct f2fs_nm_info *nm_info;		/* node manager */
+	struct inode *node_inode;		/* cache node blocks */
+
+	/* for segment-related operations */
+	struct f2fs_sm_info *sm_info;		/* segment manager */
+	struct bio *bio[NR_PAGE_TYPE];		/* bios to merge */
+	sector_t last_block_in_bio[NR_PAGE_TYPE];	/* last block number */
+	struct rw_semaphore bio_sem;		/* IO semaphore */
+
+	/* for checkpoint */
+	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
+	struct inode *meta_inode;		/* cache meta blocks */
+	struct mutex cp_mutex;			/* for checkpoint procedure */
+	struct mutex fs_lock[NR_LOCK_TYPE];	/* for blocking FS operations */
+	struct mutex write_inode;		/* mutex for write inode */
+	struct mutex writepages;		/* mutex for writepages() */
+	int por_doing;				/* whether recovery is in progress */
+
+	/* for orphan inode management */
+	struct list_head orphan_inode_list;	/* orphan inode list */
+	struct mutex orphan_inode_mutex;	/* for orphan inode list */
+	unsigned int n_orphans;			/* # of orphan inodes */
+
+	/* for directory inode management */
+	struct list_head dir_inode_list;	/* dir inode list */
+	spinlock_t dir_inode_lock;		/* for dir inode list lock */
+	unsigned int n_dirty_dirs;		/* # of dir inodes */
+
+	/* basic file system units */
+	unsigned int log_sectors_per_block;	/* log2 sectors per block */
+	unsigned int log_blocksize;		/* log2 block size */
+	unsigned int blocksize;			/* block size */
+	unsigned int root_ino_num;		/* root inode number */
+	unsigned int node_ino_num;		/* node inode number */
+	unsigned int meta_ino_num;		/* meta inode number */
+	unsigned int log_blocks_per_seg;	/* log2 blocks per segment */
+	unsigned int blocks_per_seg;		/* blocks per segment */
+	unsigned int segs_per_sec;		/* segments per section */
+	unsigned int secs_per_zone;		/* sections per zone */
+	unsigned int total_sections;		/* total section count */
+	unsigned int total_node_count;		/* total node block count */
+	unsigned int total_valid_node_count;	/* valid node block count */
+	unsigned int 
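+	/*
+	 * Aside on fs_lock[] above: mutex_lock_op() (defined later in this
+	 * header) passes the enum lock_type value to mutex_lock_nested() as
+	 * the lockdep subclass, so callers are expected to honor the
+	 * documented order. Hypothetical caller, for illustration only:
+	 *
+	 *   mutex_lock_op(sbi, RENAME);
+	 *   mutex_lock_op(sbi, DENTRY_OPS);  // ok: DENTRY_OPS follows RENAME
+	 *   ...
+	 *   mutex_unlock_op(sbi, DENTRY_OPS);
+	 *   mutex_unlock_op(sbi, RENAME);
+	 *
+	 * Acquiring RENAME while holding DENTRY_OPS would invert the order
+	 * given in the lock_type comment and risk an ABBA deadlock.
+	 */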
total_valid_inode_count;	/* valid inode count */ +	int active_logs;			/* # of active logs */ + +	block_t user_block_count;		/* # of user blocks */ +	block_t total_valid_block_count;	/* # of valid blocks */ +	block_t alloc_valid_block_count;	/* # of allocated blocks */ +	block_t last_valid_block_count;		/* for recovery */ +	u32 s_next_generation;			/* for NFS support */ +	atomic_t nr_pages[NR_COUNT_TYPE];	/* # of pages, see count_type */ + +	struct f2fs_mount_info mount_opt;	/* mount options */ + +	/* for cleaning operations */ +	struct mutex gc_mutex;			/* mutex for GC */ +	struct f2fs_gc_kthread	*gc_thread;	/* GC thread */ + +	/* +	 * for stat information. +	 * one is for the LFS mode, and the other is for the SSR mode. +	 */ +	struct f2fs_stat_info *stat_info;	/* FS status information */ +	unsigned int segment_count[2];		/* # of allocated segments */ +	unsigned int block_count[2];		/* # of allocated blocks */ +	unsigned int last_victim[2];		/* last victim segment # */ +	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */ +	int bg_gc;				/* background gc calls */ +	spinlock_t stat_lock;			/* lock for stat operations */ +}; + +/* + * Inline functions + */ +static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) +{ +	return container_of(inode, struct f2fs_inode_info, vfs_inode); +} + +static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) +{ +	return sb->s_fs_info; +} + +static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) +{ +	return (struct f2fs_super_block *)(sbi->raw_super); +} + +static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) +{ +	return (struct f2fs_checkpoint *)(sbi->ckpt); +} + +static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) +{ +	return (struct f2fs_nm_info *)(sbi->nm_info); +} + +static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) +{ +	return (struct f2fs_sm_info *)(sbi->sm_info); +} + +static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) +{ +	return (struct sit_info *)(SM_I(sbi)->sit_info); +} + +static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) +{ +	return (struct free_segmap_info *)(SM_I(sbi)->free_info); +} + +static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) +{ +	return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); +} + +static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) +{ +	sbi->s_dirty = 1; +} + +static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) +{ +	sbi->s_dirty = 0; +} + +static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ +	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); +	return ckpt_flags & f; +} + +static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ +	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); +	ckpt_flags |= f; +	cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ +	unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); +	ckpt_flags &= (~f); +	cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +{ +	mutex_lock_nested(&sbi->fs_lock[t], t); +} + +static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +{ +	mutex_unlock(&sbi->fs_lock[t]); +} + +/* + * Check whether the given nid is within node id range. 
+ */ +static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ +	BUG_ON((nid >= NM_I(sbi)->max_nid)); +} + +#define F2FS_DEFAULT_ALLOCATED_BLOCKS	1 + +/* + * Check whether the inode has blocks or not + */ +static inline int F2FS_HAS_BLOCKS(struct inode *inode) +{ +	if (F2FS_I(inode)->i_xattr_nid) +		return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); +	else +		return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); +} + +static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +				 struct inode *inode, blkcnt_t count) +{ +	block_t	valid_block_count; + +	spin_lock(&sbi->stat_lock); +	valid_block_count = +		sbi->total_valid_block_count + (block_t)count; +	if (valid_block_count > sbi->user_block_count) { +		spin_unlock(&sbi->stat_lock); +		return false; +	} +	inode->i_blocks += count; +	sbi->total_valid_block_count = valid_block_count; +	sbi->alloc_valid_block_count += (block_t)count; +	spin_unlock(&sbi->stat_lock); +	return true; +} + +static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +						struct inode *inode, +						blkcnt_t count) +{ +	spin_lock(&sbi->stat_lock); +	BUG_ON(sbi->total_valid_block_count < (block_t) count); +	BUG_ON(inode->i_blocks < count); +	inode->i_blocks -= count; +	sbi->total_valid_block_count -= (block_t)count; +	spin_unlock(&sbi->stat_lock); +	return 0; +} + +static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) +{ +	atomic_inc(&sbi->nr_pages[count_type]); +	F2FS_SET_SB_DIRT(sbi); +} + +static inline void inode_inc_dirty_dents(struct inode *inode) +{ +	atomic_inc(&F2FS_I(inode)->dirty_dents); +} + +static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) +{ +	atomic_dec(&sbi->nr_pages[count_type]); +} + +static inline void inode_dec_dirty_dents(struct inode *inode) +{ +	atomic_dec(&F2FS_I(inode)->dirty_dents); +} + +static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +{ +	return atomic_read(&sbi->nr_pages[count_type]); +} + +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ +	unsigned int pages_per_sec = sbi->segs_per_sec * +					(1 << sbi->log_blocks_per_seg); +	return ((get_pages(sbi, block_type) + pages_per_sec - 1) +			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec; +} + +static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) +{ +	block_t ret; +	spin_lock(&sbi->stat_lock); +	ret = sbi->total_valid_block_count; +	spin_unlock(&sbi->stat_lock); +	return ret; +} + +static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) +{ +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + +	/* return NAT or SIT bitmap */ +	if (flag == NAT_BITMAP) +		return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); +	else if (flag == SIT_BITMAP) +		return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + +	return 0; +} + +static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) +{ +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	int offset = (flag == NAT_BITMAP) ? 
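+	/*
+	 * Note on inc_valid_block_count() above: it is a check-then-commit
+	 * reservation. Generic shape of the pattern (names hypothetical):
+	 *
+	 *   spin_lock(&lock);
+	 *   if (used + n > limit) {        // test the tentative total
+	 *           spin_unlock(&lock);
+	 *           return false;          // caller maps this to -ENOSPC
+	 *   }
+	 *   used += n;                     // commit only on success
+	 *   spin_unlock(&lock);
+	 *   return true;
+	 *
+	 * Publishing the new total only after the bound check keeps
+	 * total_valid_block_count from ever exceeding user_block_count.
+	 */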
+			le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
+	return &ckpt->sit_nat_version_bitmap + offset;
+}
+
+static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+{
+	block_t start_addr;
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
+
+	start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+
+	/*
+	 * An odd-numbered checkpoint should be at cp segment 0,
+	 * and an even-numbered one must be at cp segment 1.
+	 */
+	if (!(ckpt_version & 1))
+		start_addr += sbi->blocks_per_seg;
+
+	return start_addr;
+}
+
+static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+{
+	return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						unsigned int count)
+{
+	block_t	valid_block_count;
+	unsigned int valid_node_count;
+
+	spin_lock(&sbi->stat_lock);
+
+	valid_block_count = sbi->total_valid_block_count + (block_t)count;
+	sbi->alloc_valid_block_count += (block_t)count;
+	valid_node_count = sbi->total_valid_node_count + count;
+
+	if (valid_block_count > sbi->user_block_count) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+
+	if (valid_node_count > sbi->total_node_count) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+
+	if (inode)
+		inode->i_blocks += count;
+	sbi->total_valid_node_count = valid_node_count;
+	sbi->total_valid_block_count = valid_block_count;
+	spin_unlock(&sbi->stat_lock);
+
+	return true;
+}
+
+static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						unsigned int count)
+{
+	spin_lock(&sbi->stat_lock);
+
+	BUG_ON(sbi->total_valid_block_count < count);
+	BUG_ON(sbi->total_valid_node_count < count);
+	BUG_ON(inode->i_blocks < count);
+
+	inode->i_blocks -= count;
+	sbi->total_valid_node_count -= count;
+	sbi->total_valid_block_count -= (block_t)count;
+
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
+{
+	unsigned int ret;
+	spin_lock(&sbi->stat_lock);
+	ret = sbi->total_valid_node_count;
+	spin_unlock(&sbi->stat_lock);
+	return ret;
+}
+
+static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&sbi->stat_lock);
+	BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+	sbi->total_valid_inode_count++;
+	spin_unlock(&sbi->stat_lock);
+}
+
+static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&sbi->stat_lock);
+	BUG_ON(!sbi->total_valid_inode_count);
+	sbi->total_valid_inode_count--;
+	spin_unlock(&sbi->stat_lock);
+	return 0;
+}
+
+static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	unsigned int ret;
+	spin_lock(&sbi->stat_lock);
+	ret = sbi->total_valid_inode_count;
+	spin_unlock(&sbi->stat_lock);
+	return ret;
+}
+
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+	if (!page || IS_ERR(page))
+		return;
+
+	if (unlock) {
+		BUG_ON(!PageLocked(page));
+		unlock_page(page);
+	}
+	page_cache_release(page);
+}
+
+static inline void f2fs_put_dnode(struct dnode_of_data *dn)
+{
+	if (dn->node_page)
+		f2fs_put_page(dn->node_page, 1);
+	if (dn->inode_page && dn->node_page != dn->inode_page)
+		f2fs_put_page(dn->inode_page, 0);
+	dn->node_page = NULL;
+	dn->inode_page = NULL;
+}
+
+static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
+					size_t size, void (*ctor)(void *))
+{
+	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+}
+
+#define 
RAW_IS_INODE(p)	((p)->footer.nid == (p)->footer.ino) + +static inline bool IS_INODE(struct page *page) +{ +	struct f2fs_node *p = (struct f2fs_node *)page_address(page); +	return RAW_IS_INODE(p); +} + +static inline __le32 *blkaddr_in_node(struct f2fs_node *node) +{ +	return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; +} + +static inline block_t datablock_addr(struct page *node_page, +		unsigned int offset) +{ +	struct f2fs_node *raw_node; +	__le32 *addr_array; +	raw_node = (struct f2fs_node *)page_address(node_page); +	addr_array = blkaddr_in_node(raw_node); +	return le32_to_cpu(addr_array[offset]); +} + +static inline int f2fs_test_bit(unsigned int nr, char *addr) +{ +	int mask; + +	addr += (nr >> 3); +	mask = 1 << (7 - (nr & 0x07)); +	return mask & *addr; +} + +static inline int f2fs_set_bit(unsigned int nr, char *addr) +{ +	int mask; +	int ret; + +	addr += (nr >> 3); +	mask = 1 << (7 - (nr & 0x07)); +	ret = mask & *addr; +	*addr |= mask; +	return ret; +} + +static inline int f2fs_clear_bit(unsigned int nr, char *addr) +{ +	int mask; +	int ret; + +	addr += (nr >> 3); +	mask = 1 << (7 - (nr & 0x07)); +	ret = mask & *addr; +	*addr &= ~mask; +	return ret; +} + +/* used for f2fs_inode_info->flags */ +enum { +	FI_NEW_INODE,		/* indicate newly allocated inode */ +	FI_NEED_CP,		/* need to do checkpoint during fsync */ +	FI_INC_LINK,		/* need to increment i_nlink */ +	FI_ACL_MODE,		/* indicate acl mode */ +	FI_NO_ALLOC,		/* should not allocate any blocks */ +}; + +static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +{ +	set_bit(flag, &fi->flags); +} + +static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +{ +	return test_bit(flag, &fi->flags); +} + +static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +{ +	clear_bit(flag, &fi->flags); +} + +static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +{ +	fi->i_acl_mode = mode; +	set_inode_flag(fi, FI_ACL_MODE); +} + +static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) +{ +	if (is_inode_flag_set(fi, FI_ACL_MODE)) { +		clear_inode_flag(fi, FI_ACL_MODE); +		return 1; +	} +	return 0; +} + +/* + * file.c + */ +int f2fs_sync_file(struct file *, loff_t, loff_t, int); +void truncate_data_blocks(struct dnode_of_data *); +void f2fs_truncate(struct inode *); +int f2fs_setattr(struct dentry *, struct iattr *); +int truncate_hole(struct inode *, pgoff_t, pgoff_t); +long f2fs_ioctl(struct file *, unsigned int, unsigned long); +long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* + * inode.c + */ +void f2fs_set_inode_flags(struct inode *); +struct inode *f2fs_iget(struct super_block *, unsigned long); +void update_inode(struct inode *, struct page *); +int f2fs_write_inode(struct inode *, struct writeback_control *); +void f2fs_evict_inode(struct inode *); + +/* + * namei.c + */ +struct dentry *f2fs_get_parent(struct dentry *child); + +/* + * dir.c + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +							struct page **); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); +ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, +				struct page *, struct inode *); +void init_dent_inode(const struct qstr *, struct page *); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); +void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_make_empty(struct inode *, struct 
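+/*
+ * Worked example for f2fs_test_bit()/f2fs_set_bit() above: they number
+ * bits most-significant-first within each byte. For nr = 10:
+ *
+ *   byte = addr[10 >> 3] = addr[1];
+ *   mask = 1 << (7 - (10 & 7)) = 1 << 5 = 0x20;
+ *
+ * whereas the *_bit_le() helpers used on dentry bitmaps would map the
+ * same nr to mask 1 << (10 & 7) = 0x04 in addr[1]. The two families are
+ * therefore not interchangeable on one bitmap; these byte-wise helpers
+ * appear to be reserved for the NAT/SIT-style version bitmaps.
+ */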
inode *); +bool f2fs_empty_dir(struct inode *); + +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ +	return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, +				inode); +} + +/* + * super.c + */ +int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); + +/* + * hash.c + */ +f2fs_hash_t f2fs_dentry_hash(const char *, size_t); + +/* + * node.c + */ +struct dnode_of_data; +struct node_info; + +int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); +int truncate_inode_blocks(struct inode *, pgoff_t); +int remove_inode_page(struct inode *); +int new_inode_page(struct inode *, const struct qstr *); +struct page *new_node_page(struct dnode_of_data *, unsigned int); +void ra_node_page(struct f2fs_sb_info *, nid_t); +struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_node_page_ra(struct page *, int); +void sync_inode_page(struct dnode_of_data *); +int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +bool alloc_nid(struct f2fs_sb_info *, nid_t *); +void alloc_nid_done(struct f2fs_sb_info *, nid_t); +void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +void recover_node_page(struct f2fs_sb_info *, struct page *, +		struct f2fs_summary *, struct node_info *, block_t); +int recover_inode_page(struct f2fs_sb_info *, struct page *); +int restore_node_summary(struct f2fs_sb_info *, unsigned int, +				struct f2fs_summary_block *); +void flush_nat_entries(struct f2fs_sb_info *); +int build_node_manager(struct f2fs_sb_info *); +void destroy_node_manager(struct f2fs_sb_info *); +int __init create_node_manager_caches(void); +void destroy_node_manager_caches(void); + +/* + * segment.c + */ +void f2fs_balance_fs(struct f2fs_sb_info *); +void invalidate_blocks(struct f2fs_sb_info *, block_t); +void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); +void clear_prefree_segments(struct f2fs_sb_info *); +int npages_for_summary_flush(struct f2fs_sb_info *); +void allocate_new_segments(struct f2fs_sb_info *); +struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); +struct bio *f2fs_bio_alloc(struct block_device *, int); +void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); +void write_meta_page(struct f2fs_sb_info *, struct page *); +void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, +					block_t, block_t *); +void write_data_page(struct inode *, struct page *, struct dnode_of_data*, +					block_t, block_t *); +void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void recover_data_page(struct f2fs_sb_info *, struct page *, +				struct f2fs_summary *, block_t, block_t); +void rewrite_node_page(struct f2fs_sb_info *, struct page *, +				struct f2fs_summary *, block_t, block_t); +void write_data_summaries(struct f2fs_sb_info *, block_t); +void write_node_summaries(struct f2fs_sb_info *, block_t); +int lookup_journal_in_cursum(struct f2fs_summary_block *, +					int, unsigned int, int); +void flush_sit_entries(struct f2fs_sb_info *); +int build_segment_manager(struct f2fs_sb_info *); +void reset_victim_segmap(struct f2fs_sb_info *); +void destroy_segment_manager(struct f2fs_sb_info *); + +/* + * checkpoint.c + */ +struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 
+long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +int check_orphan_space(struct f2fs_sb_info *); +void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void remove_orphan_inode(struct f2fs_sb_info *, nid_t); +int recover_orphan_inodes(struct f2fs_sb_info *); +int get_valid_checkpoint(struct f2fs_sb_info *); +void set_dirty_dir_page(struct inode *, struct page *); +void remove_dirty_dir_inode(struct inode *); +void sync_dirty_dir_inodes(struct f2fs_sb_info *); +void write_checkpoint(struct f2fs_sb_info *, bool); +void init_orphan_info(struct f2fs_sb_info *); +int __init create_checkpoint_caches(void); +void destroy_checkpoint_caches(void); + +/* + * data.c + */ +int reserve_new_block(struct dnode_of_data *); +void update_extent_cache(block_t, struct dnode_of_data *); +struct page *find_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_new_data_page(struct inode *, pgoff_t, bool); +int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); +int do_write_data_page(struct page *); + +/* + * gc.c + */ +int start_gc_thread(struct f2fs_sb_info *); +void stop_gc_thread(struct f2fs_sb_info *); +block_t start_bidx_of_node(unsigned int); +int f2fs_gc(struct f2fs_sb_info *); +void build_gc_manager(struct f2fs_sb_info *); +int __init create_gc_caches(void); +void destroy_gc_caches(void); + +/* + * recovery.c + */ +void recover_fsync_data(struct f2fs_sb_info *); +bool space_for_roll_forward(struct f2fs_sb_info *); + +/* + * debug.c + */ +#ifdef CONFIG_F2FS_STAT_FS +struct f2fs_stat_info { +	struct list_head stat_list; +	struct f2fs_sb_info *sbi; +	struct mutex stat_lock; +	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; +	int main_area_segs, main_area_sections, main_area_zones; +	int hit_ext, total_ext; +	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; +	int nats, sits, fnids; +	int total_count, utilization; +	int bg_gc; +	unsigned int valid_count, valid_node_count, valid_inode_count; +	unsigned int bimodal, avg_vblocks; +	int util_free, util_valid, util_invalid; +	int rsvd_segs, overp_segs; +	int dirty_count, node_pages, meta_pages; +	int prefree_count, call_count; +	int tot_segs, node_segs, data_segs, free_segs, free_secs; +	int tot_blks, data_blks, node_blks; +	int curseg[NR_CURSEG_TYPE]; +	int cursec[NR_CURSEG_TYPE]; +	int curzone[NR_CURSEG_TYPE]; + +	unsigned int segment_count[2]; +	unsigned int block_count[2]; +	unsigned base_mem, cache_mem; +}; + +#define stat_inc_call_count(si)	((si)->call_count++) + +#define stat_inc_seg_count(sbi, type)					\ +	do {								\ +		struct f2fs_stat_info *si = sbi->stat_info;		\ +		(si)->tot_segs++;					\ +		if (type == SUM_TYPE_DATA)				\ +			si->data_segs++;				\ +		else							\ +			si->node_segs++;				\ +	} while (0) + +#define stat_inc_tot_blk_count(si, blks)				\ +	(si->tot_blks += (blks)) + +#define stat_inc_data_blk_count(sbi, blks)				\ +	do {								\ +		struct f2fs_stat_info *si = sbi->stat_info;		\ +		stat_inc_tot_blk_count(si, blks);			\ +		si->data_blks += (blks);				\ +	} while (0) + +#define stat_inc_node_blk_count(sbi, blks)				\ +	do {								\ +		struct f2fs_stat_info *si = sbi->stat_info;		\ +		stat_inc_tot_blk_count(si, blks);			\ +		si->node_blks += (blks);				\ +	} while (0) + +int f2fs_build_stats(struct f2fs_sb_info *); +void f2fs_destroy_stats(struct f2fs_sb_info *); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); +#else +#define stat_inc_call_count(si) +#define stat_inc_seg_count(si, type) 
+#define stat_inc_tot_blk_count(si, blks) +#define stat_inc_data_blk_count(si, blks) +#define stat_inc_node_blk_count(sbi, blks) + +static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } +#endif + +extern const struct file_operations f2fs_dir_operations; +extern const struct file_operations f2fs_file_operations; +extern const struct inode_operations f2fs_file_inode_operations; +extern const struct address_space_operations f2fs_dblock_aops; +extern const struct address_space_operations f2fs_node_aops; +extern const struct address_space_operations f2fs_meta_aops; +extern const struct inode_operations f2fs_dir_inode_operations; +extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_special_inode_operations; +#endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c new file mode 100644 index 00000000000..958a46da19a --- /dev/null +++ b/fs/f2fs/file.c @@ -0,0 +1,671 @@ +/* + * fs/f2fs/file.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/stat.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/compat.h> +#include <linux/uaccess.h> +#include <linux/mount.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" + +static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, +						struct vm_fault *vmf) +{ +	struct page *page = vmf->page; +	struct inode *inode = file_inode(vma->vm_file); +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	block_t old_blk_addr; +	struct dnode_of_data dn; +	int err; + +	f2fs_balance_fs(sbi); + +	sb_start_pagefault(inode->i_sb); + +	mutex_lock_op(sbi, DATA_NEW); + +	/* block allocation */ +	set_new_dnode(&dn, inode, NULL, NULL, 0); +	err = get_dnode_of_data(&dn, page->index, 0); +	if (err) { +		mutex_unlock_op(sbi, DATA_NEW); +		goto out; +	} + +	old_blk_addr = dn.data_blkaddr; + +	if (old_blk_addr == NULL_ADDR) { +		err = reserve_new_block(&dn); +		if (err) { +			f2fs_put_dnode(&dn); +			mutex_unlock_op(sbi, DATA_NEW); +			goto out; +		} +	} +	f2fs_put_dnode(&dn); + +	mutex_unlock_op(sbi, DATA_NEW); + +	lock_page(page); +	if (page->mapping != inode->i_mapping || +			page_offset(page) >= i_size_read(inode) || +			!PageUptodate(page)) { +		unlock_page(page); +		err = -EFAULT; +		goto out; +	} + +	/* +	 * check to see if the page is mapped already (no holes) +	 */ +	if (PageMappedToDisk(page)) +		goto out; + +	/* fill the page */ +	wait_on_page_writeback(page); + +	/* page is wholly or partially inside EOF */ +	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { +		unsigned offset; +		offset = i_size_read(inode) & ~PAGE_CACHE_MASK; +		zero_user_segment(page, offset, PAGE_CACHE_SIZE); +	} +	set_page_dirty(page); +	SetPageUptodate(page); + +	file_update_time(vma->vm_file); +out: +	sb_end_pagefault(inode->i_sb); +	return block_page_mkwrite_return(err); +} + +static const struct vm_operations_struct f2fs_file_vm_ops = { +	.fault		= filemap_fault, +	.page_mkwrite	= 
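+	/*
+	 * Worked example for the EOF zeroing in f2fs_vm_page_mkwrite()
+	 * above, assuming 4 KB pages: if i_size is 10000 and the fault hits
+	 * page index 2 (bytes 8192..12287), then
+	 *
+	 *   offset = 10000 & ~PAGE_CACHE_MASK = 1808;
+	 *   zero_user_segment(page, 1808, 4096);
+	 *
+	 * i.e. the tail of the page beyond EOF is cleared so a write fault
+	 * cannot expose stale data past the end of the file.
+	 */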
f2fs_vm_page_mkwrite, +	.remap_pages	= generic_file_remap_pages, +}; + +static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) +{ +	struct dentry *dentry; +	nid_t pino; + +	inode = igrab(inode); +	dentry = d_find_any_alias(inode); +	if (!dentry) { +		iput(inode); +		return 0; +	} +	pino = dentry->d_parent->d_inode->i_ino; +	dput(dentry); +	iput(inode); +	return !is_checkpointed_node(sbi, pino); +} + +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ +	struct inode *inode = file->f_mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	unsigned long long cur_version; +	int ret = 0; +	bool need_cp = false; +	struct writeback_control wbc = { +		.sync_mode = WB_SYNC_ALL, +		.nr_to_write = LONG_MAX, +		.for_reclaim = 0, +	}; + +	if (inode->i_sb->s_flags & MS_RDONLY) +		return 0; + +	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +	if (ret) +		return ret; + +	/* guarantee free sections for fsync */ +	f2fs_balance_fs(sbi); + +	mutex_lock(&inode->i_mutex); + +	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) +		goto out; + +	mutex_lock(&sbi->cp_mutex); +	cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); +	mutex_unlock(&sbi->cp_mutex); + +	if (F2FS_I(inode)->data_version != cur_version && +					!(inode->i_state & I_DIRTY)) +		goto out; +	F2FS_I(inode)->data_version--; + +	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) +		need_cp = true; +	else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) +		need_cp = true; +	else if (!space_for_roll_forward(sbi)) +		need_cp = true; +	else if (need_to_sync_dir(sbi, inode)) +		need_cp = true; + +	if (need_cp) { +		/* all the dirty node pages should be flushed for POR */ +		ret = f2fs_sync_fs(inode->i_sb, 1); +		clear_inode_flag(F2FS_I(inode), FI_NEED_CP); +	} else { +		/* if there is no written node page, write its inode page */ +		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { +			ret = f2fs_write_inode(inode, NULL); +			if (ret) +				goto out; +		} +		filemap_fdatawait_range(sbi->node_inode->i_mapping, +							0, LONG_MAX); +	} +out: +	mutex_unlock(&inode->i_mutex); +	return ret; +} + +static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ +	file_accessed(file); +	vma->vm_ops = &f2fs_file_vm_ops; +	return 0; +} + +static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ +	int nr_free = 0, ofs = dn->ofs_in_node; +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct f2fs_node *raw_node; +	__le32 *addr; + +	raw_node = page_address(dn->node_page); +	addr = blkaddr_in_node(raw_node) + ofs; + +	for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { +		block_t blkaddr = le32_to_cpu(*addr); +		if (blkaddr == NULL_ADDR) +			continue; + +		update_extent_cache(NULL_ADDR, dn); +		invalidate_blocks(sbi, blkaddr); +		dec_valid_block_count(sbi, dn->inode, 1); +		nr_free++; +	} +	if (nr_free) { +		set_page_dirty(dn->node_page); +		sync_inode_page(dn); +	} +	dn->ofs_in_node = ofs; +	return nr_free; +} + +void truncate_data_blocks(struct dnode_of_data *dn) +{ +	truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); +} + +static void truncate_partial_data_page(struct inode *inode, u64 from) +{ +	unsigned offset = from & (PAGE_CACHE_SIZE - 1); +	struct page *page; + +	if (!offset) +		return; + +	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); +	if (IS_ERR(page)) +		return; + +	lock_page(page); +	wait_on_page_writeback(page); +	zero_user(page, offset, PAGE_CACHE_SIZE - offset); +	set_page_dirty(page); +	
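+	/*
+	 * Summary of the f2fs_sync_file() decision above: fall back to a
+	 * full checkpoint whenever roll-forward recovery alone could not
+	 * rebuild this file. Equivalent single expression (illustrative):
+	 *
+	 *   need_cp = !S_ISREG(inode->i_mode) || inode->i_nlink != 1 ||
+	 *             is_inode_flag_set(F2FS_I(inode), FI_NEED_CP) ||
+	 *             !space_for_roll_forward(sbi) ||
+	 *             need_to_sync_dir(sbi, inode);
+	 *
+	 * Otherwise only the file's node pages (or just its inode page) are
+	 * written, which is what makes fsync cheap in the common case.
+	 */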
f2fs_put_page(page, 1); +} + +static int truncate_blocks(struct inode *inode, u64 from) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	unsigned int blocksize = inode->i_sb->s_blocksize; +	struct dnode_of_data dn; +	pgoff_t free_from; +	int count = 0; +	int err; + +	free_from = (pgoff_t) +			((from + blocksize - 1) >> (sbi->log_blocksize)); + +	mutex_lock_op(sbi, DATA_TRUNC); + +	set_new_dnode(&dn, inode, NULL, NULL, 0); +	err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); +	if (err) { +		if (err == -ENOENT) +			goto free_next; +		mutex_unlock_op(sbi, DATA_TRUNC); +		return err; +	} + +	if (IS_INODE(dn.node_page)) +		count = ADDRS_PER_INODE; +	else +		count = ADDRS_PER_BLOCK; + +	count -= dn.ofs_in_node; +	BUG_ON(count < 0); +	if (dn.ofs_in_node || IS_INODE(dn.node_page)) { +		truncate_data_blocks_range(&dn, count); +		free_from += count; +	} + +	f2fs_put_dnode(&dn); +free_next: +	err = truncate_inode_blocks(inode, free_from); +	mutex_unlock_op(sbi, DATA_TRUNC); + +	/* lastly zero out the first data page */ +	truncate_partial_data_page(inode, from); + +	return err; +} + +void f2fs_truncate(struct inode *inode) +{ +	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || +				S_ISLNK(inode->i_mode))) +		return; + +	if (!truncate_blocks(inode, i_size_read(inode))) { +		inode->i_mtime = inode->i_ctime = CURRENT_TIME; +		mark_inode_dirty(inode); +	} +} + +static int f2fs_getattr(struct vfsmount *mnt, +			 struct dentry *dentry, struct kstat *stat) +{ +	struct inode *inode = dentry->d_inode; +	generic_fillattr(inode, stat); +	stat->blocks <<= 3; +	return 0; +} + +#ifdef CONFIG_F2FS_FS_POSIX_ACL +static void __setattr_copy(struct inode *inode, const struct iattr *attr) +{ +	struct f2fs_inode_info *fi = F2FS_I(inode); +	unsigned int ia_valid = attr->ia_valid; + +	if (ia_valid & ATTR_UID) +		inode->i_uid = attr->ia_uid; +	if (ia_valid & ATTR_GID) +		inode->i_gid = attr->ia_gid; +	if (ia_valid & ATTR_ATIME) +		inode->i_atime = timespec_trunc(attr->ia_atime, +						inode->i_sb->s_time_gran); +	if (ia_valid & ATTR_MTIME) +		inode->i_mtime = timespec_trunc(attr->ia_mtime, +						inode->i_sb->s_time_gran); +	if (ia_valid & ATTR_CTIME) +		inode->i_ctime = timespec_trunc(attr->ia_ctime, +						inode->i_sb->s_time_gran); +	if (ia_valid & ATTR_MODE) { +		umode_t mode = attr->ia_mode; + +		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) +			mode &= ~S_ISGID; +		set_acl_inode(fi, mode); +	} +} +#else +#define __setattr_copy setattr_copy +#endif + +int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +{ +	struct inode *inode = dentry->d_inode; +	struct f2fs_inode_info *fi = F2FS_I(inode); +	int err; + +	err = inode_change_ok(inode, attr); +	if (err) +		return err; + +	if ((attr->ia_valid & ATTR_SIZE) && +			attr->ia_size != i_size_read(inode)) { +		truncate_setsize(inode, attr->ia_size); +		f2fs_truncate(inode); +		f2fs_balance_fs(F2FS_SB(inode->i_sb)); +	} + +	__setattr_copy(inode, attr); + +	if (attr->ia_valid & ATTR_MODE) { +		err = f2fs_acl_chmod(inode); +		if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { +			inode->i_mode = fi->i_acl_mode; +			clear_inode_flag(fi, FI_ACL_MODE); +		} +	} + +	mark_inode_dirty(inode); +	return err; +} + +const struct inode_operations f2fs_file_inode_operations = { +	.getattr	= f2fs_getattr, +	.setattr	= f2fs_setattr, +	.get_acl	= f2fs_get_acl, +#ifdef CONFIG_F2FS_FS_XATTR +	.setxattr	= generic_setxattr, +	.getxattr	= generic_getxattr, +	.listxattr	= f2fs_listxattr, +	.removexattr	= generic_removexattr, +#endif +}; + +static void fill_zero(struct 
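+/*
+ * Worked example for truncate_blocks() above, assuming a 4 KB block size
+ * (log_blocksize == 12) and from == 10000:
+ *
+ *   free_from = (10000 + 4095) >> 12 = 3;
+ *
+ * so blocks 3 and beyond are freed, blocks 0..2 survive, and
+ * truncate_partial_data_page() then zeroes bytes 1808..4095 of page 2
+ * (10000 & (PAGE_CACHE_SIZE - 1) == 1808), the page straddling new EOF.
+ */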
inode *inode, pgoff_t index, +					loff_t start, loff_t len) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct page *page; + +	if (!len) +		return; + +	f2fs_balance_fs(sbi); + +	mutex_lock_op(sbi, DATA_NEW); +	page = get_new_data_page(inode, index, false); +	mutex_unlock_op(sbi, DATA_NEW); + +	if (!IS_ERR(page)) { +		wait_on_page_writeback(page); +		zero_user(page, start, len); +		set_page_dirty(page); +		f2fs_put_page(page, 1); +	} +} + +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +{ +	pgoff_t index; +	int err; + +	for (index = pg_start; index < pg_end; index++) { +		struct dnode_of_data dn; +		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + +		f2fs_balance_fs(sbi); + +		mutex_lock_op(sbi, DATA_TRUNC); +		set_new_dnode(&dn, inode, NULL, NULL, 0); +		err = get_dnode_of_data(&dn, index, RDONLY_NODE); +		if (err) { +			mutex_unlock_op(sbi, DATA_TRUNC); +			if (err == -ENOENT) +				continue; +			return err; +		} + +		if (dn.data_blkaddr != NULL_ADDR) +			truncate_data_blocks_range(&dn, 1); +		f2fs_put_dnode(&dn); +		mutex_unlock_op(sbi, DATA_TRUNC); +	} +	return 0; +} + +static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +{ +	pgoff_t pg_start, pg_end; +	loff_t off_start, off_end; +	int ret = 0; + +	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; +	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + +	off_start = offset & (PAGE_CACHE_SIZE - 1); +	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + +	if (pg_start == pg_end) { +		fill_zero(inode, pg_start, off_start, +						off_end - off_start); +	} else { +		if (off_start) +			fill_zero(inode, pg_start++, off_start, +					PAGE_CACHE_SIZE - off_start); +		if (off_end) +			fill_zero(inode, pg_end, 0, off_end); + +		if (pg_start < pg_end) { +			struct address_space *mapping = inode->i_mapping; +			loff_t blk_start, blk_end; + +			blk_start = pg_start << PAGE_CACHE_SHIFT; +			blk_end = pg_end << PAGE_CACHE_SHIFT; +			truncate_inode_pages_range(mapping, blk_start, +					blk_end - 1); +			ret = truncate_hole(inode, pg_start, pg_end); +		} +	} + +	if (!(mode & FALLOC_FL_KEEP_SIZE) && +		i_size_read(inode) <= (offset + len)) { +		i_size_write(inode, offset); +		mark_inode_dirty(inode); +	} + +	return ret; +} + +static int expand_inode_data(struct inode *inode, loff_t offset, +					loff_t len, int mode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	pgoff_t index, pg_start, pg_end; +	loff_t new_size = i_size_read(inode); +	loff_t off_start, off_end; +	int ret = 0; + +	ret = inode_newsize_ok(inode, (len + offset)); +	if (ret) +		return ret; + +	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; +	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + +	off_start = offset & (PAGE_CACHE_SIZE - 1); +	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + +	for (index = pg_start; index <= pg_end; index++) { +		struct dnode_of_data dn; + +		mutex_lock_op(sbi, DATA_NEW); + +		set_new_dnode(&dn, inode, NULL, NULL, 0); +		ret = get_dnode_of_data(&dn, index, 0); +		if (ret) { +			mutex_unlock_op(sbi, DATA_NEW); +			break; +		} + +		if (dn.data_blkaddr == NULL_ADDR) { +			ret = reserve_new_block(&dn); +			if (ret) { +				f2fs_put_dnode(&dn); +				mutex_unlock_op(sbi, DATA_NEW); +				break; +			} +		} +		f2fs_put_dnode(&dn); + +		mutex_unlock_op(sbi, DATA_NEW); + +		if (pg_start == pg_end) +			new_size = offset + len; +		else if (index == pg_start && off_start) +			new_size = (index + 1) << PAGE_CACHE_SHIFT; +		else if (index == 
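+		/*
+		 * Worked example for punch_hole() above with 4 KB pages,
+		 * offset == 5000 and len == 10000 (range [5000, 15000)):
+		 *
+		 *   pg_start = 5000 >> 12 = 1;  off_start = 5000 & 4095 = 904;
+		 *   pg_end = 15000 >> 12 = 3;   off_end = 15000 & 4095 = 2712;
+		 *
+		 * so bytes 904..4095 of page 1 and 0..2711 of page 3 are
+		 * zeroed in place, while pages [2, 3) are dropped entirely
+		 * via truncate_hole(), becoming a real hole.
+		 */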
pg_end) +			new_size = (index << PAGE_CACHE_SHIFT) + off_end; +		else +			new_size += PAGE_CACHE_SIZE; +	} + +	if (!(mode & FALLOC_FL_KEEP_SIZE) && +		i_size_read(inode) < new_size) { +		i_size_write(inode, new_size); +		mark_inode_dirty(inode); +	} + +	return ret; +} + +static long f2fs_fallocate(struct file *file, int mode, +				loff_t offset, loff_t len) +{ +	struct inode *inode = file_inode(file); +	long ret; + +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) +		return -EOPNOTSUPP; + +	if (mode & FALLOC_FL_PUNCH_HOLE) +		ret = punch_hole(inode, offset, len, mode); +	else +		ret = expand_inode_data(inode, offset, len, mode); + +	if (!ret) { +		inode->i_mtime = inode->i_ctime = CURRENT_TIME; +		mark_inode_dirty(inode); +	} +	return ret; +} + +#define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ +	if (S_ISDIR(mode)) +		return flags; +	else if (S_ISREG(mode)) +		return flags & F2FS_REG_FLMASK; +	else +		return flags & F2FS_OTHER_FLMASK; +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ +	struct inode *inode = file_inode(filp); +	struct f2fs_inode_info *fi = F2FS_I(inode); +	unsigned int flags; +	int ret; + +	switch (cmd) { +	case FS_IOC_GETFLAGS: +		flags = fi->i_flags & FS_FL_USER_VISIBLE; +		return put_user(flags, (int __user *) arg); +	case FS_IOC_SETFLAGS: +	{ +		unsigned int oldflags; + +		ret = mnt_want_write(filp->f_path.mnt); +		if (ret) +			return ret; + +		if (!inode_owner_or_capable(inode)) { +			ret = -EACCES; +			goto out; +		} + +		if (get_user(flags, (int __user *) arg)) { +			ret = -EFAULT; +			goto out; +		} + +		flags = f2fs_mask_flags(inode->i_mode, flags); + +		mutex_lock(&inode->i_mutex); + +		oldflags = fi->i_flags; + +		if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { +			if (!capable(CAP_LINUX_IMMUTABLE)) { +				mutex_unlock(&inode->i_mutex); +				ret = -EPERM; +				goto out; +			} +		} + +		flags = flags & FS_FL_USER_MODIFIABLE; +		flags |= oldflags & ~FS_FL_USER_MODIFIABLE; +		fi->i_flags = flags; +		mutex_unlock(&inode->i_mutex); + +		f2fs_set_inode_flags(inode); +		inode->i_ctime = CURRENT_TIME; +		mark_inode_dirty(inode); +out: +		mnt_drop_write(filp->f_path.mnt); +		return ret; +	} +	default: +		return -ENOTTY; +	} +} + +#ifdef CONFIG_COMPAT +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ +	switch (cmd) { +	case F2FS_IOC32_GETFLAGS: +		cmd = F2FS_IOC_GETFLAGS; +		break; +	case F2FS_IOC32_SETFLAGS: +		cmd = F2FS_IOC_SETFLAGS; +		break; +	default: +		return -ENOIOCTLCMD; +	} +	return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +const struct file_operations f2fs_file_operations = { +	.llseek		= generic_file_llseek, +	.read		= do_sync_read, +	.write		= do_sync_write, +	.aio_read	= generic_file_aio_read, +	.aio_write	= generic_file_aio_write, +	.open		= generic_file_open, +	.mmap		= f2fs_file_mmap, +	.fsync		= f2fs_sync_file, +	.fallocate	= f2fs_fallocate, +	.unlocked_ioctl	= f2fs_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl	= f2fs_compat_ioctl, +#endif +	.splice_read	= generic_file_splice_read, +	.splice_write	= generic_file_splice_write, +}; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c new file mode 100644 index 00000000000..94b8a0c4845 --- /dev/null +++ b/fs/f2fs/gc.c @@ -0,0 +1,698 @@ +/* + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
+ *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/f2fs_fs.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/blkdev.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static struct kmem_cache *winode_slab; + +static int gc_thread_func(void *data) +{ +	struct f2fs_sb_info *sbi = data; +	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; +	long wait_ms; + +	wait_ms = GC_THREAD_MIN_SLEEP_TIME; + +	do { +		if (try_to_freeze()) +			continue; +		else +			wait_event_interruptible_timeout(*wq, +						kthread_should_stop(), +						msecs_to_jiffies(wait_ms)); +		if (kthread_should_stop()) +			break; + +		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { +			wait_ms = GC_THREAD_MAX_SLEEP_TIME; +			continue; +		} + +		/* +		 * [GC triggering condition] +		 * 0. GC is not conducted currently. +		 * 1. There are enough dirty segments. +		 * 2. IO subsystem is idle by checking the # of writeback pages. +		 * 3. IO subsystem is idle by checking the # of requests in +		 *    bdev's request list. +		 * +		 * Note) We have to avoid triggering GCs too much frequently. +		 * Because it is possible that some segments can be +		 * invalidated soon after by user update or deletion. +		 * So, I'd like to wait some time to collect dirty segments. +		 */ +		if (!mutex_trylock(&sbi->gc_mutex)) +			continue; + +		if (!is_idle(sbi)) { +			wait_ms = increase_sleep_time(wait_ms); +			mutex_unlock(&sbi->gc_mutex); +			continue; +		} + +		if (has_enough_invalid_blocks(sbi)) +			wait_ms = decrease_sleep_time(wait_ms); +		else +			wait_ms = increase_sleep_time(wait_ms); + +		sbi->bg_gc++; + +		/* if return value is not zero, no victim was selected */ +		if (f2fs_gc(sbi)) +			wait_ms = GC_THREAD_NOGC_SLEEP_TIME; +		else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) +			wait_ms = GC_THREAD_MAX_SLEEP_TIME; + +	} while (!kthread_should_stop()); +	return 0; +} + +int start_gc_thread(struct f2fs_sb_info *sbi) +{ +	struct f2fs_gc_kthread *gc_th; +	dev_t dev = sbi->sb->s_bdev->bd_dev; + +	if (!test_opt(sbi, BG_GC)) +		return 0; +	gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); +	if (!gc_th) +		return -ENOMEM; + +	sbi->gc_thread = gc_th; +	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); +	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, +			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); +	if (IS_ERR(gc_th->f2fs_gc_task)) { +		kfree(gc_th); +		sbi->gc_thread = NULL; +		return -ENOMEM; +	} +	return 0; +} + +void stop_gc_thread(struct f2fs_sb_info *sbi) +{ +	struct f2fs_gc_kthread *gc_th = sbi->gc_thread; +	if (!gc_th) +		return; +	kthread_stop(gc_th->f2fs_gc_task); +	kfree(gc_th); +	sbi->gc_thread = NULL; +} + +static int select_gc_type(int gc_type) +{ +	return (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; +} + +static void select_policy(struct f2fs_sb_info *sbi, int gc_type, +			int type, struct victim_sel_policy *p) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + +	if (p->alloc_mode) { +		p->gc_mode = GC_GREEDY; +		p->dirty_segmap = dirty_i->dirty_segmap[type]; +		p->ofs_unit = 1; +	} else { +		p->gc_mode = select_gc_type(gc_type); +		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; +		p->ofs_unit = sbi->segs_per_sec; +	} +	p->offset = sbi->last_victim[p->gc_mode]; +} + +static unsigned int get_max_cost(struct f2fs_sb_info *sbi, +				struct victim_sel_policy *p) +{ +	/* SSR allocates in a segment unit */ +	if (p->alloc_mode == SSR) +		return 1 << sbi->log_blocks_per_seg; +	if (p->gc_mode == GC_GREEDY) +		return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; +	else if (p->gc_mode == GC_CB) +		return UINT_MAX; +	else /* No other gc_mode */ +		return 0; +} + +static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned int segno; + +	/* +	 * If the gc_type is FG_GC, we can select victim segments +	 * selected by background GC before. +	 * Those segments guarantee they have small valid blocks. +	 */ +	segno = find_next_bit(dirty_i->victim_segmap[BG_GC], +						TOTAL_SEGS(sbi), 0); +	if (segno < TOTAL_SEGS(sbi)) { +		clear_bit(segno, dirty_i->victim_segmap[BG_GC]); +		return segno; +	} +	return NULL_SEGNO; +} + +static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned int secno = GET_SECNO(sbi, segno); +	unsigned int start = secno * sbi->segs_per_sec; +	unsigned long long mtime = 0; +	unsigned int vblocks; +	unsigned char age = 0; +	unsigned char u; +	unsigned int i; + +	for (i = 0; i < sbi->segs_per_sec; i++) +		mtime += get_seg_entry(sbi, start + i)->mtime; +	vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + +	mtime = div_u64(mtime, sbi->segs_per_sec); +	vblocks = div_u64(vblocks, sbi->segs_per_sec); + +	u = (vblocks * 100) >> sbi->log_blocks_per_seg; + +	/* Handle if the system time is changed by user */ +	if (mtime < sit_i->min_mtime) +		sit_i->min_mtime = mtime; +	if (mtime > sit_i->max_mtime) +		sit_i->max_mtime = mtime; +	if (sit_i->max_mtime != sit_i->min_mtime) +		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), +				sit_i->max_mtime - sit_i->min_mtime); + +	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); +} + +static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, +					struct victim_sel_policy *p) +{ +	if (p->alloc_mode == SSR) +		return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + +	/* alloc_mode == LFS */ +	if (p->gc_mode == GC_GREEDY) +		return get_valid_blocks(sbi, segno, sbi->segs_per_sec); +	else +		return get_cb_cost(sbi, segno); +} + +/* + * This function is called from two pathes. + * One is garbage collection and the other is SSR segment selection. + * When it is called during GC, it just gets a victim segment + * and it does not remove it from dirty seglist. + * When it is called from SSR segment selection, it finds a segment + * which has minimum valid blocks and removes it from dirty seglist. 
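
get_cb_cost() above is the classic cost-benefit victim policy: sections that are old (high age) and mostly invalid (low utilization u) yield the greatest benefit per unit of copy work, and subtracting from UINT_MAX turns "greatest benefit" into "smallest cost" so the min-cost victim search picks them. A standalone sketch of just the final formula, with u as valid-block utilization in percent and age normalized to [0, 100] (cb_cost and the sample values are illustrative):

#include <stdio.h>
#include <limits.h>

static unsigned int cb_cost(unsigned int u, unsigned int age)
{
	/* same expression as get_cb_cost(): smaller result = better victim */
	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

int main(void)
{
	/* old, nearly empty section: large benefit, small cost */
	printf("%u\n", cb_cost(10, 90));	/* UINT_MAX - 7363 */
	/* young, nearly full section: tiny benefit, near-max cost */
	printf("%u\n", cb_cost(90, 10));	/* UINT_MAX - 52 */
	return 0;
}
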
+ */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, +		unsigned int *result, int gc_type, int type, char alloc_mode) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	struct victim_sel_policy p; +	unsigned int segno; +	int nsearched = 0; + +	p.alloc_mode = alloc_mode; +	select_policy(sbi, gc_type, type, &p); + +	p.min_segno = NULL_SEGNO; +	p.min_cost = get_max_cost(sbi, &p); + +	mutex_lock(&dirty_i->seglist_lock); + +	if (p.alloc_mode == LFS && gc_type == FG_GC) { +		p.min_segno = check_bg_victims(sbi); +		if (p.min_segno != NULL_SEGNO) +			goto got_it; +	} + +	while (1) { +		unsigned long cost; + +		segno = find_next_bit(p.dirty_segmap, +						TOTAL_SEGS(sbi), p.offset); +		if (segno >= TOTAL_SEGS(sbi)) { +			if (sbi->last_victim[p.gc_mode]) { +				sbi->last_victim[p.gc_mode] = 0; +				p.offset = 0; +				continue; +			} +			break; +		} +		p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + +		if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) +			continue; +		if (gc_type == BG_GC && +				test_bit(segno, dirty_i->victim_segmap[BG_GC])) +			continue; +		if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) +			continue; + +		cost = get_gc_cost(sbi, segno, &p); + +		if (p.min_cost > cost) { +			p.min_segno = segno; +			p.min_cost = cost; +		} + +		if (cost == get_max_cost(sbi, &p)) +			continue; + +		if (nsearched++ >= MAX_VICTIM_SEARCH) { +			sbi->last_victim[p.gc_mode] = segno; +			break; +		} +	} +got_it: +	if (p.min_segno != NULL_SEGNO) { +		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +		if (p.alloc_mode == LFS) { +			int i; +			for (i = 0; i < p.ofs_unit; i++) +				set_bit(*result + i, +					dirty_i->victim_segmap[gc_type]); +		} +	} +	mutex_unlock(&dirty_i->seglist_lock); + +	return (p.min_segno == NULL_SEGNO) ? 0 : 1; +} + +static const struct victim_selection default_v_ops = { +	.get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) +{ +	struct list_head *this; +	struct inode_entry *ie; + +	list_for_each(this, ilist) { +		ie = list_entry(this, struct inode_entry, list); +		if (ie->inode->i_ino == ino) +			return ie->inode; +	} +	return NULL; +} + +static void add_gc_inode(struct inode *inode, struct list_head *ilist) +{ +	struct list_head *this; +	struct inode_entry *new_ie, *ie; + +	list_for_each(this, ilist) { +		ie = list_entry(this, struct inode_entry, list); +		if (ie->inode == inode) { +			iput(inode); +			return; +		} +	} +repeat: +	new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); +	if (!new_ie) { +		cond_resched(); +		goto repeat; +	} +	new_ie->inode = inode; +	list_add_tail(&new_ie->list, ilist); +} + +static void put_gc_inode(struct list_head *ilist) +{ +	struct inode_entry *ie, *next_ie; +	list_for_each_entry_safe(ie, next_ie, ilist, list) { +		iput(ie->inode); +		list_del(&ie->list); +		kmem_cache_free(winode_slab, ie); +	} +} + +static int check_valid_map(struct f2fs_sb_info *sbi, +				unsigned int segno, int offset) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	struct seg_entry *sentry; +	int ret; + +	mutex_lock(&sit_i->sentry_lock); +	sentry = get_seg_entry(sbi, segno); +	ret = f2fs_test_bit(offset, sentry->cur_valid_map); +	mutex_unlock(&sit_i->sentry_lock); +	return ret; +} + +/* + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. 
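
The find_gc_inode()/add_gc_inode()/put_gc_inode() helpers above keep every inode touched during a data-GC pass referenced exactly once on a local list, so later phases can reuse the same struct inode and all references are dropped in one sweep at the end. A userspace sketch of the same pattern with a plain singly-linked list and a stand-in refcount (struct ientry and these helpers are invented; the kernel version retries its slab allocation instead of assuming malloc succeeds):

#include <stdio.h>
#include <stdlib.h>

struct inode { unsigned long ino; int count; };
struct ientry { struct inode *inode; struct ientry *next; };

static void iput(struct inode *inode) { inode->count--; }

/* park an inode once; drop the duplicate reference on repeats */
static void add_gc_inode(struct inode *inode, struct ientry **list)
{
	struct ientry *e;

	for (e = *list; e; e = e->next)
		if (e->inode == inode) {
			iput(inode);
			return;
		}
	e = malloc(sizeof(*e));
	e->inode = inode;
	e->next = *list;
	*list = e;
}

static void put_gc_inodes(struct ientry **list)
{
	while (*list) {
		struct ientry *e = *list;

		*list = e->next;
		iput(e->inode);
		free(e);
	}
}

int main(void)
{
	struct inode a = { 3, 0 };
	struct ientry *list = NULL;

	a.count++; add_gc_inode(&a, &list);	/* first victim block of inode 3 */
	a.count++; add_gc_inode(&a, &list);	/* second block: reference deduped */
	put_gc_inodes(&list);
	printf("refcount back to %d\n", a.count);	/* prints 0 */
	return 0;
}
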
+ */ +static void gc_node_segment(struct f2fs_sb_info *sbi, +		struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ +	bool initial = true; +	struct f2fs_summary *entry; +	int off; + +next_step: +	entry = sum; +	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { +		nid_t nid = le32_to_cpu(entry->nid); +		struct page *node_page; + +		/* stop BG_GC if there is not enough free sections. */ +		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) +			return; + +		if (check_valid_map(sbi, segno, off) == 0) +			continue; + +		if (initial) { +			ra_node_page(sbi, nid); +			continue; +		} +		node_page = get_node_page(sbi, nid); +		if (IS_ERR(node_page)) +			continue; + +		/* set page dirty and write it */ +		if (!PageWriteback(node_page)) +			set_page_dirty(node_page); +		f2fs_put_page(node_page, 1); +		stat_inc_node_blk_count(sbi, 1); +	} +	if (initial) { +		initial = false; +		goto next_step; +	} + +	if (gc_type == FG_GC) { +		struct writeback_control wbc = { +			.sync_mode = WB_SYNC_ALL, +			.nr_to_write = LONG_MAX, +			.for_reclaim = 0, +		}; +		sync_node_pages(sbi, 0, &wbc); +	} +} + +/* + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. + */ +block_t start_bidx_of_node(unsigned int node_ofs) +{ +	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; +	unsigned int bidx; + +	if (node_ofs == 0) +		return 0; + +	if (node_ofs <= 2) { +		bidx = node_ofs - 1; +	} else if (node_ofs <= indirect_blks) { +		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); +		bidx = node_ofs - 2 - dec; +	} else { +		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); +		bidx = node_ofs - 5 - dec; +	} +	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; +} + +static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +		struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ +	struct page *node_page; +	nid_t nid; +	unsigned int ofs_in_node; +	block_t source_blkaddr; + +	nid = le32_to_cpu(sum->nid); +	ofs_in_node = le16_to_cpu(sum->ofs_in_node); + +	node_page = get_node_page(sbi, nid); +	if (IS_ERR(node_page)) +		return 0; + +	get_node_info(sbi, nid, dni); + +	if (sum->version != dni->version) { +		f2fs_put_page(node_page, 1); +		return 0; +	} + +	*nofs = ofs_of_node(node_page); +	source_blkaddr = datablock_addr(node_page, ofs_in_node); +	f2fs_put_page(node_page, 1); + +	if (source_blkaddr != blkaddr) +		return 0; +	return 1; +} + +static void move_data_page(struct inode *inode, struct page *page, int gc_type) +{ +	if (page->mapping != inode->i_mapping) +		goto out; + +	if (inode != page->mapping->host) +		goto out; + +	if (PageWriteback(page)) +		goto out; + +	if (gc_type == BG_GC) { +		set_page_dirty(page); +		set_cold_data(page); +	} else { +		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +		mutex_lock_op(sbi, DATA_WRITE); +		if (clear_page_dirty_for_io(page) && +			S_ISDIR(inode->i_mode)) { +			dec_page_count(sbi, F2FS_DIRTY_DENTS); +			inode_dec_dirty_dents(inode); +		} +		set_cold_data(page); +		do_write_data_page(page); +		mutex_unlock_op(sbi, DATA_WRITE); +		clear_cold_data(page); +	} +out: +	f2fs_put_page(page, 1); +} + +/* + * This function tries to get parent node of victim data block, and identifies + * data block validity. 
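
start_bidx_of_node() above maps a direct node's offset within the inode's node tree to the file block index of its first data pointer; the three branches cover the two in-inode direct nodes, direct nodes under the two single-indirect nodes, and direct nodes under the double-indirect node. A standalone replica using the 4 KB-block constants from f2fs_fs.h (923 data pointers in the inode, 1018 per direct node, 1018 node slots per indirect node), so the boundary values can be checked by hand:

#include <stdio.h>

#define ADDRS_PER_INODE 923
#define ADDRS_PER_BLOCK 1018
#define NIDS_PER_BLOCK  1018

static unsigned long start_bidx_of_node(unsigned int node_ofs)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)		/* the inode block itself */
		return 0;
	if (node_ofs <= 2) {		/* the two direct nodes */
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		/* direct nodes under the two single indirects */
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 2 - dec;
	} else {
		/* direct nodes under the double indirect */
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 5 - dec;
	}
	return (unsigned long)bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
}

int main(void)
{
	printf("%lu\n", start_bidx_of_node(1));	/* 923: first direct node  */
	printf("%lu\n", start_bidx_of_node(2));	/* 1941: second direct node */
	printf("%lu\n", start_bidx_of_node(4));	/* 2959: first child of IND1 */
	return 0;
}
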
If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +		struct list_head *ilist, unsigned int segno, int gc_type) +{ +	struct super_block *sb = sbi->sb; +	struct f2fs_summary *entry; +	block_t start_addr; +	int off; +	int phase = 0; + +	start_addr = START_BLOCK(sbi, segno); + +next_step: +	entry = sum; +	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { +		struct page *data_page; +		struct inode *inode; +		struct node_info dni; /* dnode info for the data */ +		unsigned int ofs_in_node, nofs; +		block_t start_bidx; + +		/* stop BG_GC if there is not enough free sections. */ +		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) +			return; + +		if (check_valid_map(sbi, segno, off) == 0) +			continue; + +		if (phase == 0) { +			ra_node_page(sbi, le32_to_cpu(entry->nid)); +			continue; +		} + +		/* Get an inode by ino with checking validity */ +		if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) +			continue; + +		if (phase == 1) { +			ra_node_page(sbi, dni.ino); +			continue; +		} + +		start_bidx = start_bidx_of_node(nofs); +		ofs_in_node = le16_to_cpu(entry->ofs_in_node); + +		if (phase == 2) { +			inode = f2fs_iget(sb, dni.ino); +			if (IS_ERR(inode)) +				continue; + +			data_page = find_data_page(inode, +					start_bidx + ofs_in_node); +			if (IS_ERR(data_page)) +				goto next_iput; + +			f2fs_put_page(data_page, 0); +			add_gc_inode(inode, ilist); +		} else { +			inode = find_gc_inode(dni.ino, ilist); +			if (inode) { +				data_page = get_lock_data_page(inode, +						start_bidx + ofs_in_node); +				if (IS_ERR(data_page)) +					continue; +				move_data_page(inode, data_page, gc_type); +				stat_inc_data_blk_count(sbi, 1); +			} +		} +		continue; +next_iput: +		iput(inode); +	} +	if (++phase < 4) +		goto next_step; + +	if (gc_type == FG_GC) +		f2fs_submit_bio(sbi, DATA, true); +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, +						int gc_type, int type) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	int ret; +	mutex_lock(&sit_i->sentry_lock); +	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); +	mutex_unlock(&sit_i->sentry_lock); +	return ret; +} + +static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +				struct list_head *ilist, int gc_type) +{ +	struct page *sum_page; +	struct f2fs_summary_block *sum; + +	/* read segment summary of victim */ +	sum_page = get_sum_page(sbi, segno); +	if (IS_ERR(sum_page)) +		return; + +	/* +	 * CP needs to lock sum_page. In this time, we don't need +	 * to lock this page, because this summary page is not gone anywhere. +	 * Also, this page is not gonna be updated before GC is done. 
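
gc_data_segment() above walks the same summary entries four times, and the early passes only issue readahead so the blocking lookups of the later passes mostly hit the page cache. A schematic sketch of that pipeline (the phase names and the 512-entry count, which assumes the default 2 MB segment of 4 KB blocks, are illustrative only):

#include <stdio.h>

enum { RA_NODES, CHECK_AND_RA_INODES, PIN_INODES, MOVE_DATA, NR_PHASES };

int main(void)
{
	static const char *work[NR_PHASES] = {
		"readahead every entry's node page",
		"verify the dnode, readahead the owning inode",
		"iget() the inode, readahead its data page, park it on ilist",
		"lock the data page and rewrite it into a new segment",
	};
	int blocks_per_seg = 512;	/* default: 2 MB segment / 4 KB block */
	int phase;

	for (phase = 0; phase < NR_PHASES; phase++)
		printf("pass %d over %d summary entries: %s\n",
		       phase, blocks_per_seg, work[phase]);
	return 0;
}
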
+	 */ +	unlock_page(sum_page); +	sum = page_address(sum_page); + +	switch (GET_SUM_TYPE((&sum->footer))) { +	case SUM_TYPE_NODE: +		gc_node_segment(sbi, sum->entries, segno, gc_type); +		break; +	case SUM_TYPE_DATA: +		gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); +		break; +	} +	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); +	stat_inc_call_count(sbi->stat_info); + +	f2fs_put_page(sum_page, 0); +} + +int f2fs_gc(struct f2fs_sb_info *sbi) +{ +	struct list_head ilist; +	unsigned int segno, i; +	int gc_type = BG_GC; +	int nfree = 0; +	int ret = -1; + +	INIT_LIST_HEAD(&ilist); +gc_more: +	if (!(sbi->sb->s_flags & MS_ACTIVE)) +		goto stop; + +	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) +		gc_type = FG_GC; + +	if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) +		goto stop; +	ret = 0; + +	for (i = 0; i < sbi->segs_per_sec; i++) +		do_garbage_collect(sbi, segno + i, &ilist, gc_type); + +	if (gc_type == FG_GC && +			get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) +		nfree++; + +	if (has_not_enough_free_secs(sbi, nfree)) +		goto gc_more; + +	if (gc_type == FG_GC) +		write_checkpoint(sbi, false); +stop: +	mutex_unlock(&sbi->gc_mutex); + +	put_gc_inode(&ilist); +	return ret; +} + +void build_gc_manager(struct f2fs_sb_info *sbi) +{ +	DIRTY_I(sbi)->v_ops = &default_v_ops; +} + +int __init create_gc_caches(void) +{ +	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", +			sizeof(struct inode_entry), NULL); +	if (!winode_slab) +		return -ENOMEM; +	return 0; +} + +void destroy_gc_caches(void) +{ +	kmem_cache_destroy(winode_slab); +} diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h new file mode 100644 index 00000000000..30b2db003ac --- /dev/null +++ b/fs/f2fs/gc.h @@ -0,0 +1,96 @@ +/* + * fs/f2fs/gc.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define GC_THREAD_MIN_WB_PAGES		1	/* +						 * a threshold to determine +						 * whether IO subsystem is idle +						 * or not +						 */ +#define GC_THREAD_MIN_SLEEP_TIME	10000 /* milliseconds */ +#define GC_THREAD_MAX_SLEEP_TIME	30000 +#define GC_THREAD_NOGC_SLEEP_TIME	10000 +#define LIMIT_INVALID_BLOCK	40 /* percentage over total user space */ +#define LIMIT_FREE_BLOCK	40 /* percentage over invalid + free space */ + +/* Search max. 
number of dirty segments to select a victim segment */ +#define MAX_VICTIM_SEARCH	20 + +struct f2fs_gc_kthread { +	struct task_struct *f2fs_gc_task; +	wait_queue_head_t gc_wait_queue_head; +}; + +struct inode_entry { +	struct list_head list; +	struct inode *inode; +}; + +/* + * inline functions + */ +static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) +{ +	if (free_segments(sbi) < overprovision_segments(sbi)) +		return 0; +	else +		return (free_segments(sbi) - overprovision_segments(sbi)) +			<< sbi->log_blocks_per_seg; +} + +static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +{ +	return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; +} + +static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +{ +	block_t reclaimable_user_blocks = sbi->user_block_count - +		written_block_count(sbi); +	return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; +} + +static inline long increase_sleep_time(long wait) +{ +	wait += GC_THREAD_MIN_SLEEP_TIME; +	if (wait > GC_THREAD_MAX_SLEEP_TIME) +		wait = GC_THREAD_MAX_SLEEP_TIME; +	return wait; +} + +static inline long decrease_sleep_time(long wait) +{ +	wait -= GC_THREAD_MIN_SLEEP_TIME; +	if (wait <= GC_THREAD_MIN_SLEEP_TIME) +		wait = GC_THREAD_MIN_SLEEP_TIME; +	return wait; +} + +static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) +{ +	block_t invalid_user_blocks = sbi->user_block_count - +					written_block_count(sbi); +	/* +	 * Background GC is triggered with the following condition. +	 * 1. There are a number of invalid blocks. +	 * 2. There is not enough free space. +	 */ +	if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && +			free_user_blocks(sbi) < limit_free_user_blocks(sbi)) +		return true; +	return false; +} + +static inline int is_idle(struct f2fs_sb_info *sbi) +{ +	struct block_device *bdev = sbi->sb->s_bdev; +	struct request_queue *q = bdev_get_queue(bdev); +	struct request_list *rl = &q->root_rl; +	return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); +} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c new file mode 100644 index 00000000000..6eb8d269b53 --- /dev/null +++ b/fs/f2fs/hash.c @@ -0,0 +1,101 @@ +/* + * fs/f2fs/hash.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
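
has_enough_invalid_blocks() above arms background GC only when both thresholds fire: invalid blocks exceed LIMIT_INVALID_BLOCK percent of the user blocks and free space has fallen below LIMIT_FREE_BLOCK percent of the reclaimable space. A standalone sketch with the 40/40 defaults (should_bg_gc and the sample volumes are invented; note that invalid and reclaimable are the same user-minus-written quantity in this model):

#include <stdio.h>
#include <stdbool.h>

#define LIMIT_INVALID_BLOCK	40	/* defaults from gc.h above */
#define LIMIT_FREE_BLOCK	40

static bool should_bg_gc(long user, long written, long free_blks)
{
	long invalid = user - written;		/* invalidated user blocks */
	long reclaimable = user - written;	/* space GC could win back */

	return invalid > user * LIMIT_INVALID_BLOCK / 100 &&
	       free_blks < reclaimable * LIMIT_FREE_BLOCK / 100;
}

int main(void)
{
	/* 1M user blocks, half invalid, little free space: trigger */
	printf("%d\n", should_bg_gc(1000000, 500000, 100000));	/* 1 */
	/* mostly-live data: nothing worth collecting yet */
	printf("%d\n", should_bg_gc(1000000, 900000, 100000));	/* 0 */
	return 0;
}
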
+ */ +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/cryptohash.h> +#include <linux/pagemap.h> + +#include "f2fs.h" + +/* + * Hashing code copied from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(unsigned int buf[4], unsigned int const in[]) +{ +	__u32 sum = 0; +	__u32 b0 = buf[0], b1 = buf[1]; +	__u32 a = in[0], b = in[1], c = in[2], d = in[3]; +	int n = 16; + +	do { +		sum += DELTA; +		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); +		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); +	} while (--n); + +	buf[0] += b0; +	buf[1] += b1; +} + +static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) +{ +	unsigned pad, val; +	int i; + +	pad = (__u32)len | ((__u32)len << 8); +	pad |= pad << 16; + +	val = pad; +	if (len > num * 4) +		len = num * 4; +	for (i = 0; i < len; i++) { +		if ((i % 4) == 0) +			val = pad; +		val = msg[i] + (val << 8); +		if ((i % 4) == 3) { +			*buf++ = val; +			val = pad; +			num--; +		} +	} +	if (--num >= 0) +		*buf++ = val; +	while (--num >= 0) +		*buf++ = pad; +} + +f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +{ +	__u32 hash; +	f2fs_hash_t f2fs_hash; +	const char *p; +	__u32 in[8], buf[4]; + +	if ((len <= 2) && (name[0] == '.') && +		(name[1] == '.' || name[1] == '\0')) +		return 0; + +	/* Initialize the default seed for the hash checksum functions */ +	buf[0] = 0x67452301; +	buf[1] = 0xefcdab89; +	buf[2] = 0x98badcfe; +	buf[3] = 0x10325476; + +	p = name; +	while (1) { +		str2hashbuf(p, len, in, 4); +		TEA_transform(buf, in); +		p += 16; +		if (len <= 16) +			break; +		len -= 16; +	} +	hash = buf[0]; +	f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); +	return f2fs_hash; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c new file mode 100644 index 00000000000..ddae412d30c --- /dev/null +++ b/fs/f2fs/inode.c @@ -0,0 +1,259 @@ +/* + * fs/f2fs/inode.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
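
f2fs_dentry_hash() above consumes the name sixteen bytes per TEA pass and masks the collision bit out of the result. A userspace harness that mirrors it so a hash can be computed outside the kernel (F2FS_HASH_COL_BIT is assumed to be bit 31 as defined in f2fs_fs.h, and the "." / ".." shortcut returning 0 is omitted here):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DELTA 0x9E3779B9
#define F2FS_HASH_COL_BIT 0x80000000u	/* assumed: top bit, per f2fs_fs.h */

static void tea_transform(uint32_t buf[4], const uint32_t in[8])
{
	uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
	uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
	int n = 16;

	do {	/* 16 rounds of TEA-style mixing, as in the patch */
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	} while (--n);
	buf[0] += b0;
	buf[1] += b1;
}

static void str2hashbuf(const char *msg, size_t len, uint32_t *buf, int num)
{
	uint32_t pad = (uint32_t)len | ((uint32_t)len << 8);
	uint32_t val;
	size_t i;

	pad |= pad << 16;
	val = pad;
	if (len > (size_t)num * 4)
		len = (size_t)num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

int main(void)
{
	const char *name = "example.txt";
	size_t len = strlen(name);
	uint32_t in[8], buf[4] = { 0x67452301, 0xefcdab89,
				   0x98badcfe, 0x10325476 };
	const char *p = name;

	while (1) {		/* one TEA pass per 16 bytes of the name */
		str2hashbuf(p, len, in, 4);
		tea_transform(buf, in);
		p += 16;
		if (len <= 16)
			break;
		len -= 16;
	}
	printf("hash = 0x%08x\n", (unsigned)(buf[0] & ~F2FS_HASH_COL_BIT));
	return 0;
}
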
+ */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> + +#include "f2fs.h" +#include "node.h" + +void f2fs_set_inode_flags(struct inode *inode) +{ +	unsigned int flags = F2FS_I(inode)->i_flags; + +	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | +			S_NOATIME | S_DIRSYNC); + +	if (flags & FS_SYNC_FL) +		inode->i_flags |= S_SYNC; +	if (flags & FS_APPEND_FL) +		inode->i_flags |= S_APPEND; +	if (flags & FS_IMMUTABLE_FL) +		inode->i_flags |= S_IMMUTABLE; +	if (flags & FS_NOATIME_FL) +		inode->i_flags |= S_NOATIME; +	if (flags & FS_DIRSYNC_FL) +		inode->i_flags |= S_DIRSYNC; +} + +static int do_read_inode(struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct f2fs_inode_info *fi = F2FS_I(inode); +	struct page *node_page; +	struct f2fs_node *rn; +	struct f2fs_inode *ri; + +	/* Check if ino is within scope */ +	check_nid_range(sbi, inode->i_ino); + +	node_page = get_node_page(sbi, inode->i_ino); +	if (IS_ERR(node_page)) +		return PTR_ERR(node_page); + +	rn = page_address(node_page); +	ri = &(rn->i); + +	inode->i_mode = le16_to_cpu(ri->i_mode); +	i_uid_write(inode, le32_to_cpu(ri->i_uid)); +	i_gid_write(inode, le32_to_cpu(ri->i_gid)); +	set_nlink(inode, le32_to_cpu(ri->i_links)); +	inode->i_size = le64_to_cpu(ri->i_size); +	inode->i_blocks = le64_to_cpu(ri->i_blocks); + +	inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); +	inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); +	inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); +	inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); +	inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); +	inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); +	inode->i_generation = le32_to_cpu(ri->i_generation); +	if (ri->i_addr[0]) +		inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); +	else +		inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + +	fi->i_current_depth = le32_to_cpu(ri->i_current_depth); +	fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); +	fi->i_flags = le32_to_cpu(ri->i_flags); +	fi->flags = 0; +	fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; +	fi->i_advise = ri->i_advise; +	fi->i_pino = le32_to_cpu(ri->i_pino); +	get_extent_info(&fi->ext, ri->i_ext); +	f2fs_put_page(node_page, 1); +	return 0; +} + +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode; +	int ret; + +	inode = iget_locked(sb, ino); +	if (!inode) +		return ERR_PTR(-ENOMEM); +	if (!(inode->i_state & I_NEW)) +		return inode; +	if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) +		goto make_now; + +	ret = do_read_inode(inode); +	if (ret) +		goto bad_inode; + +	if (!sbi->por_doing && inode->i_nlink == 0) { +		ret = -ENOENT; +		goto bad_inode; +	} + +make_now: +	if (ino == F2FS_NODE_INO(sbi)) { +		inode->i_mapping->a_ops = &f2fs_node_aops; +		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); +	} else if (ino == F2FS_META_INO(sbi)) { +		inode->i_mapping->a_ops = &f2fs_meta_aops; +		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); +	} else if (S_ISREG(inode->i_mode)) { +		inode->i_op = &f2fs_file_inode_operations; +		inode->i_fop = &f2fs_file_operations; +		inode->i_mapping->a_ops = &f2fs_dblock_aops; +	} else if (S_ISDIR(inode->i_mode)) { +		inode->i_op = &f2fs_dir_inode_operations; +		inode->i_fop = &f2fs_dir_operations; +		inode->i_mapping->a_ops = &f2fs_dblock_aops; +		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | +				__GFP_ZERO); +	
} else if (S_ISLNK(inode->i_mode)) { +		inode->i_op = &f2fs_symlink_inode_operations; +		inode->i_mapping->a_ops = &f2fs_dblock_aops; +	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || +			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { +		inode->i_op = &f2fs_special_inode_operations; +		init_special_inode(inode, inode->i_mode, inode->i_rdev); +	} else { +		ret = -EIO; +		goto bad_inode; +	} +	unlock_new_inode(inode); + +	return inode; + +bad_inode: +	iget_failed(inode); +	return ERR_PTR(ret); +} + +void update_inode(struct inode *inode, struct page *node_page) +{ +	struct f2fs_node *rn; +	struct f2fs_inode *ri; + +	wait_on_page_writeback(node_page); + +	rn = page_address(node_page); +	ri = &(rn->i); + +	ri->i_mode = cpu_to_le16(inode->i_mode); +	ri->i_advise = F2FS_I(inode)->i_advise; +	ri->i_uid = cpu_to_le32(i_uid_read(inode)); +	ri->i_gid = cpu_to_le32(i_gid_read(inode)); +	ri->i_links = cpu_to_le32(inode->i_nlink); +	ri->i_size = cpu_to_le64(i_size_read(inode)); +	ri->i_blocks = cpu_to_le64(inode->i_blocks); +	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + +	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); +	ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); +	ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); +	ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); +	ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); +	ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); +	ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); +	ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); +	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); +	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); +	ri->i_generation = cpu_to_le32(inode->i_generation); + +	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { +		if (old_valid_dev(inode->i_rdev)) { +			ri->i_addr[0] = +				cpu_to_le32(old_encode_dev(inode->i_rdev)); +			ri->i_addr[1] = 0; +		} else { +			ri->i_addr[0] = 0; +			ri->i_addr[1] = +				cpu_to_le32(new_encode_dev(inode->i_rdev)); +			ri->i_addr[2] = 0; +		} +	} + +	set_cold_node(inode, node_page); +	set_page_dirty(node_page); +} + +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct page *node_page; +	bool need_lock = false; + +	if (inode->i_ino == F2FS_NODE_INO(sbi) || +			inode->i_ino == F2FS_META_INO(sbi)) +		return 0; + +	if (wbc) +		f2fs_balance_fs(sbi); + +	node_page = get_node_page(sbi, inode->i_ino); +	if (IS_ERR(node_page)) +		return PTR_ERR(node_page); + +	if (!PageDirty(node_page)) { +		need_lock = true; +		f2fs_put_page(node_page, 1); +		mutex_lock(&sbi->write_inode); +		node_page = get_node_page(sbi, inode->i_ino); +		if (IS_ERR(node_page)) { +			mutex_unlock(&sbi->write_inode); +			return PTR_ERR(node_page); +		} +	} +	update_inode(inode, node_page); +	f2fs_put_page(node_page, 1); +	if (need_lock) +		mutex_unlock(&sbi->write_inode); +	return 0; +} + +/* + * Called at the last iput() if i_nlink is zero + */ +void f2fs_evict_inode(struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + +	truncate_inode_pages(&inode->i_data, 0); + +	if (inode->i_ino == F2FS_NODE_INO(sbi) || +			inode->i_ino == F2FS_META_INO(sbi)) +		goto no_delete; + +	BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); +	remove_dirty_dir_inode(inode); + +	if (inode->i_nlink || is_bad_inode(inode)) +		goto no_delete; + +	sb_start_intwrite(inode->i_sb); +	set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); +	i_size_write(inode, 0); + +	if (F2FS_HAS_BLOCKS(inode)) +		
f2fs_truncate(inode); + +	remove_inode_page(inode); +	sb_end_intwrite(inode->i_sb); +no_delete: +	clear_inode(inode); +} diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c new file mode 100644 index 00000000000..1a49b881bac --- /dev/null +++ b/fs/f2fs/namei.c @@ -0,0 +1,503 @@ +/* + * fs/f2fs/namei.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/ctype.h> + +#include "f2fs.h" +#include "xattr.h" +#include "acl.h" + +static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +{ +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	nid_t ino; +	struct inode *inode; +	bool nid_free = false; +	int err; + +	inode = new_inode(sb); +	if (!inode) +		return ERR_PTR(-ENOMEM); + +	mutex_lock_op(sbi, NODE_NEW); +	if (!alloc_nid(sbi, &ino)) { +		mutex_unlock_op(sbi, NODE_NEW); +		err = -ENOSPC; +		goto fail; +	} +	mutex_unlock_op(sbi, NODE_NEW); + +	inode->i_uid = current_fsuid(); + +	if (dir->i_mode & S_ISGID) { +		inode->i_gid = dir->i_gid; +		if (S_ISDIR(mode)) +			mode |= S_ISGID; +	} else { +		inode->i_gid = current_fsgid(); +	} + +	inode->i_ino = ino; +	inode->i_mode = mode; +	inode->i_blocks = 0; +	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; +	inode->i_generation = sbi->s_next_generation++; + +	err = insert_inode_locked(inode); +	if (err) { +		err = -EINVAL; +		nid_free = true; +		goto out; +	} + +	mark_inode_dirty(inode); +	return inode; + +out: +	clear_nlink(inode); +	unlock_new_inode(inode); +fail: +	iput(inode); +	if (nid_free) +		alloc_nid_failed(sbi, ino); +	return ERR_PTR(err); +} + +static int is_multimedia_file(const unsigned char *s, const char *sub) +{ +	size_t slen = strlen(s); +	size_t sublen = strlen(sub); +	int ret; + +	if (sublen > slen) +		return 1; + +	ret = memcmp(s + slen - sublen, sub, sublen); +	if (ret) {	/* compare upper case */ +		int i; +		char upper_sub[8]; +		for (i = 0; i < sublen && i < sizeof(upper_sub); i++) +			upper_sub[i] = toupper(sub[i]); +		return memcmp(s + slen - sublen, upper_sub, sublen); +	} + +	return ret; +} + +/* + * Set multimedia files as cold files for hot/cold data separation + */ +static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, +		const unsigned char *name) +{ +	int i; +	__u8 (*extlist)[8] = sbi->raw_super->extension_list; + +	int count = le32_to_cpu(sbi->raw_super->extension_count); +	for (i = 0; i < count; i++) { +		if (!is_multimedia_file(name, extlist[i])) { +			F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; +			break; +		} +	} +} + +static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, +						bool excl) +{ +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode; +	nid_t ino = 0; +	int err; + +	f2fs_balance_fs(sbi); + +	inode = f2fs_new_inode(dir, mode); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) +		set_cold_file(sbi, inode, dentry->d_name.name); + +	inode->i_op = &f2fs_file_inode_operations; +	inode->i_fop = &f2fs_file_operations; +	inode->i_mapping->a_ops = &f2fs_dblock_aops; +	ino = inode->i_ino; + +	err = f2fs_add_link(dentry, inode); +	if (err) +		goto out; + +	
alloc_nid_done(sbi, ino); + +	if (!sbi->por_doing) +		d_instantiate(dentry, inode); +	unlock_new_inode(inode); +	return 0; +out: +	clear_nlink(inode); +	unlock_new_inode(inode); +	iput(inode); +	alloc_nid_failed(sbi, ino); +	return err; +} + +static int f2fs_link(struct dentry *old_dentry, struct inode *dir, +		struct dentry *dentry) +{ +	struct inode *inode = old_dentry->d_inode; +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	int err; + +	f2fs_balance_fs(sbi); + +	inode->i_ctime = CURRENT_TIME; +	atomic_inc(&inode->i_count); + +	set_inode_flag(F2FS_I(inode), FI_INC_LINK); +	err = f2fs_add_link(dentry, inode); +	if (err) +		goto out; + +	d_instantiate(dentry, inode); +	return 0; +out: +	clear_inode_flag(F2FS_I(inode), FI_INC_LINK); +	iput(inode); +	return err; +} + +struct dentry *f2fs_get_parent(struct dentry *child) +{ +	struct qstr dotdot = QSTR_INIT("..", 2); +	unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot); +	if (!ino) +		return ERR_PTR(-ENOENT); +	return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); +} + +static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, +		unsigned int flags) +{ +	struct inode *inode = NULL; +	struct f2fs_dir_entry *de; +	struct page *page; + +	if (dentry->d_name.len > F2FS_MAX_NAME_LEN) +		return ERR_PTR(-ENAMETOOLONG); + +	de = f2fs_find_entry(dir, &dentry->d_name, &page); +	if (de) { +		nid_t ino = le32_to_cpu(de->ino); +		kunmap(page); +		f2fs_put_page(page, 0); + +		inode = f2fs_iget(dir->i_sb, ino); +		if (IS_ERR(inode)) +			return ERR_CAST(inode); +	} + +	return d_splice_alias(inode, dentry); +} + +static int f2fs_unlink(struct inode *dir, struct dentry *dentry) +{ +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode = dentry->d_inode; +	struct f2fs_dir_entry *de; +	struct page *page; +	int err = -ENOENT; + +	f2fs_balance_fs(sbi); + +	de = f2fs_find_entry(dir, &dentry->d_name, &page); +	if (!de) +		goto fail; + +	err = check_orphan_space(sbi); +	if (err) { +		kunmap(page); +		f2fs_put_page(page, 0); +		goto fail; +	} + +	f2fs_delete_entry(de, page, inode); + +	/* In order to evict this inode,  we set it dirty */ +	mark_inode_dirty(inode); +fail: +	return err; +} + +static int f2fs_symlink(struct inode *dir, struct dentry *dentry, +					const char *symname) +{ +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode; +	size_t symlen = strlen(symname) + 1; +	int err; + +	f2fs_balance_fs(sbi); + +	inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	inode->i_op = &f2fs_symlink_inode_operations; +	inode->i_mapping->a_ops = &f2fs_dblock_aops; + +	err = f2fs_add_link(dentry, inode); +	if (err) +		goto out; + +	err = page_symlink(inode, symname, symlen); +	alloc_nid_done(sbi, inode->i_ino); + +	d_instantiate(dentry, inode); +	unlock_new_inode(inode); +	return err; +out: +	clear_nlink(inode); +	unlock_new_inode(inode); +	iput(inode); +	alloc_nid_failed(sbi, inode->i_ino); +	return err; +} + +static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); +	struct inode *inode; +	int err; + +	f2fs_balance_fs(sbi); + +	inode = f2fs_new_inode(dir, S_IFDIR | mode); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	inode->i_op = &f2fs_dir_inode_operations; +	inode->i_fop = &f2fs_dir_operations; +	inode->i_mapping->a_ops = &f2fs_dblock_aops; +	mapping_set_gfp_mask(inode->i_mapping, 
GFP_F2FS_ZERO); + +	set_inode_flag(F2FS_I(inode), FI_INC_LINK); +	err = f2fs_add_link(dentry, inode); +	if (err) +		goto out_fail; + +	alloc_nid_done(sbi, inode->i_ino); + +	d_instantiate(dentry, inode); +	unlock_new_inode(inode); + +	return 0; + +out_fail: +	clear_inode_flag(F2FS_I(inode), FI_INC_LINK); +	clear_nlink(inode); +	unlock_new_inode(inode); +	iput(inode); +	alloc_nid_failed(sbi, inode->i_ino); +	return err; +} + +static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) +{ +	struct inode *inode = dentry->d_inode; +	if (f2fs_empty_dir(inode)) +		return f2fs_unlink(dir, dentry); +	return -ENOTEMPTY; +} + +static int f2fs_mknod(struct inode *dir, struct dentry *dentry, +				umode_t mode, dev_t rdev) +{ +	struct super_block *sb = dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode; +	int err = 0; + +	if (!new_valid_dev(rdev)) +		return -EINVAL; + +	f2fs_balance_fs(sbi); + +	inode = f2fs_new_inode(dir, mode); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	init_special_inode(inode, inode->i_mode, rdev); +	inode->i_op = &f2fs_special_inode_operations; + +	err = f2fs_add_link(dentry, inode); +	if (err) +		goto out; + +	alloc_nid_done(sbi, inode->i_ino); +	d_instantiate(dentry, inode); +	unlock_new_inode(inode); +	return 0; +out: +	clear_nlink(inode); +	unlock_new_inode(inode); +	iput(inode); +	alloc_nid_failed(sbi, inode->i_ino); +	return err; +} + +static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, +			struct inode *new_dir, struct dentry *new_dentry) +{ +	struct super_block *sb = old_dir->i_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *old_inode = old_dentry->d_inode; +	struct inode *new_inode = new_dentry->d_inode; +	struct page *old_dir_page; +	struct page *old_page; +	struct f2fs_dir_entry *old_dir_entry = NULL; +	struct f2fs_dir_entry *old_entry; +	struct f2fs_dir_entry *new_entry; +	int err = -ENOENT; + +	f2fs_balance_fs(sbi); + +	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); +	if (!old_entry) +		goto out; + +	if (S_ISDIR(old_inode->i_mode)) { +		err = -EIO; +		old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); +		if (!old_dir_entry) +			goto out_old; +	} + +	mutex_lock_op(sbi, RENAME); + +	if (new_inode) { +		struct page *new_page; + +		err = -ENOTEMPTY; +		if (old_dir_entry && !f2fs_empty_dir(new_inode)) +			goto out_dir; + +		err = -ENOENT; +		new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, +						&new_page); +		if (!new_entry) +			goto out_dir; + +		f2fs_set_link(new_dir, new_entry, new_page, old_inode); + +		new_inode->i_ctime = CURRENT_TIME; +		if (old_dir_entry) +			drop_nlink(new_inode); +		drop_nlink(new_inode); +		if (!new_inode->i_nlink) +			add_orphan_inode(sbi, new_inode->i_ino); +		f2fs_write_inode(new_inode, NULL); +	} else { +		err = f2fs_add_link(new_dentry, old_inode); +		if (err) +			goto out_dir; + +		if (old_dir_entry) { +			inc_nlink(new_dir); +			f2fs_write_inode(new_dir, NULL); +		} +	} + +	old_inode->i_ctime = CURRENT_TIME; +	set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); +	mark_inode_dirty(old_inode); + +	f2fs_delete_entry(old_entry, old_page, NULL); + +	if (old_dir_entry) { +		if (old_dir != new_dir) { +			f2fs_set_link(old_inode, old_dir_entry, +						old_dir_page, new_dir); +		} else { +			kunmap(old_dir_page); +			f2fs_put_page(old_dir_page, 0); +		} +		drop_nlink(old_dir); +		f2fs_write_inode(old_dir, NULL); +	} + +	mutex_unlock_op(sbi, RENAME); +	return 0; + +out_dir: +	if (old_dir_entry) { +		kunmap(old_dir_page); +		
f2fs_put_page(old_dir_page, 0); +	} +	mutex_unlock_op(sbi, RENAME); +out_old: +	kunmap(old_page); +	f2fs_put_page(old_page, 0); +out: +	return err; +} + +const struct inode_operations f2fs_dir_inode_operations = { +	.create		= f2fs_create, +	.lookup		= f2fs_lookup, +	.link		= f2fs_link, +	.unlink		= f2fs_unlink, +	.symlink	= f2fs_symlink, +	.mkdir		= f2fs_mkdir, +	.rmdir		= f2fs_rmdir, +	.mknod		= f2fs_mknod, +	.rename		= f2fs_rename, +	.setattr	= f2fs_setattr, +	.get_acl	= f2fs_get_acl, +#ifdef CONFIG_F2FS_FS_XATTR +	.setxattr	= generic_setxattr, +	.getxattr	= generic_getxattr, +	.listxattr	= f2fs_listxattr, +	.removexattr	= generic_removexattr, +#endif +}; + +const struct inode_operations f2fs_symlink_inode_operations = { +	.readlink       = generic_readlink, +	.follow_link    = page_follow_link_light, +	.put_link       = page_put_link, +	.setattr	= f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR +	.setxattr	= generic_setxattr, +	.getxattr	= generic_getxattr, +	.listxattr	= f2fs_listxattr, +	.removexattr	= generic_removexattr, +#endif +}; + +const struct inode_operations f2fs_special_inode_operations = { +	.setattr        = f2fs_setattr, +	.get_acl	= f2fs_get_acl, +#ifdef CONFIG_F2FS_FS_XATTR +	.setxattr       = generic_setxattr, +	.getxattr       = generic_getxattr, +	.listxattr	= f2fs_listxattr, +	.removexattr    = generic_removexattr, +#endif +}; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c new file mode 100644 index 00000000000..e275218904e --- /dev/null +++ b/fs/f2fs/node.c @@ -0,0 +1,1756 @@ +/* + * fs/f2fs/node.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/mpage.h> +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/swap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +static struct kmem_cache *nat_entry_slab; +static struct kmem_cache *free_nid_slab; + +static void clear_node_page_dirty(struct page *page) +{ +	struct address_space *mapping = page->mapping; +	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); +	unsigned int long flags; + +	if (PageDirty(page)) { +		spin_lock_irqsave(&mapping->tree_lock, flags); +		radix_tree_tag_clear(&mapping->page_tree, +				page_index(page), +				PAGECACHE_TAG_DIRTY); +		spin_unlock_irqrestore(&mapping->tree_lock, flags); + +		clear_page_dirty_for_io(page); +		dec_page_count(sbi, F2FS_DIRTY_NODES); +	} +	ClearPageUptodate(page); +} + +static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ +	pgoff_t index = current_nat_addr(sbi, nid); +	return get_meta_page(sbi, index); +} + +static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ +	struct page *src_page; +	struct page *dst_page; +	pgoff_t src_off; +	pgoff_t dst_off; +	void *src_addr; +	void *dst_addr; +	struct f2fs_nm_info *nm_i = NM_I(sbi); + +	src_off = current_nat_addr(sbi, nid); +	dst_off = next_nat_addr(sbi, src_off); + +	/* get current nat block page with lock */ +	src_page = get_meta_page(sbi, src_off); + +	/* Dirty src_page means that it is already the new target NAT page. 
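
get_next_nat_page(), whose body follows, relies on every NAT block existing in two on-disk copies with a bitmap selecting the current one; a checkpoint flips the bit (set_to_next_nat), so the "next" copy can be rewritten without disturbing the live copy. A toy sketch of that ping-pong with an invented layout (placing the two copies NAT_BLOCKS apart is illustrative only, not the real disk format):

#include <stdio.h>

#define NAT_BLOCKS 8

static unsigned char nat_bitmap[NAT_BLOCKS];	/* 0: copy A, 1: copy B */

static unsigned int current_nat_addr(unsigned int base, unsigned int blk)
{
	return base + blk + (nat_bitmap[blk] ? NAT_BLOCKS : 0);
}

static unsigned int next_nat_addr(unsigned int base, unsigned int blk)
{
	return base + blk + (nat_bitmap[blk] ? 0 : NAT_BLOCKS);
}

int main(void)
{
	unsigned int base = 100, blk = 3;

	printf("current %u, next %u\n",
	       current_nat_addr(base, blk), next_nat_addr(base, blk));
	nat_bitmap[blk] ^= 1;	/* what the checkpoint's bit flip does */
	printf("current %u, next %u\n",
	       current_nat_addr(base, blk), next_nat_addr(base, blk));
	return 0;
}
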
*/ +	if (PageDirty(src_page)) +		return src_page; + +	dst_page = grab_meta_page(sbi, dst_off); + +	src_addr = page_address(src_page); +	dst_addr = page_address(dst_page); +	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); +	set_page_dirty(dst_page); +	f2fs_put_page(src_page, 1); + +	set_to_next_nat(nm_i, nid); + +	return dst_page; +} + +/* + * Readahead NAT pages + */ +static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) +{ +	struct address_space *mapping = sbi->meta_inode->i_mapping; +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct page *page; +	pgoff_t index; +	int i; + +	for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { +		if (nid >= nm_i->max_nid) +			nid = 0; +		index = current_nat_addr(sbi, nid); + +		page = grab_cache_page(mapping, index); +		if (!page) +			continue; +		if (f2fs_readpage(sbi, page, index, READ)) { +			f2fs_put_page(page, 1); +			continue; +		} +		f2fs_put_page(page, 0); +	} +} + +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +{ +	return radix_tree_lookup(&nm_i->nat_root, n); +} + +static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, +		nid_t start, unsigned int nr, struct nat_entry **ep) +{ +	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr); +} + +static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) +{ +	list_del(&e->list); +	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); +	nm_i->nat_cnt--; +	kmem_cache_free(nat_entry_slab, e); +} + +int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct nat_entry *e; +	int is_cp = 1; + +	read_lock(&nm_i->nat_tree_lock); +	e = __lookup_nat_cache(nm_i, nid); +	if (e && !e->checkpointed) +		is_cp = 0; +	read_unlock(&nm_i->nat_tree_lock); +	return is_cp; +} + +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +{ +	struct nat_entry *new; + +	new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); +	if (!new) +		return NULL; +	if (radix_tree_insert(&nm_i->nat_root, nid, new)) { +		kmem_cache_free(nat_entry_slab, new); +		return NULL; +	} +	memset(new, 0, sizeof(struct nat_entry)); +	nat_set_nid(new, nid); +	list_add_tail(&new->list, &nm_i->nat_entries); +	nm_i->nat_cnt++; +	return new; +} + +static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +						struct f2fs_nat_entry *ne) +{ +	struct nat_entry *e; +retry: +	write_lock(&nm_i->nat_tree_lock); +	e = __lookup_nat_cache(nm_i, nid); +	if (!e) { +		e = grab_nat_entry(nm_i, nid); +		if (!e) { +			write_unlock(&nm_i->nat_tree_lock); +			goto retry; +		} +		nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); +		nat_set_ino(e, le32_to_cpu(ne->ino)); +		nat_set_version(e, ne->version); +		e->checkpointed = true; +	} +	write_unlock(&nm_i->nat_tree_lock); +} + +static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, +			block_t new_blkaddr) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct nat_entry *e; +retry: +	write_lock(&nm_i->nat_tree_lock); +	e = __lookup_nat_cache(nm_i, ni->nid); +	if (!e) { +		e = grab_nat_entry(nm_i, ni->nid); +		if (!e) { +			write_unlock(&nm_i->nat_tree_lock); +			goto retry; +		} +		e->ni = *ni; +		e->checkpointed = true; +		BUG_ON(ni->blk_addr == NEW_ADDR); +	} else if (new_blkaddr == NEW_ADDR) { +		/* +		 * when nid is reallocated, +		 * previous nat entry can be remained in nat cache. +		 * So, reinitialize it with new information. 
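
cache_nat_entry() and set_node_addr() above allocate the cache node with GFP_ATOMIC while holding nat_tree_lock for writing; on failure they drop the lock and retry the whole lookup rather than sleeping under the lock. A userspace sketch of that unlock-and-retry shape (atomic_alloc and the one-shot failure injection are stand-ins, with the locking shown only as comments):

#include <stdio.h>
#include <stdlib.h>

static int failures = 1;	/* pretend the first atomic alloc fails */

static void *atomic_alloc(size_t sz)
{
	if (failures-- > 0)
		return NULL;
	return malloc(sz);
}

int main(void)
{
	void *e;
	int tries = 0;

retry:
	tries++;
	/* write_lock(&nat_tree_lock); */
	e = atomic_alloc(64);
	if (!e) {
		/* write_unlock(&nat_tree_lock): cannot sleep while held */
		goto retry;
	}
	/* ... fill the entry and insert it into the radix tree ... */
	/* write_unlock(&nat_tree_lock); */
	printf("allocated after %d tries\n", tries);
	free(e);
	return 0;
}
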
+		 */ +		e->ni = *ni; +		BUG_ON(ni->blk_addr != NULL_ADDR); +	} + +	if (new_blkaddr == NEW_ADDR) +		e->checkpointed = false; + +	/* sanity check */ +	BUG_ON(nat_get_blkaddr(e) != ni->blk_addr); +	BUG_ON(nat_get_blkaddr(e) == NULL_ADDR && +			new_blkaddr == NULL_ADDR); +	BUG_ON(nat_get_blkaddr(e) == NEW_ADDR && +			new_blkaddr == NEW_ADDR); +	BUG_ON(nat_get_blkaddr(e) != NEW_ADDR && +			nat_get_blkaddr(e) != NULL_ADDR && +			new_blkaddr == NEW_ADDR); + +	/* increament version no as node is removed */ +	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { +		unsigned char version = nat_get_version(e); +		nat_set_version(e, inc_node_version(version)); +	} + +	/* change address */ +	nat_set_blkaddr(e, new_blkaddr); +	__set_nat_cache_dirty(nm_i, e); +	write_unlock(&nm_i->nat_tree_lock); +} + +static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); + +	if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD) +		return 0; + +	write_lock(&nm_i->nat_tree_lock); +	while (nr_shrink && !list_empty(&nm_i->nat_entries)) { +		struct nat_entry *ne; +		ne = list_first_entry(&nm_i->nat_entries, +					struct nat_entry, list); +		__del_from_nat_cache(nm_i, ne); +		nr_shrink--; +	} +	write_unlock(&nm_i->nat_tree_lock); +	return nr_shrink; +} + +/* + * This function returns always success + */ +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	nid_t start_nid = START_NID(nid); +	struct f2fs_nat_block *nat_blk; +	struct page *page = NULL; +	struct f2fs_nat_entry ne; +	struct nat_entry *e; +	int i; + +	memset(&ne, 0, sizeof(struct f2fs_nat_entry)); +	ni->nid = nid; + +	/* Check nat cache */ +	read_lock(&nm_i->nat_tree_lock); +	e = __lookup_nat_cache(nm_i, nid); +	if (e) { +		ni->ino = nat_get_ino(e); +		ni->blk_addr = nat_get_blkaddr(e); +		ni->version = nat_get_version(e); +	} +	read_unlock(&nm_i->nat_tree_lock); +	if (e) +		return; + +	/* Check current segment summary */ +	mutex_lock(&curseg->curseg_mutex); +	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); +	if (i >= 0) { +		ne = nat_in_journal(sum, i); +		node_info_from_raw_nat(ni, &ne); +	} +	mutex_unlock(&curseg->curseg_mutex); +	if (i >= 0) +		goto cache; + +	/* Fill node_info from nat page */ +	page = get_current_nat_page(sbi, start_nid); +	nat_blk = (struct f2fs_nat_block *)page_address(page); +	ne = nat_blk->entries[nid - start_nid]; +	node_info_from_raw_nat(ni, &ne); +	f2fs_put_page(page, 1); +cache: +	/* cache nat entry */ +	cache_nat_entry(NM_I(sbi), nid, &ne); +} + +/* + * The maximum depth is four. + * Offset[0] will have raw inode offset. 
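
get_node_info() above resolves a nid in three steps, cheapest first: the in-memory NAT cache, then the NAT journal kept in the hot-data current-segment summary, and only then the on-disk NAT block, caching whatever the slow paths return. A skeleton of that ordering (every helper body here is a stand-in; only the control flow mirrors the patch):

#include <stdio.h>

struct node_info { unsigned int nid, ino, blk_addr; };

static int cache_lookup(unsigned int nid, struct node_info *ni)
{
	(void)nid; (void)ni;
	return 0;			/* miss: nothing cached yet */
}

static int journal_lookup(unsigned int nid, struct node_info *ni)
{
	(void)nid; (void)ni;
	return 0;			/* miss: not in the summary journal */
}

static void nat_block_lookup(unsigned int nid, struct node_info *ni)
{
	ni->nid = nid;			/* stub: pretend we read the NAT page */
	ni->ino = nid;
	ni->blk_addr = 0x1234;
}

static void cache_insert(const struct node_info *ni) { (void)ni; }

static void get_node_info(unsigned int nid, struct node_info *ni)
{
	if (cache_lookup(nid, ni))
		return;				/* fast path, no I/O */
	if (!journal_lookup(nid, ni))
		nat_block_lookup(nid, ni);	/* one metadata read */
	cache_insert(ni);
}

int main(void)
{
	struct node_info ni;

	get_node_info(7, &ni);
	printf("nid %u -> blk 0x%x\n", ni.nid, ni.blk_addr);
	return 0;
}
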
+ */ +static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +{ +	const long direct_index = ADDRS_PER_INODE; +	const long direct_blks = ADDRS_PER_BLOCK; +	const long dptrs_per_blk = NIDS_PER_BLOCK; +	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; +	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; +	int n = 0; +	int level = 0; + +	noffset[0] = 0; + +	if (block < direct_index) { +		offset[n++] = block; +		level = 0; +		goto got; +	} +	block -= direct_index; +	if (block < direct_blks) { +		offset[n++] = NODE_DIR1_BLOCK; +		noffset[n] = 1; +		offset[n++] = block; +		level = 1; +		goto got; +	} +	block -= direct_blks; +	if (block < direct_blks) { +		offset[n++] = NODE_DIR2_BLOCK; +		noffset[n] = 2; +		offset[n++] = block; +		level = 1; +		goto got; +	} +	block -= direct_blks; +	if (block < indirect_blks) { +		offset[n++] = NODE_IND1_BLOCK; +		noffset[n] = 3; +		offset[n++] = block / direct_blks; +		noffset[n] = 4 + offset[n - 1]; +		offset[n++] = block % direct_blks; +		level = 2; +		goto got; +	} +	block -= indirect_blks; +	if (block < indirect_blks) { +		offset[n++] = NODE_IND2_BLOCK; +		noffset[n] = 4 + dptrs_per_blk; +		offset[n++] = block / direct_blks; +		noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; +		offset[n++] = block % direct_blks; +		level = 2; +		goto got; +	} +	block -= indirect_blks; +	if (block < dindirect_blks) { +		offset[n++] = NODE_DIND_BLOCK; +		noffset[n] = 5 + (dptrs_per_blk * 2); +		offset[n++] = block / indirect_blks; +		noffset[n] = 6 + (dptrs_per_blk * 2) + +			      offset[n - 1] * (dptrs_per_blk + 1); +		offset[n++] = (block / direct_blks) % dptrs_per_blk; +		noffset[n] = 7 + (dptrs_per_blk * 2) + +			      offset[n - 2] * (dptrs_per_blk + 1) + +			      offset[n - 1]; +		offset[n++] = block % direct_blks; +		level = 3; +		goto got; +	} else { +		BUG(); +	} +got: +	return level; +} + +/* + * Caller should call f2fs_put_dnode(dn). 
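
get_node_path() above classifies a file block into one of four tree levels. With the 4 KB-block geometry (923 data pointers in the inode, 1018 per direct node, 1018 node slots per indirect node, per f2fs_fs.h), the level boundaries fall at blocks 923, 2959, and 2075607. A compact worked example that reports just the level, the part callers branch on:

#include <stdio.h>

#define ADDRS_PER_INODE 923L
#define ADDRS_PER_BLOCK 1018L
#define NIDS_PER_BLOCK  1018L

static int node_level(long block)
{
	long direct = ADDRS_PER_BLOCK;
	long indirect = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;

	if (block < ADDRS_PER_INODE)
		return 0;		/* inside the inode block    */
	block -= ADDRS_PER_INODE;
	if (block < 2 * direct)
		return 1;		/* one of two direct nodes   */
	block -= 2 * direct;
	if (block < 2 * indirect)
		return 2;		/* under a single indirect   */
	return 3;			/* under the double indirect */
}

int main(void)
{
	printf("%d\n", node_level(0));		/* 0 */
	printf("%d\n", node_level(1000));	/* 1 */
	printf("%d\n", node_level(5000));	/* 2 */
	printf("%d\n", node_level(2100000));	/* 3 */
	return 0;
}
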
+ */ +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct page *npage[4]; +	struct page *parent; +	int offset[4]; +	unsigned int noffset[4]; +	nid_t nids[4]; +	int level, i; +	int err = 0; + +	level = get_node_path(index, offset, noffset); + +	nids[0] = dn->inode->i_ino; +	npage[0] = get_node_page(sbi, nids[0]); +	if (IS_ERR(npage[0])) +		return PTR_ERR(npage[0]); + +	parent = npage[0]; +	nids[1] = get_nid(parent, offset[0], true); +	dn->inode_page = npage[0]; +	dn->inode_page_locked = true; + +	/* get indirect or direct nodes */ +	for (i = 1; i <= level; i++) { +		bool done = false; + +		if (!nids[i] && !ro) { +			mutex_lock_op(sbi, NODE_NEW); + +			/* alloc new node */ +			if (!alloc_nid(sbi, &(nids[i]))) { +				mutex_unlock_op(sbi, NODE_NEW); +				err = -ENOSPC; +				goto release_pages; +			} + +			dn->nid = nids[i]; +			npage[i] = new_node_page(dn, noffset[i]); +			if (IS_ERR(npage[i])) { +				alloc_nid_failed(sbi, nids[i]); +				mutex_unlock_op(sbi, NODE_NEW); +				err = PTR_ERR(npage[i]); +				goto release_pages; +			} + +			set_nid(parent, offset[i - 1], nids[i], i == 1); +			alloc_nid_done(sbi, nids[i]); +			mutex_unlock_op(sbi, NODE_NEW); +			done = true; +		} else if (ro && i == level && level > 1) { +			npage[i] = get_node_page_ra(parent, offset[i - 1]); +			if (IS_ERR(npage[i])) { +				err = PTR_ERR(npage[i]); +				goto release_pages; +			} +			done = true; +		} +		if (i == 1) { +			dn->inode_page_locked = false; +			unlock_page(parent); +		} else { +			f2fs_put_page(parent, 1); +		} + +		if (!done) { +			npage[i] = get_node_page(sbi, nids[i]); +			if (IS_ERR(npage[i])) { +				err = PTR_ERR(npage[i]); +				f2fs_put_page(npage[0], 0); +				goto release_out; +			} +		} +		if (i < level) { +			parent = npage[i]; +			nids[i + 1] = get_nid(parent, offset[i], false); +		} +	} +	dn->nid = nids[level]; +	dn->ofs_in_node = offset[level]; +	dn->node_page = npage[level]; +	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); +	return 0; + +release_pages: +	f2fs_put_page(parent, 1); +	if (i > 1) +		f2fs_put_page(npage[0], 0); +release_out: +	dn->inode_page = NULL; +	dn->node_page = NULL; +	return err; +} + +static void truncate_node(struct dnode_of_data *dn) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct node_info ni; + +	get_node_info(sbi, dn->nid, &ni); +	if (dn->inode->i_blocks == 0) { +		BUG_ON(ni.blk_addr != NULL_ADDR); +		goto invalidate; +	} +	BUG_ON(ni.blk_addr == NULL_ADDR); + +	/* Deallocate node address */ +	invalidate_blocks(sbi, ni.blk_addr); +	dec_valid_node_count(sbi, dn->inode, 1); +	set_node_addr(sbi, &ni, NULL_ADDR); + +	if (dn->nid == dn->inode->i_ino) { +		remove_orphan_inode(sbi, dn->nid); +		dec_valid_inode_count(sbi); +	} else { +		sync_inode_page(dn); +	} +invalidate: +	clear_node_page_dirty(dn->node_page); +	F2FS_SET_SB_DIRT(sbi); + +	f2fs_put_page(dn->node_page, 1); +	dn->node_page = NULL; +} + +static int truncate_dnode(struct dnode_of_data *dn) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct page *page; + +	if (dn->nid == 0) +		return 1; + +	/* get direct node */ +	page = get_node_page(sbi, dn->nid); +	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) +		return 1; +	else if (IS_ERR(page)) +		return PTR_ERR(page); + +	/* Make dnode_of_data for parameter */ +	dn->node_page = page; +	dn->ofs_in_node = 0; +	truncate_data_blocks(dn); +	truncate_node(dn); +	return 1; +} + +static int truncate_nodes(struct dnode_of_data *dn, unsigned 
int nofs, +						int ofs, int depth) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct dnode_of_data rdn = *dn; +	struct page *page; +	struct f2fs_node *rn; +	nid_t child_nid; +	unsigned int child_nofs; +	int freed = 0; +	int i, ret; + +	if (dn->nid == 0) +		return NIDS_PER_BLOCK + 1; + +	page = get_node_page(sbi, dn->nid); +	if (IS_ERR(page)) +		return PTR_ERR(page); + +	rn = (struct f2fs_node *)page_address(page); +	if (depth < 3) { +		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { +			child_nid = le32_to_cpu(rn->in.nid[i]); +			if (child_nid == 0) +				continue; +			rdn.nid = child_nid; +			ret = truncate_dnode(&rdn); +			if (ret < 0) +				goto out_err; +			set_nid(page, i, 0, false); +		} +	} else { +		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; +		for (i = ofs; i < NIDS_PER_BLOCK; i++) { +			child_nid = le32_to_cpu(rn->in.nid[i]); +			if (child_nid == 0) { +				child_nofs += NIDS_PER_BLOCK + 1; +				continue; +			} +			rdn.nid = child_nid; +			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); +			if (ret == (NIDS_PER_BLOCK + 1)) { +				set_nid(page, i, 0, false); +				child_nofs += ret; +			} else if (ret < 0 && ret != -ENOENT) { +				goto out_err; +			} +		} +		freed = child_nofs; +	} + +	if (!ofs) { +		/* remove current indirect node */ +		dn->node_page = page; +		truncate_node(dn); +		freed++; +	} else { +		f2fs_put_page(page, 1); +	} +	return freed; + +out_err: +	f2fs_put_page(page, 1); +	return ret; +} + +static int truncate_partial_nodes(struct dnode_of_data *dn, +			struct f2fs_inode *ri, int *offset, int depth) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct page *pages[2]; +	nid_t nid[3]; +	nid_t child_nid; +	int err = 0; +	int i; +	int idx = depth - 2; + +	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); +	if (!nid[0]) +		return 0; + +	/* get indirect nodes in the path */ +	for (i = 0; i < depth - 1; i++) { +		/* refernece count'll be increased */ +		pages[i] = get_node_page(sbi, nid[i]); +		if (IS_ERR(pages[i])) { +			depth = i + 1; +			err = PTR_ERR(pages[i]); +			goto fail; +		} +		nid[i + 1] = get_nid(pages[i], offset[i + 1], false); +	} + +	/* free direct nodes linked to a partial indirect node */ +	for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { +		child_nid = get_nid(pages[idx], i, false); +		if (!child_nid) +			continue; +		dn->nid = child_nid; +		err = truncate_dnode(dn); +		if (err < 0) +			goto fail; +		set_nid(pages[idx], i, 0, false); +	} + +	if (offset[depth - 1] == 0) { +		dn->node_page = pages[idx]; +		dn->nid = nid[idx]; +		truncate_node(dn); +	} else { +		f2fs_put_page(pages[idx], 1); +	} +	offset[idx]++; +	offset[depth - 1] = 0; +fail: +	for (i = depth - 3; i >= 0; i--) +		f2fs_put_page(pages[i], 1); +	return err; +} + +/* + * All the block addresses of data and nodes should be nullified. 
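
truncate_dnode() and truncate_nodes() above share a return convention: a negative value is an error, a positive value counts the node pages freed, and truncate_inode_blocks() below adds that count to its running node offset (nofs). A fully freed direct node counts as 1 and a fully freed single-indirect subtree as NIDS_PER_BLOCK + 1; a small arithmetic illustration of how the offset advances:

#include <stdio.h>

#define NIDS_PER_BLOCK 1018

int main(void)
{
	/* node offset of the double-indirect node, cf. case 3 below */
	int nofs = 5 + 2 * NIDS_PER_BLOCK;

	nofs += 1;			/* one direct node fully freed       */
	nofs += NIDS_PER_BLOCK + 1;	/* one single-indirect subtree freed */
	printf("next node offset: %d\n", nofs);
	return 0;
}
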
+ */ +int truncate_inode_blocks(struct inode *inode, pgoff_t from) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	int err = 0, cont = 1; +	int level, offset[4], noffset[4]; +	unsigned int nofs = 0; +	struct f2fs_node *rn; +	struct dnode_of_data dn; +	struct page *page; + +	level = get_node_path(from, offset, noffset); + +	page = get_node_page(sbi, inode->i_ino); +	if (IS_ERR(page)) +		return PTR_ERR(page); + +	set_new_dnode(&dn, inode, page, NULL, 0); +	unlock_page(page); + +	rn = page_address(page); +	switch (level) { +	case 0: +	case 1: +		nofs = noffset[1]; +		break; +	case 2: +		nofs = noffset[1]; +		if (!offset[level - 1]) +			goto skip_partial; +		err = truncate_partial_nodes(&dn, &rn->i, offset, level); +		if (err < 0 && err != -ENOENT) +			goto fail; +		nofs += 1 + NIDS_PER_BLOCK; +		break; +	case 3: +		nofs = 5 + 2 * NIDS_PER_BLOCK; +		if (!offset[level - 1]) +			goto skip_partial; +		err = truncate_partial_nodes(&dn, &rn->i, offset, level); +		if (err < 0 && err != -ENOENT) +			goto fail; +		break; +	default: +		BUG(); +	} + +skip_partial: +	while (cont) { +		dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); +		switch (offset[0]) { +		case NODE_DIR1_BLOCK: +		case NODE_DIR2_BLOCK: +			err = truncate_dnode(&dn); +			break; + +		case NODE_IND1_BLOCK: +		case NODE_IND2_BLOCK: +			err = truncate_nodes(&dn, nofs, offset[1], 2); +			break; + +		case NODE_DIND_BLOCK: +			err = truncate_nodes(&dn, nofs, offset[1], 3); +			cont = 0; +			break; + +		default: +			BUG(); +		} +		if (err < 0 && err != -ENOENT) +			goto fail; +		if (offset[1] == 0 && +				rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { +			lock_page(page); +			wait_on_page_writeback(page); +			rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; +			set_page_dirty(page); +			unlock_page(page); +		} +		offset[1] = 0; +		offset[0]++; +		nofs += err; +	} +fail: +	f2fs_put_page(page, 0); +	return err > 0 ? 
0 : err; +} + +int remove_inode_page(struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct page *page; +	nid_t ino = inode->i_ino; +	struct dnode_of_data dn; + +	mutex_lock_op(sbi, NODE_TRUNC); +	page = get_node_page(sbi, ino); +	if (IS_ERR(page)) { +		mutex_unlock_op(sbi, NODE_TRUNC); +		return PTR_ERR(page); +	} + +	if (F2FS_I(inode)->i_xattr_nid) { +		nid_t nid = F2FS_I(inode)->i_xattr_nid; +		struct page *npage = get_node_page(sbi, nid); + +		if (IS_ERR(npage)) { +			mutex_unlock_op(sbi, NODE_TRUNC); +			return PTR_ERR(npage); +		} + +		F2FS_I(inode)->i_xattr_nid = 0; +		set_new_dnode(&dn, inode, page, npage, nid); +		dn.inode_page_locked = 1; +		truncate_node(&dn); +	} + +	/* 0 is possible, after f2fs_new_inode() is failed */ +	BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); +	set_new_dnode(&dn, inode, page, page, ino); +	truncate_node(&dn); + +	mutex_unlock_op(sbi, NODE_TRUNC); +	return 0; +} + +int new_inode_page(struct inode *inode, const struct qstr *name) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct page *page; +	struct dnode_of_data dn; + +	/* allocate inode page for new inode */ +	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); +	mutex_lock_op(sbi, NODE_NEW); +	page = new_node_page(&dn, 0); +	init_dent_inode(name, page); +	mutex_unlock_op(sbi, NODE_NEW); +	if (IS_ERR(page)) +		return PTR_ERR(page); +	f2fs_put_page(page, 1); +	return 0; +} + +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); +	struct address_space *mapping = sbi->node_inode->i_mapping; +	struct node_info old_ni, new_ni; +	struct page *page; +	int err; + +	if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) +		return ERR_PTR(-EPERM); + +	page = grab_cache_page(mapping, dn->nid); +	if (!page) +		return ERR_PTR(-ENOMEM); + +	get_node_info(sbi, dn->nid, &old_ni); + +	SetPageUptodate(page); +	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + +	/* Reinitialize old_ni with new node page */ +	BUG_ON(old_ni.blk_addr != NULL_ADDR); +	new_ni = old_ni; +	new_ni.ino = dn->inode->i_ino; + +	if (!inc_valid_node_count(sbi, dn->inode, 1)) { +		err = -ENOSPC; +		goto fail; +	} +	set_node_addr(sbi, &new_ni, NEW_ADDR); +	set_cold_node(dn->inode, page); + +	dn->node_page = page; +	sync_inode_page(dn); +	set_page_dirty(page); +	if (ofs == 0) +		inc_valid_inode_count(sbi); + +	return page; + +fail: +	clear_node_page_dirty(page); +	f2fs_put_page(page, 1); +	return ERR_PTR(err); +} + +static int read_node_page(struct page *page, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); +	struct node_info ni; + +	get_node_info(sbi, page->index, &ni); + +	if (ni.blk_addr == NULL_ADDR) +		return -ENOENT; +	return f2fs_readpage(sbi, page, ni.blk_addr, type); +} + +/* + * Readahead a node page + */ +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +{ +	struct address_space *mapping = sbi->node_inode->i_mapping; +	struct page *apage; + +	apage = find_get_page(mapping, nid); +	if (apage && PageUptodate(apage)) +		goto release_out; +	f2fs_put_page(apage, 0); + +	apage = grab_cache_page(mapping, nid); +	if (!apage) +		return; + +	if (read_node_page(apage, READA)) +		unlock_page(apage); + +release_out: +	f2fs_put_page(apage, 0); +	return; +} + +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ +	int err; +	struct page *page; +	struct address_space *mapping = sbi->node_inode->i_mapping; + +	page = grab_cache_page(mapping, nid); +	if (!page) +		return 
ERR_PTR(-ENOMEM); + +	err = read_node_page(page, READ_SYNC); +	if (err) { +		f2fs_put_page(page, 1); +		return ERR_PTR(err); +	} + +	BUG_ON(nid != nid_of_node(page)); +	mark_page_accessed(page); +	return page; +} + +/* + * Return a locked page for the desired node page. + * And, readahead MAX_RA_NODE number of node pages. + */ +struct page *get_node_page_ra(struct page *parent, int start) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); +	struct address_space *mapping = sbi->node_inode->i_mapping; +	int i, end; +	int err = 0; +	nid_t nid; +	struct page *page; + +	/* First, try getting the desired direct node. */ +	nid = get_nid(parent, start, false); +	if (!nid) +		return ERR_PTR(-ENOENT); + +	page = find_get_page(mapping, nid); +	if (page && PageUptodate(page)) +		goto page_hit; +	f2fs_put_page(page, 0); + +repeat: +	page = grab_cache_page(mapping, nid); +	if (!page) +		return ERR_PTR(-ENOMEM); + +	err = read_node_page(page, READA); +	if (err) { +		f2fs_put_page(page, 1); +		return ERR_PTR(err); +	} + +	/* Then, try readahead for siblings of the desired node */ +	end = start + MAX_RA_NODE; +	end = min(end, NIDS_PER_BLOCK); +	for (i = start + 1; i < end; i++) { +		nid = get_nid(parent, i, false); +		if (!nid) +			continue; +		ra_node_page(sbi, nid); +	} + +page_hit: +	lock_page(page); +	if (PageError(page)) { +		f2fs_put_page(page, 1); +		return ERR_PTR(-EIO); +	} + +	/* Has the page been truncated? */ +	if (page->mapping != mapping) { +		f2fs_put_page(page, 1); +		goto repeat; +	} +	return page; +} + +void sync_inode_page(struct dnode_of_data *dn) +{ +	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { +		update_inode(dn->inode, dn->node_page); +	} else if (dn->inode_page) { +		if (!dn->inode_page_locked) +			lock_page(dn->inode_page); +		update_inode(dn->inode, dn->inode_page); +		if (!dn->inode_page_locked) +			unlock_page(dn->inode_page); +	} else { +		f2fs_write_inode(dn->inode, NULL); +	} +} + +int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, +					struct writeback_control *wbc) +{ +	struct address_space *mapping = sbi->node_inode->i_mapping; +	pgoff_t index, end; +	struct pagevec pvec; +	int step = ino ? 2 : 0; +	int nwritten = 0, wrote = 0; + +	pagevec_init(&pvec, 0); + +next_step: +	index = 0; +	end = LONG_MAX; + +	while (index <= end) { +		int i, nr_pages; +		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, +				PAGECACHE_TAG_DIRTY, +				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); +		if (nr_pages == 0) +			break; + +		for (i = 0; i < nr_pages; i++) { +			struct page *page = pvec.pages[i]; + +			/* +			 * flushing sequence with step: +			 * 0. indirect nodes +			 * 1. dentry dnodes +			 * 2. file dnodes +			 */ +			if (step == 0 && IS_DNODE(page)) +				continue; +			if (step == 1 && (!IS_DNODE(page) || +						is_cold_node(page))) +				continue; +			if (step == 2 && (!IS_DNODE(page) || +						!is_cold_node(page))) +				continue; + +			/* +			 * If an fsync mode, +			 * we should not skip writing node pages. 
+			 */ +			if (ino && ino_of_node(page) == ino) +				lock_page(page); +			else if (!trylock_page(page)) +				continue; + +			if (unlikely(page->mapping != mapping)) { +continue_unlock: +				unlock_page(page); +				continue; +			} +			if (ino && ino_of_node(page) != ino) +				goto continue_unlock; + +			if (!PageDirty(page)) { +				/* someone wrote it for us */ +				goto continue_unlock; +			} + +			if (!clear_page_dirty_for_io(page)) +				goto continue_unlock; + +			/* called by fsync() */ +			if (ino && IS_DNODE(page)) { +				int mark = !is_checkpointed_node(sbi, ino); +				set_fsync_mark(page, 1); +				if (IS_INODE(page)) +					set_dentry_mark(page, mark); +				nwritten++; +			} else { +				set_fsync_mark(page, 0); +				set_dentry_mark(page, 0); +			} +			mapping->a_ops->writepage(page, wbc); +			wrote++; + +			if (--wbc->nr_to_write == 0) +				break; +		} +		pagevec_release(&pvec); +		cond_resched(); + +		if (wbc->nr_to_write == 0) { +			step = 2; +			break; +		} +	} + +	if (step < 2) { +		step++; +		goto next_step; +	} + +	if (wrote) +		f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); + +	return nwritten; +} + +static int f2fs_write_node_page(struct page *page, +				struct writeback_control *wbc) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); +	nid_t nid; +	block_t new_addr; +	struct node_info ni; + +	if (wbc->for_reclaim) { +		dec_page_count(sbi, F2FS_DIRTY_NODES); +		wbc->pages_skipped++; +		set_page_dirty(page); +		return AOP_WRITEPAGE_ACTIVATE; +	} + +	wait_on_page_writeback(page); + +	mutex_lock_op(sbi, NODE_WRITE); + +	/* get old block addr of this node page */ +	nid = nid_of_node(page); +	BUG_ON(page->index != nid); + +	get_node_info(sbi, nid, &ni); + +	/* This page is already truncated */ +	if (ni.blk_addr == NULL_ADDR) +		return 0; + +	set_page_writeback(page); + +	/* insert node offset */ +	write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); +	set_node_addr(sbi, &ni, new_addr); +	dec_page_count(sbi, F2FS_DIRTY_NODES); + +	mutex_unlock_op(sbi, NODE_WRITE); +	unlock_page(page); +	return 0; +} + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. 
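+ * + * In effect (a sketch of the policy implemented just below): + * + *	if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + *		return 0;	(keep accumulating dirty node pages) + *	sync_node_pages(sbi, 0, wbc);	(one large, mostly sequential bio)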
+ */ +#define COLLECT_DIRTY_NODES	512 +static int f2fs_write_node_pages(struct address_space *mapping, +			    struct writeback_control *wbc) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); +	struct block_device *bdev = sbi->sb->s_bdev; +	long nr_to_write = wbc->nr_to_write; + +	/* First check balancing cached NAT entries */ +	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { +		write_checkpoint(sbi, false); +		return 0; +	} + +	/* collect a number of dirty node pages and write together */ +	if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) +		return 0; + +	/* if mounting is failed, skip writing node pages */ +	wbc->nr_to_write = bio_get_nr_vecs(bdev); +	sync_node_pages(sbi, 0, wbc); +	wbc->nr_to_write = nr_to_write - +		(bio_get_nr_vecs(bdev) - wbc->nr_to_write); +	return 0; +} + +static int f2fs_set_node_page_dirty(struct page *page) +{ +	struct address_space *mapping = page->mapping; +	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + +	SetPageUptodate(page); +	if (!PageDirty(page)) { +		__set_page_dirty_nobuffers(page); +		inc_page_count(sbi, F2FS_DIRTY_NODES); +		SetPagePrivate(page); +		return 1; +	} +	return 0; +} + +static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) +{ +	struct inode *inode = page->mapping->host; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	if (PageDirty(page)) +		dec_page_count(sbi, F2FS_DIRTY_NODES); +	ClearPagePrivate(page); +} + +static int f2fs_release_node_page(struct page *page, gfp_t wait) +{ +	ClearPagePrivate(page); +	return 0; +} + +/* + * Structure of the f2fs node operations + */ +const struct address_space_operations f2fs_node_aops = { +	.writepage	= f2fs_write_node_page, +	.writepages	= f2fs_write_node_pages, +	.set_page_dirty	= f2fs_set_node_page_dirty, +	.invalidatepage	= f2fs_invalidate_node_page, +	.releasepage	= f2fs_release_node_page, +}; + +static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +{ +	struct list_head *this; +	struct free_nid *i = NULL; +	list_for_each(this, head) { +		i = list_entry(this, struct free_nid, list); +		if (i->nid == n) +			break; +		i = NULL; +	} +	return i; +} + +static void __del_from_free_nid_list(struct free_nid *i) +{ +	list_del(&i->list); +	kmem_cache_free(free_nid_slab, i); +} + +static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +{ +	struct free_nid *i; + +	if (nm_i->fcnt > 2 * MAX_FREE_NIDS) +		return 0; +retry: +	i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); +	if (!i) { +		cond_resched(); +		goto retry; +	} +	i->nid = nid; +	i->state = NID_NEW; + +	spin_lock(&nm_i->free_nid_list_lock); +	if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { +		spin_unlock(&nm_i->free_nid_list_lock); +		kmem_cache_free(free_nid_slab, i); +		return 0; +	} +	list_add_tail(&i->list, &nm_i->free_nid_list); +	nm_i->fcnt++; +	spin_unlock(&nm_i->free_nid_list_lock); +	return 1; +} + +static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +{ +	struct free_nid *i; +	spin_lock(&nm_i->free_nid_list_lock); +	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); +	if (i && i->state == NID_NEW) { +		__del_from_free_nid_list(i); +		nm_i->fcnt--; +	} +	spin_unlock(&nm_i->free_nid_list_lock); +} + +static int scan_nat_page(struct f2fs_nm_info *nm_i, +			struct page *nat_page, nid_t start_nid) +{ +	struct f2fs_nat_block *nat_blk = page_address(nat_page); +	block_t blk_addr; +	int fcnt = 0; +	int i; + +	/* 0 nid should not be used */ +	if (start_nid == 0) +		++start_nid; + +	i = start_nid % NAT_ENTRY_PER_BLOCK; + +	for (; 
i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { +		blk_addr  = le32_to_cpu(nat_blk->entries[i].block_addr); +		BUG_ON(blk_addr == NEW_ADDR); +		if (blk_addr == NULL_ADDR) +			fcnt += add_free_nid(nm_i, start_nid); +	} +	return fcnt; +} + +static void build_free_nids(struct f2fs_sb_info *sbi) +{ +	struct free_nid *fnid, *next_fnid; +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	nid_t nid = 0; +	bool is_cycled = false; +	int fcnt = 0; +	int i; + +	nid = nm_i->next_scan_nid; +	nm_i->init_scan_nid = nid; + +	ra_nat_pages(sbi, nid); + +	while (1) { +		struct page *page = get_current_nat_page(sbi, nid); + +		fcnt += scan_nat_page(nm_i, page, nid); +		f2fs_put_page(page, 1); + +		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); + +		if (nid >= nm_i->max_nid) { +			nid = 0; +			is_cycled = true; +		} +		if (fcnt > MAX_FREE_NIDS) +			break; +		if (is_cycled && nm_i->init_scan_nid <= nid) +			break; +	} + +	nm_i->next_scan_nid = nid; + +	/* find free nids from current sum_pages */ +	mutex_lock(&curseg->curseg_mutex); +	for (i = 0; i < nats_in_cursum(sum); i++) { +		block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); +		nid = le32_to_cpu(nid_in_journal(sum, i)); +		if (addr == NULL_ADDR) +			add_free_nid(nm_i, nid); +		else +			remove_free_nid(nm_i, nid); +	} +	mutex_unlock(&curseg->curseg_mutex); + +	/* remove the free nids from current allocated nids */ +	list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { +		struct nat_entry *ne; + +		read_lock(&nm_i->nat_tree_lock); +		ne = __lookup_nat_cache(nm_i, fnid->nid); +		if (ne && nat_get_blkaddr(ne) != NULL_ADDR) +			remove_free_nid(nm_i, fnid->nid); +		read_unlock(&nm_i->nat_tree_lock); +	} +} + +/* + * If this function returns success, caller can obtain a new nid + * from second parameter of this function. + * The returned nid could be used ino as well as nid when inode is created. + */ +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct free_nid *i = NULL; +	struct list_head *this; +retry: +	mutex_lock(&nm_i->build_lock); +	if (!nm_i->fcnt) { +		/* scan NAT in order to build free nid list */ +		build_free_nids(sbi); +		if (!nm_i->fcnt) { +			mutex_unlock(&nm_i->build_lock); +			return false; +		} +	} +	mutex_unlock(&nm_i->build_lock); + +	/* +	 * We check fcnt again since previous check is racy as +	 * we didn't hold free_nid_list_lock. So other thread +	 * could consume all of free nids. +	 */ +	spin_lock(&nm_i->free_nid_list_lock); +	if (!nm_i->fcnt) { +		spin_unlock(&nm_i->free_nid_list_lock); +		goto retry; +	} + +	BUG_ON(list_empty(&nm_i->free_nid_list)); +	list_for_each(this, &nm_i->free_nid_list) { +		i = list_entry(this, struct free_nid, list); +		if (i->state == NID_NEW) +			break; +	} + +	BUG_ON(i->state != NID_NEW); +	*nid = i->nid; +	i->state = NID_ALLOC; +	nm_i->fcnt--; +	spin_unlock(&nm_i->free_nid_list_lock); +	return true; +} + +/* + * alloc_nid() should be called prior to this function. + */ +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct free_nid *i; + +	spin_lock(&nm_i->free_nid_list_lock); +	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); +	if (i) { +		BUG_ON(i->state != NID_ALLOC); +		__del_from_free_nid_list(i); +	} +	spin_unlock(&nm_i->free_nid_list_lock); +} + +/* + * alloc_nid() should be called prior to this function. 
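+ * + * Typical caller pattern (a sketch mirroring get_dnode_of_data() above): + * + *	if (!alloc_nid(sbi, &nid)) + *		return -ENOSPC; + *	page = new_node_page(dn, ofs); + *	if (IS_ERR(page)) + *		alloc_nid_failed(sbi, nid);	(nid goes back to the free list) + *	else + *		alloc_nid_done(sbi, nid);	(nid is consumed for good)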
+ */ +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +{ +	alloc_nid_done(sbi, nid); +	add_free_nid(NM_I(sbi), nid); +} + +void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, +		struct f2fs_summary *sum, struct node_info *ni, +		block_t new_blkaddr) +{ +	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); +	set_node_addr(sbi, ni, new_blkaddr); +	clear_node_page_dirty(page); +} + +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +{ +	struct address_space *mapping = sbi->node_inode->i_mapping; +	struct f2fs_node *src, *dst; +	nid_t ino = ino_of_node(page); +	struct node_info old_ni, new_ni; +	struct page *ipage; + +	ipage = grab_cache_page(mapping, ino); +	if (!ipage) +		return -ENOMEM; + +	/* This ino should not be reused from the free nid list */ +	remove_free_nid(NM_I(sbi), ino); + +	get_node_info(sbi, ino, &old_ni); +	SetPageUptodate(ipage); +	fill_node_footer(ipage, ino, ino, 0, true); + +	src = (struct f2fs_node *)page_address(page); +	dst = (struct f2fs_node *)page_address(ipage); + +	memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); +	dst->i.i_size = 0; +	dst->i.i_blocks = cpu_to_le64(1); +	dst->i.i_links = cpu_to_le32(1); +	dst->i.i_xattr_nid = 0; + +	new_ni = old_ni; +	new_ni.ino = ino; + +	set_node_addr(sbi, &new_ni, NEW_ADDR); +	inc_valid_inode_count(sbi); + +	f2fs_put_page(ipage, 1); +	return 0; +} + +int restore_node_summary(struct f2fs_sb_info *sbi, +			unsigned int segno, struct f2fs_summary_block *sum) +{ +	struct f2fs_node *rn; +	struct f2fs_summary *sum_entry; +	struct page *page; +	block_t addr; +	int i, last_offset; + +	/* allocate a temporary page for reading the node blocks */ +	page = alloc_page(GFP_NOFS | __GFP_ZERO); +	if (!page) +		return -ENOMEM; +	lock_page(page); + +	/* scan the node segment */ +	last_offset = sbi->blocks_per_seg; +	addr = START_BLOCK(sbi, segno); +	sum_entry = &sum->entries[0]; + +	for (i = 0; i < last_offset; i++, sum_entry++) { +		if (f2fs_readpage(sbi, page, addr, READ_SYNC)) +			goto out; + +		rn = (struct f2fs_node *)page_address(page); +		sum_entry->nid = rn->footer.nid; +		sum_entry->version = 0; +		sum_entry->ofs_in_node = 0; +		addr++; + +		/* +		 * In order to read the next node page, +		 * we must clear the PageUptodate flag. 
+		 */ +		ClearPageUptodate(page); +	} +out: +	unlock_page(page); +	__free_pages(page, 0); +	return 0; +} + +static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	int i; + +	mutex_lock(&curseg->curseg_mutex); + +	if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { +		mutex_unlock(&curseg->curseg_mutex); +		return false; +	} + +	for (i = 0; i < nats_in_cursum(sum); i++) { +		struct nat_entry *ne; +		struct f2fs_nat_entry raw_ne; +		nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + +		raw_ne = nat_in_journal(sum, i); +retry: +		write_lock(&nm_i->nat_tree_lock); +		ne = __lookup_nat_cache(nm_i, nid); +		if (ne) { +			__set_nat_cache_dirty(nm_i, ne); +			write_unlock(&nm_i->nat_tree_lock); +			continue; +		} +		ne = grab_nat_entry(nm_i, nid); +		if (!ne) { +			write_unlock(&nm_i->nat_tree_lock); +			goto retry; +		} +		nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr)); +		nat_set_ino(ne, le32_to_cpu(raw_ne.ino)); +		nat_set_version(ne, raw_ne.version); +		__set_nat_cache_dirty(nm_i, ne); +		write_unlock(&nm_i->nat_tree_lock); +	} +	update_nats_in_cursum(sum, -i); +	mutex_unlock(&curseg->curseg_mutex); +	return true; +} + +/* + * This function is called during the checkpointing process. + */ +void flush_nat_entries(struct f2fs_sb_info *sbi) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	struct list_head *cur, *n; +	struct page *page = NULL; +	struct f2fs_nat_block *nat_blk = NULL; +	nid_t start_nid = 0, end_nid = 0; +	bool flushed; + +	flushed = flush_nats_in_journal(sbi); + +	if (!flushed) +		mutex_lock(&curseg->curseg_mutex); + +	/* 1) flush dirty nat caches */ +	list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { +		struct nat_entry *ne; +		nid_t nid; +		struct f2fs_nat_entry raw_ne; +		int offset = -1; +		block_t new_blkaddr; + +		ne = list_entry(cur, struct nat_entry, list); +		nid = nat_get_nid(ne); + +		if (nat_get_blkaddr(ne) == NEW_ADDR) +			continue; +		if (flushed) +			goto to_nat_page; + +		/* if there is room for nat enries in curseg->sumpage */ +		offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); +		if (offset >= 0) { +			raw_ne = nat_in_journal(sum, offset); +			goto flush_now; +		} +to_nat_page: +		if (!page || (start_nid > nid || nid > end_nid)) { +			if (page) { +				f2fs_put_page(page, 1); +				page = NULL; +			} +			start_nid = START_NID(nid); +			end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; + +			/* +			 * get nat block with dirty flag, increased reference +			 * count, mapped and lock +			 */ +			page = get_next_nat_page(sbi, start_nid); +			nat_blk = page_address(page); +		} + +		BUG_ON(!nat_blk); +		raw_ne = nat_blk->entries[nid - start_nid]; +flush_now: +		new_blkaddr = nat_get_blkaddr(ne); + +		raw_ne.ino = cpu_to_le32(nat_get_ino(ne)); +		raw_ne.block_addr = cpu_to_le32(new_blkaddr); +		raw_ne.version = nat_get_version(ne); + +		if (offset < 0) { +			nat_blk->entries[nid - start_nid] = raw_ne; +		} else { +			nat_in_journal(sum, offset) = raw_ne; +			nid_in_journal(sum, offset) = cpu_to_le32(nid); +		} + +		if (nat_get_blkaddr(ne) == NULL_ADDR) { +			write_lock(&nm_i->nat_tree_lock); +			__del_from_nat_cache(nm_i, ne); +			write_unlock(&nm_i->nat_tree_lock); + +			/* We can reuse this freed nid at this point */ +			add_free_nid(NM_I(sbi), nid); +		} else { +			
write_lock(&nm_i->nat_tree_lock); +			__clear_nat_cache_dirty(nm_i, ne); +			ne->checkpointed = true; +			write_unlock(&nm_i->nat_tree_lock); +		} +	} +	if (!flushed) +		mutex_unlock(&curseg->curseg_mutex); +	f2fs_put_page(page, 1); + +	/* 2) shrink nat caches if necessary */ +	try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); +} + +static int init_node_manager(struct f2fs_sb_info *sbi) +{ +	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	unsigned char *version_bitmap; +	unsigned int nat_segs, nat_blocks; + +	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); + +	/* segment_count_nat includes pair segment so divide to 2. */ +	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; +	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); +	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; +	nm_i->fcnt = 0; +	nm_i->nat_cnt = 0; + +	INIT_LIST_HEAD(&nm_i->free_nid_list); +	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); +	INIT_LIST_HEAD(&nm_i->nat_entries); +	INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + +	mutex_init(&nm_i->build_lock); +	spin_lock_init(&nm_i->free_nid_list_lock); +	rwlock_init(&nm_i->nat_tree_lock); + +	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); +	nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); +	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + +	nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); +	if (!nm_i->nat_bitmap) +		return -ENOMEM; +	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); +	if (!version_bitmap) +		return -EFAULT; + +	/* copy version bitmap */ +	memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); +	return 0; +} + +int build_node_manager(struct f2fs_sb_info *sbi) +{ +	int err; + +	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); +	if (!sbi->nm_info) +		return -ENOMEM; + +	err = init_node_manager(sbi); +	if (err) +		return err; + +	build_free_nids(sbi); +	return 0; +} + +void destroy_node_manager(struct f2fs_sb_info *sbi) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct free_nid *i, *next_i; +	struct nat_entry *natvec[NATVEC_SIZE]; +	nid_t nid = 0; +	unsigned int found; + +	if (!nm_i) +		return; + +	/* destroy free nid list */ +	spin_lock(&nm_i->free_nid_list_lock); +	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { +		BUG_ON(i->state == NID_ALLOC); +		__del_from_free_nid_list(i); +		nm_i->fcnt--; +	} +	BUG_ON(nm_i->fcnt); +	spin_unlock(&nm_i->free_nid_list_lock); + +	/* destroy nat cache */ +	write_lock(&nm_i->nat_tree_lock); +	while ((found = __gang_lookup_nat_cache(nm_i, +					nid, NATVEC_SIZE, natvec))) { +		unsigned idx; +		for (idx = 0; idx < found; idx++) { +			struct nat_entry *e = natvec[idx]; +			nid = nat_get_nid(e) + 1; +			__del_from_nat_cache(nm_i, e); +		} +	} +	BUG_ON(nm_i->nat_cnt); +	write_unlock(&nm_i->nat_tree_lock); + +	kfree(nm_i->nat_bitmap); +	sbi->nm_info = NULL; +	kfree(nm_i); +} + +int __init create_node_manager_caches(void) +{ +	nat_entry_slab = f2fs_kmem_cache_create("nat_entry", +			sizeof(struct nat_entry), NULL); +	if (!nat_entry_slab) +		return -ENOMEM; + +	free_nid_slab = f2fs_kmem_cache_create("free_nid", +			sizeof(struct free_nid), NULL); +	if (!free_nid_slab) { +		kmem_cache_destroy(nat_entry_slab); +		return -ENOMEM; +	} +	return 0; +} + +void destroy_node_manager_caches(void) +{ +	kmem_cache_destroy(free_nid_slab); +	kmem_cache_destroy(nat_entry_slab); +} diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h new file mode 100644 index 00000000000..afdb130f782 --- /dev/null +++ 
b/fs/f2fs/node.h @@ -0,0 +1,353 @@ +/* + * fs/f2fs/node.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* start node id of a node block dedicated to the given node id */ +#define	START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) + +/* node block offset on the NAT area dedicated to the given start node id */ +#define	NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform readahead before building free nids */ +#define FREE_NID_PAGES 4 + +/* maximum # of free node ids to produce during build_free_nids */ +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) + +/* maximum readahead size for node during getting data blocks */ +#define MAX_RA_NODE		128 + +/* maximum cached nat entries to manage memory footprint */ +#define NM_WOUT_THRESHOLD	(64 * NAT_ENTRY_PER_BLOCK) + +/* vector size for gang look-up from nat cache that consists of radix tree */ +#define NATVEC_SIZE	64 + +/* + * For node information + */ +struct node_info { +	nid_t nid;		/* node id */ +	nid_t ino;		/* inode number of the node's owner */ +	block_t	blk_addr;	/* block address of the node */ +	unsigned char version;	/* version of the node */ +}; + +struct nat_entry { +	struct list_head list;	/* for clean or dirty nat list */ +	bool checkpointed;	/* whether it is checkpointed or not */ +	struct node_info ni;	/* in-memory node information */ +}; + +#define nat_get_nid(nat)		(nat->ni.nid) +#define nat_set_nid(nat, n)		(nat->ni.nid = n) +#define nat_get_blkaddr(nat)		(nat->ni.blk_addr) +#define nat_set_blkaddr(nat, b)		(nat->ni.blk_addr = b) +#define nat_get_ino(nat)		(nat->ni.ino) +#define nat_set_ino(nat, i)		(nat->ni.ino = i) +#define nat_get_version(nat)		(nat->ni.version) +#define nat_set_version(nat, v)		(nat->ni.version = v) + +#define __set_nat_cache_dirty(nm_i, ne)					\ +	list_move_tail(&ne->list, &nm_i->dirty_nat_entries); +#define __clear_nat_cache_dirty(nm_i, ne)				\ +	list_move_tail(&ne->list, &nm_i->nat_entries); +#define inc_node_version(version)	(++version) + +static inline void node_info_from_raw_nat(struct node_info *ni, +						struct f2fs_nat_entry *raw_ne) +{ +	ni->ino = le32_to_cpu(raw_ne->ino); +	ni->blk_addr = le32_to_cpu(raw_ne->block_addr); +	ni->version = raw_ne->version; +} + +/* + * For free nid mangement + */ +enum nid_state { +	NID_NEW,	/* newly added to free nid list */ +	NID_ALLOC	/* it is allocated */ +}; + +struct free_nid { +	struct list_head list;	/* for free node id list */ +	nid_t nid;		/* node id */ +	int state;		/* in use or not: NID_NEW or NID_ALLOC */ +}; + +static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	struct free_nid *fnid; + +	if (nm_i->fcnt <= 0) +		return -1; +	spin_lock(&nm_i->free_nid_list_lock); +	fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); +	*nid = fnid->nid; +	spin_unlock(&nm_i->free_nid_list_lock); +	return 0; +} + +/* + * inline functions + */ +static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); +} + +static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); +	pgoff_t block_off; +	pgoff_t block_addr; +	
int seg_off; + +	block_off = NAT_BLOCK_OFFSET(start); +	seg_off = block_off >> sbi->log_blocks_per_seg; + +	block_addr = (pgoff_t)(nm_i->nat_blkaddr + +		(seg_off << sbi->log_blocks_per_seg << 1) + +		(block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + +	if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) +		block_addr += sbi->blocks_per_seg; + +	return block_addr; +} + +static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, +						pgoff_t block_addr) +{ +	struct f2fs_nm_info *nm_i = NM_I(sbi); + +	block_addr -= nm_i->nat_blkaddr; +	if ((block_addr >> sbi->log_blocks_per_seg) % 2) +		block_addr -= sbi->blocks_per_seg; +	else +		block_addr += sbi->blocks_per_seg; + +	return block_addr + nm_i->nat_blkaddr; +} + +static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) +{ +	unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); + +	if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) +		f2fs_clear_bit(block_off, nm_i->nat_bitmap); +	else +		f2fs_set_bit(block_off, nm_i->nat_bitmap); +} + +static inline void fill_node_footer(struct page *page, nid_t nid, +				nid_t ino, unsigned int ofs, bool reset) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	if (reset) +		memset(rn, 0, sizeof(*rn)); +	rn->footer.nid = cpu_to_le32(nid); +	rn->footer.ino = cpu_to_le32(ino); +	rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); +} + +static inline void copy_node_footer(struct page *dst, struct page *src) +{ +	void *src_addr = page_address(src); +	void *dst_addr = page_address(dst); +	struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; +	struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; +	memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); +} + +static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	rn->footer.cp_ver = ckpt->checkpoint_ver; +	rn->footer.next_blkaddr = cpu_to_le32(blkaddr); +} + +static inline nid_t ino_of_node(struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned flag = le32_to_cpu(rn->footer.flag); +	return flag >> OFFSET_BIT_SHIFT; +} + +static inline unsigned long long cpver_of_node(struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	return le32_to_cpu(rn->footer.next_blkaddr); +} + +/* + * f2fs assigns the following node offsets described as (num). 
+ * N = NIDS_PER_BLOCK + * + *  Inode block (0) + *    |- direct node (1) + *    |- direct node (2) + *    |- indirect node (3) + *    |            `- direct node (4 => 4 + N - 1) + *    |- indirect node (4 + N) + *    |            `- direct node (5 + N => 5 + 2N - 1) + *    `- double indirect node (5 + 2N) + *                 `- indirect node (6 + 2N) + *                       `- direct node (x(N + 1)) + */ +static inline bool IS_DNODE(struct page *node_page) +{ +	unsigned int ofs = ofs_of_node(node_page); +	if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || +			ofs == 5 + 2 * NIDS_PER_BLOCK) +		return false; +	if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { +		ofs -= 6 + 2 * NIDS_PER_BLOCK; +		if ((long int)ofs % (NIDS_PER_BLOCK + 1)) +			return false; +	} +	return true; +} + +static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +{ +	struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + +	wait_on_page_writeback(p); + +	if (i) +		rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); +	else +		rn->in.nid[off] = cpu_to_le32(nid); +	set_page_dirty(p); +} + +static inline nid_t get_nid(struct page *p, int off, bool i) +{ +	struct f2fs_node *rn = (struct f2fs_node *)page_address(p); +	if (i) +		return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); +	return le32_to_cpu(rn->in.nid[off]); +} + +/* + * Coldness identification: + *  - Mark cold files in f2fs_inode_info + *  - Mark cold node blocks in their node footer + *  - Mark cold data pages in page cache + */ +static inline int is_cold_file(struct inode *inode) +{ +	return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; +} + +static inline int is_cold_data(struct page *page) +{ +	return PageChecked(page); +} + +static inline void set_cold_data(struct page *page) +{ +	SetPageChecked(page); +} + +static inline void clear_cold_data(struct page *page) +{ +	ClearPageChecked(page); +} + +static inline int is_cold_node(struct page *page) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned int flag = le32_to_cpu(rn->footer.flag); +	return flag & (0x1 << COLD_BIT_SHIFT); +} + +static inline unsigned char is_fsync_dnode(struct page *page) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned int flag = le32_to_cpu(rn->footer.flag); +	return flag & (0x1 << FSYNC_BIT_SHIFT); +} + +static inline unsigned char is_dent_dnode(struct page *page) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned int flag = le32_to_cpu(rn->footer.flag); +	return flag & (0x1 << DENT_BIT_SHIFT); +} + +static inline void set_cold_node(struct inode *inode, struct page *page) +{ +	struct f2fs_node *rn = (struct f2fs_node *)page_address(page); +	unsigned int flag = le32_to_cpu(rn->footer.flag); + +	if (S_ISDIR(inode->i_mode)) +		flag &= ~(0x1 << COLD_BIT_SHIFT); +	else +		flag |= (0x1 << COLD_BIT_SHIFT); +	rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_fsync_mark(struct page *page, int mark) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned int flag = le32_to_cpu(rn->footer.flag); +	if (mark) +		flag |= (0x1 << FSYNC_BIT_SHIFT); +	else +		flag &= ~(0x1 << FSYNC_BIT_SHIFT); +	rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_dentry_mark(struct page *page, int mark) +{ +	void *kaddr = page_address(page); +	struct f2fs_node *rn = (struct f2fs_node *)kaddr; +	unsigned int flag = le32_to_cpu(rn->footer.flag); +	if (mark) +		flag |= (0x1 << 
DENT_BIT_SHIFT); +	else +		flag &= ~(0x1 << DENT_BIT_SHIFT); +	rn->footer.flag = cpu_to_le32(flag); +} diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c new file mode 100644 index 00000000000..b235215ac13 --- /dev/null +++ b/fs/f2fs/recovery.c @@ -0,0 +1,375 @@ +/* + * fs/f2fs/recovery.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +static struct kmem_cache *fsync_entry_slab; + +bool space_for_roll_forward(struct f2fs_sb_info *sbi) +{ +	if (sbi->last_valid_block_count + sbi->alloc_valid_block_count +			> sbi->user_block_count) +		return false; +	return true; +} + +static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, +								nid_t ino) +{ +	struct list_head *this; +	struct fsync_inode_entry *entry; + +	list_for_each(this, head) { +		entry = list_entry(this, struct fsync_inode_entry, list); +		if (entry->inode->i_ino == ino) +			return entry; +	} +	return NULL; +} + +static int recover_dentry(struct page *ipage, struct inode *inode) +{ +	struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); +	struct f2fs_inode *raw_inode = &(raw_node->i); +	struct qstr name; +	struct f2fs_dir_entry *de; +	struct page *page; +	struct inode *dir; +	int err = 0; + +	if (!is_dent_dnode(ipage)) +		goto out; + +	dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); +	if (IS_ERR(dir)) { +		err = -EINVAL; +		goto out; +	} + +	name.len = le32_to_cpu(raw_inode->i_namelen); +	name.name = raw_inode->i_name; + +	de = f2fs_find_entry(dir, &name, &page); +	if (de) { +		kunmap(page); +		f2fs_put_page(page, 0); +	} else { +		err = __f2fs_add_link(dir, &name, inode); +	} +	iput(dir); +out: +	kunmap(ipage); +	return err; +} + +static int recover_inode(struct inode *inode, struct page *node_page) +{ +	void *kaddr = page_address(node_page); +	struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; +	struct f2fs_inode *raw_inode = &(raw_node->i); + +	inode->i_mode = le16_to_cpu(raw_inode->i_mode); +	i_size_write(inode, le64_to_cpu(raw_inode->i_size)); +	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); +	inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); +	inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); +	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); +	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); +	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + +	return recover_dentry(node_page, inode); +} + +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +{ +	unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); +	struct curseg_info *curseg; +	struct page *page; +	block_t blkaddr; +	int err = 0; + +	/* get node pages in the current segment */ +	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); +	blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + +	/* read node page */ +	page = alloc_page(GFP_F2FS_ZERO); +	if (IS_ERR(page)) +		return PTR_ERR(page); +	lock_page(page); + +	while (1) { +		struct fsync_inode_entry *entry; + +		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) +			goto out; + +		if (cp_ver != cpver_of_node(page)) +			goto out; + +		if (!is_fsync_dnode(page)) +			goto next; + +		entry = get_fsync_inode(head, 
ino_of_node(page)); +		if (entry) { +			entry->blkaddr = blkaddr; +			if (IS_INODE(page) && is_dent_dnode(page)) +				set_inode_flag(F2FS_I(entry->inode), +							FI_INC_LINK); +		} else { +			if (IS_INODE(page) && is_dent_dnode(page)) { +				if (recover_inode_page(sbi, page)) { +					err = -ENOMEM; +					goto out; +				} +			} + +			/* add this fsync inode to the list */ +			entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); +			if (!entry) { +				err = -ENOMEM; +				goto out; +			} + +			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); +			if (IS_ERR(entry->inode)) { +				err = PTR_ERR(entry->inode); +				kmem_cache_free(fsync_entry_slab, entry); +				goto out; +			} + +			list_add_tail(&entry->list, head); +			entry->blkaddr = blkaddr; +		} +		if (IS_INODE(page)) { +			err = recover_inode(entry->inode, page); +			if (err) +				goto out; +		} +next: +		/* check next segment */ +		blkaddr = next_blkaddr_of_node(page); +		ClearPageUptodate(page); +	} +out: +	unlock_page(page); +	__free_pages(page, 0); +	return err; +} + +static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, +					struct list_head *head) +{ +	struct fsync_inode_entry *entry, *tmp; + +	list_for_each_entry_safe(entry, tmp, head, list) { +		iput(entry->inode); +		list_del(&entry->list); +		kmem_cache_free(fsync_entry_slab, entry); +	} +} + +static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, +						block_t blkaddr) +{ +	struct seg_entry *sentry; +	unsigned int segno = GET_SEGNO(sbi, blkaddr); +	unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & +					(sbi->blocks_per_seg - 1); +	struct f2fs_summary sum; +	nid_t ino; +	void *kaddr; +	struct inode *inode; +	struct page *node_page; +	block_t bidx; +	int i; + +	sentry = get_seg_entry(sbi, segno); +	if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) +		return; + +	/* Get the previous summary */ +	for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { +		struct curseg_info *curseg = CURSEG_I(sbi, i); +		if (curseg->segno == segno) { +			sum = curseg->sum_blk->entries[blkoff]; +			break; +		} +	} +	if (i > CURSEG_COLD_DATA) { +		struct page *sum_page = get_sum_page(sbi, segno); +		struct f2fs_summary_block *sum_node; +		kaddr = page_address(sum_page); +		sum_node = (struct f2fs_summary_block *)kaddr; +		sum = sum_node->entries[blkoff]; +		f2fs_put_page(sum_page, 1); +	} + +	/* Get the node page */ +	node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); +	bidx = start_bidx_of_node(ofs_of_node(node_page)) + +				le16_to_cpu(sum.ofs_in_node); +	ino = ino_of_node(node_page); +	f2fs_put_page(node_page, 1); + +	/* Deallocate previous index in the node page */ +	inode = f2fs_iget(sbi->sb, ino); +	if (IS_ERR(inode)) +		return; + +	truncate_hole(inode, bidx, bidx + 1); +	iput(inode); +} + +static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, +					struct page *page, block_t blkaddr) +{ +	unsigned int start, end; +	struct dnode_of_data dn; +	struct f2fs_summary sum; +	struct node_info ni; + +	start = start_bidx_of_node(ofs_of_node(page)); +	if (IS_INODE(page)) +		end = start + ADDRS_PER_INODE; +	else +		end = start + ADDRS_PER_BLOCK; + +	set_new_dnode(&dn, inode, NULL, NULL, 0); +	if (get_dnode_of_data(&dn, start, 0)) +		return; + +	wait_on_page_writeback(dn.node_page); + +	get_node_info(sbi, dn.nid, &ni); +	BUG_ON(ni.ino != ino_of_node(page)); +	BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); + +	for (; start < end; start++) { +		block_t src, dest; + +		src = datablock_addr(dn.node_page, dn.ofs_in_node); +		dest = 
datablock_addr(page, dn.ofs_in_node); + +		if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { +			if (src == NULL_ADDR) { +				int err = reserve_new_block(&dn); +				/* We should not get -ENOSPC */ +				BUG_ON(err); +			} + +			/* Check the previous node page having this index */ +			check_index_in_prev_nodes(sbi, dest); + +			set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + +			/* write dummy data page */ +			recover_data_page(sbi, NULL, &sum, src, dest); +			update_extent_cache(dest, &dn); +		} +		dn.ofs_in_node++; +	} + +	/* write node page in place */ +	set_summary(&sum, dn.nid, 0, 0); +	if (IS_INODE(dn.node_page)) +		sync_inode_page(&dn); + +	copy_node_footer(dn.node_page, page); +	fill_node_footer(dn.node_page, dn.nid, ni.ino, +					ofs_of_node(page), false); +	set_page_dirty(dn.node_page); + +	recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); +	f2fs_put_dnode(&dn); +} + +static void recover_data(struct f2fs_sb_info *sbi, +				struct list_head *head, int type) +{ +	unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); +	struct curseg_info *curseg; +	struct page *page; +	block_t blkaddr; + +	/* get node pages in the current segment */ +	curseg = CURSEG_I(sbi, type); +	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + +	/* read node page */ +	page = alloc_page(GFP_NOFS | __GFP_ZERO); +	if (IS_ERR(page)) +		return; +	lock_page(page); + +	while (1) { +		struct fsync_inode_entry *entry; + +		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) +			goto out; + +		if (cp_ver != cpver_of_node(page)) +			goto out; + +		entry = get_fsync_inode(head, ino_of_node(page)); +		if (!entry) +			goto next; + +		do_recover_data(sbi, entry->inode, page, blkaddr); + +		if (entry->blkaddr == blkaddr) { +			iput(entry->inode); +			list_del(&entry->list); +			kmem_cache_free(fsync_entry_slab, entry); +		} +next: +		/* check next segment */ +		blkaddr = next_blkaddr_of_node(page); +		ClearPageUptodate(page); +	} +out: +	unlock_page(page); +	__free_pages(page, 0); + +	allocate_new_segments(sbi); +} + +void recover_fsync_data(struct f2fs_sb_info *sbi) +{ +	struct list_head inode_list; + +	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", +			sizeof(struct fsync_inode_entry), NULL); +	if (unlikely(!fsync_entry_slab)) +		return; + +	INIT_LIST_HEAD(&inode_list); + +	/* step #1: find fsynced inode numbers */ +	if (find_fsync_dnodes(sbi, &inode_list)) +		goto out; + +	if (list_empty(&inode_list)) +		goto out; + +	/* step #2: recover data */ +	sbi->por_doing = 1; +	recover_data(sbi, &inode_list, CURSEG_WARM_NODE); +	sbi->por_doing = 0; +	BUG_ON(!list_empty(&inode_list)); +out: +	destroy_fsync_dnodes(sbi, &inode_list); +	kmem_cache_destroy(fsync_entry_slab); +	write_checkpoint(sbi, false); +} diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c new file mode 100644 index 00000000000..777f17e496e --- /dev/null +++ b/fs/f2fs/segment.c @@ -0,0 +1,1770 @@ +/* + * fs/f2fs/segment.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/prefetch.h> +#include <linux/vmalloc.h> + +#include "f2fs.h" +#include "segment.h" +#include "node.h" + +/* + * This function balances dirty node and dentry pages. 
+ * In addition, it controls garbage collection. + */ +void f2fs_balance_fs(struct f2fs_sb_info *sbi) +{ +	/* +	 * We should do GC or end up with checkpoint, if there are so many dirty +	 * dir/node pages without enough free segments. +	 */ +	if (has_not_enough_free_secs(sbi, 0)) { +		mutex_lock(&sbi->gc_mutex); +		f2fs_gc(sbi); +	} +} + +static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, +		enum dirty_type dirty_type) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + +	/* need not be added */ +	if (IS_CURSEG(sbi, segno)) +		return; + +	if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) +		dirty_i->nr_dirty[dirty_type]++; + +	if (dirty_type == DIRTY) { +		struct seg_entry *sentry = get_seg_entry(sbi, segno); +		dirty_type = sentry->type; +		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) +			dirty_i->nr_dirty[dirty_type]++; +	} +} + +static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, +		enum dirty_type dirty_type) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + +	if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) +		dirty_i->nr_dirty[dirty_type]--; + +	if (dirty_type == DIRTY) { +		struct seg_entry *sentry = get_seg_entry(sbi, segno); +		dirty_type = sentry->type; +		if (test_and_clear_bit(segno, +					dirty_i->dirty_segmap[dirty_type])) +			dirty_i->nr_dirty[dirty_type]--; +		clear_bit(segno, dirty_i->victim_segmap[FG_GC]); +		clear_bit(segno, dirty_i->victim_segmap[BG_GC]); +	} +} + +/* + * Should not occur error such as -ENOMEM. + * Adding dirty entry into seglist is not critical operation. + * If a given segment is one of current working segments, it won't be added. + */ +void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned short valid_blocks; + +	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) +		return; + +	mutex_lock(&dirty_i->seglist_lock); + +	valid_blocks = get_valid_blocks(sbi, segno, 0); + +	if (valid_blocks == 0) { +		__locate_dirty_segment(sbi, segno, PRE); +		__remove_dirty_segment(sbi, segno, DIRTY); +	} else if (valid_blocks < sbi->blocks_per_seg) { +		__locate_dirty_segment(sbi, segno, DIRTY); +	} else { +		/* Recovery routine with SSR needs this */ +		__remove_dirty_segment(sbi, segno, DIRTY); +	} + +	mutex_unlock(&dirty_i->seglist_lock); +	return; +} + +/* + * Should call clear_prefree_segments after checkpoint is done. 
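+ * + * Lifecycle of a fully invalidated segment (sketch of the code below): + * + *	all blocks invalid    -> locate_dirty_segment() marks it PRE + *	checkpoint completes  -> set_prefree_as_free_segments() frees it + *	after the checkpoint  -> clear_prefree_segments() drops the PRE bit + *	                         and, with the discard option, TRIMs it + * + * reusing a prefree segment any earlier could destroy blocks that the + * last checkpoint still references.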
+ */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned int segno, offset = 0; +	unsigned int total_segs = TOTAL_SEGS(sbi); + +	mutex_lock(&dirty_i->seglist_lock); +	while (1) { +		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, +				offset); +		if (segno >= total_segs) +			break; +		__set_test_and_free(sbi, segno); +		offset = segno + 1; +	} +	mutex_unlock(&dirty_i->seglist_lock); +} + +void clear_prefree_segments(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned int segno, offset = 0; +	unsigned int total_segs = TOTAL_SEGS(sbi); + +	mutex_lock(&dirty_i->seglist_lock); +	while (1) { +		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, +				offset); +		if (segno >= total_segs) +			break; + +		offset = segno + 1; +		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) +			dirty_i->nr_dirty[PRE]--; + +		/* Let's use trim */ +		if (test_opt(sbi, DISCARD)) +			blkdev_issue_discard(sbi->sb->s_bdev, +					START_BLOCK(sbi, segno) << +					sbi->log_sectors_per_block, +					1 << (sbi->log_sectors_per_block + +						sbi->log_blocks_per_seg), +					GFP_NOFS, 0); +	} +	mutex_unlock(&dirty_i->seglist_lock); +} + +static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) +		sit_i->dirty_sentries++; +} + +static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, +					unsigned int segno, int modified) +{ +	struct seg_entry *se = get_seg_entry(sbi, segno); +	se->type = type; +	if (modified) +		__mark_sit_entry_dirty(sbi, segno); +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ +	struct seg_entry *se; +	unsigned int segno, offset; +	long int new_vblocks; + +	segno = GET_SEGNO(sbi, blkaddr); + +	se = get_seg_entry(sbi, segno); +	new_vblocks = se->valid_blocks + del; +	offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + +	BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || +				(new_vblocks > sbi->blocks_per_seg))); + +	se->valid_blocks = new_vblocks; +	se->mtime = get_mtime(sbi); +	SIT_I(sbi)->max_mtime = se->mtime; + +	/* Update valid block bitmap */ +	if (del > 0) { +		if (f2fs_set_bit(offset, se->cur_valid_map)) +			BUG(); +	} else { +		if (!f2fs_clear_bit(offset, se->cur_valid_map)) +			BUG(); +	} +	if (!f2fs_test_bit(offset, se->ckpt_valid_map)) +		se->ckpt_valid_blocks += del; + +	__mark_sit_entry_dirty(sbi, segno); + +	/* update total number of valid blocks to be written in ckpt area */ +	SIT_I(sbi)->written_valid_blocks += del; + +	if (sbi->segs_per_sec > 1) +		get_sec_entry(sbi, segno)->valid_blocks += del; +} + +static void refresh_sit_entry(struct f2fs_sb_info *sbi, +			block_t old_blkaddr, block_t new_blkaddr) +{ +	update_sit_entry(sbi, new_blkaddr, 1); +	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) +		update_sit_entry(sbi, old_blkaddr, -1); +} + +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +{ +	unsigned int segno = GET_SEGNO(sbi, addr); +	struct sit_info *sit_i = SIT_I(sbi); + +	BUG_ON(addr == NULL_ADDR); +	if (addr == NEW_ADDR) +		return; + +	/* add it into sit main buffer */ +	mutex_lock(&sit_i->sentry_lock); + +	update_sit_entry(sbi, addr, -1); + +	/* add it into dirty seglist */ +	locate_dirty_segment(sbi, segno); + +	mutex_unlock(&sit_i->sentry_lock); +} + +/* + * This function should be resided under the 
curseg_mutex lock + */ +static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, +		struct f2fs_summary *sum, unsigned short offset) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	void *addr = curseg->sum_blk; +	addr += offset * sizeof(struct f2fs_summary); +	memcpy(addr, sum, sizeof(struct f2fs_summary)); +	return; +} + +/* + * Calculate the number of current summary pages for writing + */ +int npages_for_summary_flush(struct f2fs_sb_info *sbi) +{ +	int total_size_bytes = 0; +	int valid_sum_count = 0; +	int i, sum_space; + +	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { +		if (sbi->ckpt->alloc_type[i] == SSR) +			valid_sum_count += sbi->blocks_per_seg; +		else +			valid_sum_count += curseg_blkoff(sbi, i); +	} + +	total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1) +			+ sizeof(struct nat_journal) + 2 +			+ sizeof(struct sit_journal) + 2; +	sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE; +	if (total_size_bytes < sum_space) +		return 1; +	else if (total_size_bytes < 2 * sum_space) +		return 2; +	return 3; +} + +/* + * Caller should put this summary page + */ +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); +} + +static void write_sum_page(struct f2fs_sb_info *sbi, +			struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ +	struct page *page = grab_meta_page(sbi, blk_addr); +	void *kaddr = page_address(page); +	memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE); +	set_page_dirty(page); +	f2fs_put_page(page, 1); +} + +static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, +					int ofs_unit, int type) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; +	unsigned int segno, next_segno, i; +	int ofs = 0; + +	/* +	 * If there is not enough reserved sections, +	 * we should not reuse prefree segments. +	 */ +	if (has_not_enough_free_secs(sbi, 0)) +		return NULL_SEGNO; + +	/* +	 * NODE page should not reuse prefree segment, +	 * since those information is used for SPOR. 
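+ * (SPOR: sudden power-off recovery. The roll-forward code in recovery.c + * follows node chains through the current node segments at mount time, + * so those blocks must not be overwritten by segment reuse before the + * next checkpoint.)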
+	 */ +	if (IS_NODESEG(type)) +		return NULL_SEGNO; +next: +	segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++); +	ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit; +	if (segno < TOTAL_SEGS(sbi)) { +		/* skip intermediate segments in a section */ +		if (segno % ofs_unit) +			goto next; + +		/* skip if whole section is not prefree */ +		next_segno = find_next_zero_bit(prefree_segmap, +						TOTAL_SEGS(sbi), segno + 1); +		if (next_segno - segno < ofs_unit) +			goto next; + +		/* skip if whole section was not free at the last checkpoint */ +		for (i = 0; i < ofs_unit; i++) +			if (get_seg_entry(sbi, segno)->ckpt_valid_blocks) +				goto next; +		return segno; +	} +	return NULL_SEGNO; +} + +/* + * Find a new segment from the free segments bitmap in the right order. + * This function should always succeed; otherwise, BUG. + */ +static void get_new_segment(struct f2fs_sb_info *sbi, +			unsigned int *newseg, bool new_sec, int dir) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int total_secs = sbi->total_sections; +	unsigned int segno, secno, zoneno; +	unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone; +	unsigned int hint = *newseg / sbi->segs_per_sec; +	unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); +	unsigned int left_start = hint; +	bool init = true; +	int go_left = 0; +	int i; + +	write_lock(&free_i->segmap_lock); + +	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { +		segno = find_next_zero_bit(free_i->free_segmap, +					TOTAL_SEGS(sbi), *newseg + 1); +		if (segno < TOTAL_SEGS(sbi)) +			goto got_it; +	} +find_other_zone: +	secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint); +	if (secno >= total_secs) { +		if (dir == ALLOC_RIGHT) { +			secno = find_next_zero_bit(free_i->free_secmap, +						total_secs, 0); +			BUG_ON(secno >= total_secs); +		} else { +			go_left = 1; +			left_start = hint - 1; +		} +	} +	if (go_left == 0) +		goto skip_left; + +	while (test_bit(left_start, free_i->free_secmap)) { +		if (left_start > 0) { +			left_start--; +			continue; +		} +		left_start = find_next_zero_bit(free_i->free_secmap, +						total_secs, 0); +		BUG_ON(left_start >= total_secs); +		break; +	} +	secno = left_start; +skip_left: +	hint = secno; +	segno = secno * sbi->segs_per_sec; +	zoneno = secno / sbi->secs_per_zone; + +	/* give up on finding another zone */ +	if (!init) +		goto got_it; +	if (sbi->secs_per_zone == 1) +		goto got_it; +	if (zoneno == old_zoneno) +		goto got_it; +	if (dir == ALLOC_LEFT) { +		if (!go_left && zoneno + 1 >= total_zones) +			goto got_it; +		if (go_left && zoneno == 0) +			goto got_it; +	} +	for (i = 0; i < NR_CURSEG_TYPE; i++) +		if (CURSEG_I(sbi, i)->zone == zoneno) +			break; + +	if (i < NR_CURSEG_TYPE) { +		/* zone is in use, try another */ +		if (go_left) +			hint = zoneno * sbi->secs_per_zone - 1; +		else if (zoneno + 1 >= total_zones) +			hint = 0; +		else +			hint = (zoneno + 1) * sbi->secs_per_zone; +		init = false; +		goto find_other_zone; +	} +got_it: +	/* set it as dirty segment in free segmap */ +	BUG_ON(test_bit(segno, free_i->free_segmap)); +	__set_inuse(sbi, segno); +	*newseg = segno; +	write_unlock(&free_i->segmap_lock); +} + +static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	struct summary_footer *sum_footer; + +	curseg->segno = curseg->next_segno; +	curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); +	curseg->next_blkoff = 0; +	curseg->next_segno = NULL_SEGNO; + +	sum_footer = 
+ +static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	struct summary_footer *sum_footer; + +	curseg->segno = curseg->next_segno; +	curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); +	curseg->next_blkoff = 0; +	curseg->next_segno = NULL_SEGNO; + +	sum_footer = &(curseg->sum_blk->footer); +	memset(sum_footer, 0, sizeof(struct summary_footer)); +	if (IS_DATASEG(type)) +		SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); +	if (IS_NODESEG(type)) +		SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); +	__set_sit_entry_type(sbi, type, curseg->segno, modified); +} + +/* + * Allocate a current working segment. + * This function always allocates a free segment in LFS manner. + */ +static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	unsigned int segno = curseg->segno; +	int dir = ALLOC_LEFT; + +	write_sum_page(sbi, curseg->sum_blk, +				GET_SUM_BLOCK(sbi, curseg->segno)); +	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) +		dir = ALLOC_RIGHT; + +	if (test_opt(sbi, NOHEAP)) +		dir = ALLOC_RIGHT; + +	get_new_segment(sbi, &segno, new_sec, dir); +	curseg->next_segno = segno; +	reset_curseg(sbi, type, 1); +	curseg->alloc_type = LFS; +} + +static void __next_free_blkoff(struct f2fs_sb_info *sbi, +			struct curseg_info *seg, block_t start) +{ +	struct seg_entry *se = get_seg_entry(sbi, seg->segno); +	block_t ofs; +	for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { +		if (!f2fs_test_bit(ofs, se->ckpt_valid_map) +			&& !f2fs_test_bit(ofs, se->cur_valid_map)) +			break; +	} +	seg->next_blkoff = ofs; +} + +/* + * If a segment is written in LFS manner, the next block offset is simply + * obtained by increasing the current block offset. However, if a segment is + * written in SSR manner, the next block offset is obtained by calling + * __next_free_blkoff. + */ +static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, +				struct curseg_info *seg) +{ +	if (seg->alloc_type == SSR) +		__next_free_blkoff(sbi, seg, seg->next_blkoff + 1); +	else +		seg->next_blkoff++; +} + +/* + * This function always allocates a used segment (from the dirty seglist) in + * SSR manner, so it should recover the segment information of the existing + * valid blocks. + */ +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	unsigned int new_segno = curseg->next_segno; +	struct f2fs_summary_block *sum_node; +	struct page *sum_page; + +	write_sum_page(sbi, curseg->sum_blk, +				GET_SUM_BLOCK(sbi, curseg->segno)); +	__set_test_and_inuse(sbi, new_segno); + +	mutex_lock(&dirty_i->seglist_lock); +	__remove_dirty_segment(sbi, new_segno, PRE); +	__remove_dirty_segment(sbi, new_segno, DIRTY); +	mutex_unlock(&dirty_i->seglist_lock); + +	reset_curseg(sbi, type, 1); +	curseg->alloc_type = SSR; +	__next_free_blkoff(sbi, curseg, 0); + +	if (reuse) { +		sum_page = get_sum_page(sbi, new_segno); +		sum_node = (struct f2fs_summary_block *)page_address(sum_page); +		memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); +		f2fs_put_page(sum_page, 1); +	} +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + +	if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) +		return v_ops->get_victim(sbi, +				&(curseg)->next_segno, BG_GC, type, SSR); + +	/* For data segments, let's do SSR more intensively */ +	for (; type >= CURSEG_HOT_DATA; type--) +		if (v_ops->get_victim(sbi, &(curseg)->next_segno, +						BG_GC, type, SSR)) +			return 1; +	return 0; +} + +/* + * Flush out the current segment and replace it with a new one. + * This function must always succeed; otherwise it is a BUG. + */ +static void 
allocate_segment_by_default(struct f2fs_sb_info *sbi, +						int type, bool force) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	unsigned int ofs_unit; + +	if (force) { +		new_curseg(sbi, type, true); +		goto out; +	} + +	ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec; +	curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); + +	if (curseg->next_segno != NULL_SEGNO) +		change_curseg(sbi, type, false); +	else if (type == CURSEG_WARM_NODE) +		new_curseg(sbi, type, false); +	else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) +		change_curseg(sbi, type, true); +	else +		new_curseg(sbi, type, false); +out: +	sbi->segment_count[curseg->alloc_type]++; +} + +void allocate_new_segments(struct f2fs_sb_info *sbi) +{ +	struct curseg_info *curseg; +	unsigned int old_curseg; +	int i; + +	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { +		curseg = CURSEG_I(sbi, i); +		old_curseg = curseg->segno; +		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); +		locate_dirty_segment(sbi, old_curseg); +	} +} + +static const struct segment_allocation default_salloc_ops = { +	.allocate_segment = allocate_segment_by_default, +}; + +static void f2fs_end_io_write(struct bio *bio, int err) +{ +	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; +	struct bio_private *p = bio->bi_private; + +	do { +		struct page *page = bvec->bv_page; + +		if (--bvec >= bio->bi_io_vec) +			prefetchw(&bvec->bv_page->flags); +		if (!uptodate) { +			SetPageError(page); +			if (page->mapping) +				set_bit(AS_EIO, &page->mapping->flags); +			set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); +			p->sbi->sb->s_flags |= MS_RDONLY; +		} +		end_page_writeback(page); +		dec_page_count(p->sbi, F2FS_WRITEBACK); +	} while (bvec >= bio->bi_io_vec); + +	if (p->is_sync) +		complete(p->wait); +	kfree(p); +	bio_put(bio); +} + +struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) +{ +	struct bio *bio; +	struct bio_private *priv; +retry: +	priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); +	if (!priv) { +		cond_resched(); +		goto retry; +	} + +	/* No failure on bio allocation */ +	bio = bio_alloc(GFP_NOIO, npages); +	bio->bi_bdev = bdev; +	bio->bi_private = priv; +	return bio; +} + +static void do_submit_bio(struct f2fs_sb_info *sbi, +				enum page_type type, bool sync) +{ +	int rw = sync ? WRITE_SYNC : WRITE; +	enum page_type btype = type > META ? 
META : type; + +	if (type >= META_FLUSH) +		rw = WRITE_FLUSH_FUA; + +	if (sbi->bio[btype]) { +		struct bio_private *p = sbi->bio[btype]->bi_private; +		p->sbi = sbi; +		sbi->bio[btype]->bi_end_io = f2fs_end_io_write; +		if (type == META_FLUSH) { +			DECLARE_COMPLETION_ONSTACK(wait); +			p->is_sync = true; +			p->wait = &wait; +			submit_bio(rw, sbi->bio[btype]); +			wait_for_completion(&wait); +		} else { +			p->is_sync = false; +			submit_bio(rw, sbi->bio[btype]); +		} +		sbi->bio[btype] = NULL; +	} +} + +void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) +{ +	down_write(&sbi->bio_sem); +	do_submit_bio(sbi, type, sync); +	up_write(&sbi->bio_sem); +} + +static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, +				block_t blk_addr, enum page_type type) +{ +	struct block_device *bdev = sbi->sb->s_bdev; + +	verify_block_addr(sbi, blk_addr); + +	down_write(&sbi->bio_sem); + +	inc_page_count(sbi, F2FS_WRITEBACK); + +	if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) +		do_submit_bio(sbi, type, false); +alloc_new: +	if (sbi->bio[type] == NULL) { +		sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); +		sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); +		/* +		 * The end_io will be assigned at the submission phase. +		 * Until then, let bio_add_page() merge consecutive IOs as much +		 * as possible. +		 */ +	} + +	if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < +							PAGE_CACHE_SIZE) { +		do_submit_bio(sbi, type, false); +		goto alloc_new; +	} + +	sbi->last_block_in_bio[type] = blk_addr; + +	up_write(&sbi->bio_sem); +} + +static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	if (curseg->next_blkoff < sbi->blocks_per_seg) +		return true; +	return false; +} + +static int __get_segment_type_2(struct page *page, enum page_type p_type) +{ +	if (p_type == DATA) +		return CURSEG_HOT_DATA; +	else +		return CURSEG_HOT_NODE; +} + +static int __get_segment_type_4(struct page *page, enum page_type p_type) +{ +	if (p_type == DATA) { +		struct inode *inode = page->mapping->host; + +		if (S_ISDIR(inode->i_mode)) +			return CURSEG_HOT_DATA; +		else +			return CURSEG_COLD_DATA; +	} else { +		if (IS_DNODE(page) && !is_cold_node(page)) +			return CURSEG_HOT_NODE; +		else +			return CURSEG_COLD_NODE; +	} +} + +static int __get_segment_type_6(struct page *page, enum page_type p_type) +{ +	if (p_type == DATA) { +		struct inode *inode = page->mapping->host; + +		if (S_ISDIR(inode->i_mode)) +			return CURSEG_HOT_DATA; +		else if (is_cold_data(page) || is_cold_file(inode)) +			return CURSEG_COLD_DATA; +		else +			return CURSEG_WARM_DATA; +	} else { +		if (IS_DNODE(page)) +			return is_cold_node(page) ? CURSEG_WARM_NODE : +						CURSEG_HOT_NODE; +		else +			return CURSEG_COLD_NODE; +	} +} + +static int __get_segment_type(struct page *page, enum page_type p_type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); +	switch (sbi->active_logs) { +	case 2: +		return __get_segment_type_2(page, p_type); +	case 4: +		return __get_segment_type_4(page, p_type); +	} +	/* NR_CURSEG_TYPE(6) logs by default */ +	BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); +	return __get_segment_type_6(page, p_type); +}
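+ +/* + * Summary of the default six-log policy implemented above (descriptive + * note only): directory data goes to the hot data log, cold-marked data + * or data of cold files to the cold data log, and all other data to the + * warm data log; direct node blocks go to the hot node log (or the warm + * node log when cold-marked), and indirect node blocks to the cold node + * log. + */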
+ +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, +			block_t old_blkaddr, block_t *new_blkaddr, +			struct f2fs_summary *sum, enum page_type p_type) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	struct curseg_info *curseg; +	unsigned int old_cursegno; +	int type; + +	type = __get_segment_type(page, p_type); +	curseg = CURSEG_I(sbi, type); + +	mutex_lock(&curseg->curseg_mutex); + +	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); +	old_cursegno = curseg->segno; + +	/* +	 * __add_sum_entry must be called with curseg_mutex held, +	 * because it updates a summary entry in the +	 * current summary block. +	 */ +	__add_sum_entry(sbi, type, sum, curseg->next_blkoff); + +	mutex_lock(&sit_i->sentry_lock); +	__refresh_next_blkoff(sbi, curseg); +	sbi->block_count[curseg->alloc_type]++; + +	/* +	 * SIT information should be updated before segment allocation, +	 * since SSR needs latest valid block information. +	 */ +	refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + +	if (!__has_curseg_space(sbi, type)) +		sit_i->s_ops->allocate_segment(sbi, type, false); + +	locate_dirty_segment(sbi, old_cursegno); +	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); +	mutex_unlock(&sit_i->sentry_lock); + +	if (p_type == NODE) +		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + +	/* writeout dirty page into bdev */ +	submit_write_page(sbi, page, *new_blkaddr, p_type); + +	mutex_unlock(&curseg->curseg_mutex); +} + +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +{ +	set_page_writeback(page); +	submit_write_page(sbi, page, page->index, META); +} + +void write_node_page(struct f2fs_sb_info *sbi, struct page *page, +		unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) +{ +	struct f2fs_summary sum; +	set_summary(&sum, nid, 0, 0); +	do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); +} + +void write_data_page(struct inode *inode, struct page *page, +		struct dnode_of_data *dn, block_t old_blkaddr, +		block_t *new_blkaddr) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct f2fs_summary sum; +	struct node_info ni; + +	BUG_ON(old_blkaddr == NULL_ADDR); +	get_node_info(sbi, dn->nid, &ni); +	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + +	do_write_page(sbi, page, old_blkaddr, +			new_blkaddr, &sum, DATA); +} + +void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, +					block_t old_blk_addr) +{ +	submit_write_page(sbi, page, old_blk_addr, DATA); +} + +void recover_data_page(struct f2fs_sb_info *sbi, +			struct page *page, struct f2fs_summary *sum, +			block_t old_blkaddr, block_t new_blkaddr) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	struct curseg_info *curseg; +	unsigned int segno, old_cursegno; +	struct seg_entry *se; +	int type; + +	segno = GET_SEGNO(sbi, new_blkaddr); +	se = get_seg_entry(sbi, segno); +	type = se->type; + +	if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { +		if (old_blkaddr == NULL_ADDR) +			type = CURSEG_COLD_DATA; +		else +			type = CURSEG_WARM_DATA; +	} +	
curseg = CURSEG_I(sbi, type); + +	mutex_lock(&curseg->curseg_mutex); +	mutex_lock(&sit_i->sentry_lock); + +	old_cursegno = curseg->segno; + +	/* change the current segment */ +	if (segno != curseg->segno) { +		curseg->next_segno = segno; +		change_curseg(sbi, type, true); +	} + +	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & +					(sbi->blocks_per_seg - 1); +	__add_sum_entry(sbi, type, sum, curseg->next_blkoff); + +	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + +	locate_dirty_segment(sbi, old_cursegno); +	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + +	mutex_unlock(&sit_i->sentry_lock); +	mutex_unlock(&curseg->curseg_mutex); +} + +void rewrite_node_page(struct f2fs_sb_info *sbi, +			struct page *page, struct f2fs_summary *sum, +			block_t old_blkaddr, block_t new_blkaddr) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	int type = CURSEG_WARM_NODE; +	struct curseg_info *curseg; +	unsigned int segno, old_cursegno; +	block_t next_blkaddr = next_blkaddr_of_node(page); +	unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + +	curseg = CURSEG_I(sbi, type); + +	mutex_lock(&curseg->curseg_mutex); +	mutex_lock(&sit_i->sentry_lock); + +	segno = GET_SEGNO(sbi, new_blkaddr); +	old_cursegno = curseg->segno; + +	/* change the current segment */ +	if (segno != curseg->segno) { +		curseg->next_segno = segno; +		change_curseg(sbi, type, true); +	} +	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & +					(sbi->blocks_per_seg - 1); +	__add_sum_entry(sbi, type, sum, curseg->next_blkoff); + +	/* change the current log to the next block addr in advance */ +	if (next_segno != segno) { +		curseg->next_segno = next_segno; +		change_curseg(sbi, type, true); +	} +	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & +					(sbi->blocks_per_seg - 1); + +	/* rewrite node page */ +	set_page_writeback(page); +	submit_write_page(sbi, page, new_blkaddr, NODE); +	f2fs_submit_bio(sbi, NODE, true); +	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + +	locate_dirty_segment(sbi, old_cursegno); +	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + +	mutex_unlock(&sit_i->sentry_lock); +	mutex_unlock(&curseg->curseg_mutex); +} + +static int read_compacted_summaries(struct f2fs_sb_info *sbi) +{ +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	struct curseg_info *seg_i; +	unsigned char *kaddr; +	struct page *page; +	block_t start; +	int i, j, offset; + +	start = start_sum_block(sbi); + +	page = get_meta_page(sbi, start++); +	kaddr = (unsigned char *)page_address(page); + +	/* Step 1: restore nat cache */ +	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); +	memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + +	/* Step 2: restore sit cache */ +	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); +	memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, +						SUM_JOURNAL_SIZE); +	offset = 2 * SUM_JOURNAL_SIZE; + +	/* Step 3: restore summary entries */ +	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { +		unsigned short blk_off; +		unsigned int segno; + +		seg_i = CURSEG_I(sbi, i); +		segno = le32_to_cpu(ckpt->cur_data_segno[i]); +		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); +		seg_i->next_segno = segno; +		reset_curseg(sbi, i, 0); +		seg_i->alloc_type = ckpt->alloc_type[i]; +		seg_i->next_blkoff = blk_off; + +		if (seg_i->alloc_type == SSR) +			blk_off = sbi->blocks_per_seg; + +		for (j = 0; j < blk_off; j++) { +			struct f2fs_summary *s; +			s = (struct f2fs_summary *)(kaddr + offset); +			seg_i->sum_blk->entries[j] = *s; +			offset += SUMMARY_SIZE; +			if 
(offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - +						SUM_FOOTER_SIZE) +				continue; + +			f2fs_put_page(page, 1); +			page = NULL; + +			page = get_meta_page(sbi, start++); +			kaddr = (unsigned char *)page_address(page); +			offset = 0; +		} +	} +	f2fs_put_page(page, 1); +	return 0; +} + +static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) +{ +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	struct f2fs_summary_block *sum; +	struct curseg_info *curseg; +	struct page *new; +	unsigned short blk_off; +	unsigned int segno = 0; +	block_t blk_addr = 0; + +	/* get segment number and block addr */ +	if (IS_DATASEG(type)) { +		segno = le32_to_cpu(ckpt->cur_data_segno[type]); +		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - +							CURSEG_HOT_DATA]); +		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) +			blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); +		else +			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); +	} else { +		segno = le32_to_cpu(ckpt->cur_node_segno[type - +							CURSEG_HOT_NODE]); +		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - +							CURSEG_HOT_NODE]); +		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) +			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, +							type - CURSEG_HOT_NODE); +		else +			blk_addr = GET_SUM_BLOCK(sbi, segno); +	} + +	new = get_meta_page(sbi, blk_addr); +	sum = (struct f2fs_summary_block *)page_address(new); + +	if (IS_NODESEG(type)) { +		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { +			struct f2fs_summary *ns = &sum->entries[0]; +			int i; +			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { +				ns->version = 0; +				ns->ofs_in_node = 0; +			} +		} else { +			if (restore_node_summary(sbi, segno, sum)) { +				f2fs_put_page(new, 1); +				return -EINVAL; +			} +		} +	} + +	/* set uncompleted segment to curseg */ +	curseg = CURSEG_I(sbi, type); +	mutex_lock(&curseg->curseg_mutex); +	memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); +	curseg->next_segno = segno; +	reset_curseg(sbi, type, 0); +	curseg->alloc_type = ckpt->alloc_type[type]; +	curseg->next_blkoff = blk_off; +	mutex_unlock(&curseg->curseg_mutex); +	f2fs_put_page(new, 1); +	return 0; +} + +static int restore_curseg_summaries(struct f2fs_sb_info *sbi) +{ +	int type = CURSEG_HOT_DATA; + +	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { +		/* restore for compacted data summary */ +		if (read_compacted_summaries(sbi)) +			return -EINVAL; +		type = CURSEG_HOT_NODE; +	} + +	for (; type <= CURSEG_COLD_NODE; type++) +		if (read_normal_summaries(sbi, type)) +			return -EINVAL; +	return 0; +} + +static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) +{ +	struct page *page; +	unsigned char *kaddr; +	struct f2fs_summary *summary; +	struct curseg_info *seg_i; +	int written_size = 0; +	int i, j; + +	page = grab_meta_page(sbi, blkaddr++); +	kaddr = (unsigned char *)page_address(page); + +	/* Step 1: write nat cache */ +	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); +	memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); +	written_size += SUM_JOURNAL_SIZE; + +	/* Step 2: write sit cache */ +	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); +	memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, +						SUM_JOURNAL_SIZE); +	written_size += SUM_JOURNAL_SIZE; + +	set_page_dirty(page); + +	/* Step 3: write summary entries */ +	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { +		unsigned short blkoff; +		seg_i = CURSEG_I(sbi, i); +		if (sbi->ckpt->alloc_type[i] == SSR) +			blkoff = sbi->blocks_per_seg; +		else +			blkoff = curseg_blkoff(sbi, i); 
+ +		for (j = 0; j < blkoff; j++) { +			if (!page) { +				page = grab_meta_page(sbi, blkaddr++); +				kaddr = (unsigned char *)page_address(page); +				written_size = 0; +			} +			summary = (struct f2fs_summary *)(kaddr + written_size); +			*summary = seg_i->sum_blk->entries[j]; +			written_size += SUMMARY_SIZE; +			set_page_dirty(page); + +			if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - +							SUM_FOOTER_SIZE) +				continue; + +			f2fs_put_page(page, 1); +			page = NULL; +		} +	} +	if (page) +		f2fs_put_page(page, 1); +} + +static void write_normal_summaries(struct f2fs_sb_info *sbi, +					block_t blkaddr, int type) +{ +	int i, end; +	if (IS_DATASEG(type)) +		end = type + NR_CURSEG_DATA_TYPE; +	else +		end = type + NR_CURSEG_NODE_TYPE; + +	for (i = type; i < end; i++) { +		struct curseg_info *sum = CURSEG_I(sbi, i); +		mutex_lock(&sum->curseg_mutex); +		write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); +		mutex_unlock(&sum->curseg_mutex); +	} +} + +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ +	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) +		write_compacted_summaries(sbi, start_blk); +	else +		write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); +} + +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ +	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) +		write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); +} + +int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +					unsigned int val, int alloc) +{ +	int i; + +	if (type == NAT_JOURNAL) { +		for (i = 0; i < nats_in_cursum(sum); i++) { +			if (le32_to_cpu(nid_in_journal(sum, i)) == val) +				return i; +		} +		if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) +			return update_nats_in_cursum(sum, 1); +	} else if (type == SIT_JOURNAL) { +		for (i = 0; i < sits_in_cursum(sum); i++) +			if (le32_to_cpu(segno_in_journal(sum, i)) == val) +				return i; +		if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) +			return update_sits_in_cursum(sum, 1); +	} +	return -1; +}
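+ +/* + * Hypothetical caller sketch (illustration only, not part of this patch): + * caching a NAT update in the hot data log's journal, where sum is that + * log's summary block and raw_nat is a struct f2fs_nat_entry: + * + *	int i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); + *	if (i >= 0) { + *		nid_in_journal(sum, i) = cpu_to_le32(nid); + *		nat_in_journal(sum, i) = raw_nat; + *	} + * + * A negative return value means the journal is full, so the entry must be + * written to the NAT area instead. + */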
+ +static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, +					unsigned int segno) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); +	block_t blk_addr = sit_i->sit_base_addr + offset; + +	check_seg_range(sbi, segno); + +	/* calculate sit block address */ +	if (f2fs_test_bit(offset, sit_i->sit_bitmap)) +		blk_addr += sit_i->sit_blocks; + +	return get_meta_page(sbi, blk_addr); +} + +static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, +					unsigned int start) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	struct page *src_page, *dst_page; +	pgoff_t src_off, dst_off; +	void *src_addr, *dst_addr; + +	src_off = current_sit_addr(sbi, start); +	dst_off = next_sit_addr(sbi, src_off); + +	/* get current sit block page without lock */ +	src_page = get_meta_page(sbi, src_off); +	dst_page = grab_meta_page(sbi, dst_off); +	BUG_ON(PageDirty(src_page)); + +	src_addr = page_address(src_page); +	dst_addr = page_address(dst_page); +	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + +	set_page_dirty(dst_page); +	f2fs_put_page(src_page, 1); + +	set_to_next_sit(sit_i, start); + +	return dst_page; +} + +static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	int i; + +	/* +	 * If the journal area in the current summary block is full of sit +	 * entries, flush all of them out; otherwise stale entries could not +	 * be replaced with newly hot sit entries. +	 */ +	if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { +		for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { +			unsigned int segno; +			segno = le32_to_cpu(segno_in_journal(sum, i)); +			__mark_sit_entry_dirty(sbi, segno); +		} +		update_sits_in_cursum(sum, -sits_in_cursum(sum)); +		return true; +	} +	return false; +} + +/* + * CP calls this function, which flushes SIT entries including sit_journal, + * and moves prefree segs to free segs. + */ +void flush_sit_entries(struct f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned long *bitmap = sit_i->dirty_sentries_bitmap; +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	unsigned long nsegs = TOTAL_SEGS(sbi); +	struct page *page = NULL; +	struct f2fs_sit_block *raw_sit = NULL; +	unsigned int start = 0, end = 0; +	unsigned int segno = -1; +	bool flushed; + +	mutex_lock(&curseg->curseg_mutex); +	mutex_lock(&sit_i->sentry_lock); + +	/* +	 * "flushed" indicates whether sit entries in journal are flushed +	 * to the SIT area or not. +	 */ +	flushed = flush_sits_in_journal(sbi); + +	while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { +		struct seg_entry *se = get_seg_entry(sbi, segno); +		int sit_offset, offset; + +		sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + +		if (flushed) +			goto to_sit_page; + +		offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); +		if (offset >= 0) { +			segno_in_journal(sum, offset) = cpu_to_le32(segno); +			seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); +			goto flush_done; +		} +to_sit_page: +		if (!page || (start > segno) || (segno > end)) { +			if (page) { +				f2fs_put_page(page, 1); +				page = NULL; +			} + +			start = START_SEGNO(sit_i, segno); +			end = start + SIT_ENTRY_PER_BLOCK - 1; + +			/* read sit block that will be updated */ +			page = get_next_sit_page(sbi, start); +			raw_sit = page_address(page); +		} + +		/* update entry in SIT block */ +		seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); +flush_done: +		__clear_bit(segno, bitmap); +		sit_i->dirty_sentries--; +	} +	mutex_unlock(&sit_i->sentry_lock); +	mutex_unlock(&curseg->curseg_mutex); + +	/* writeout last modified SIT block */ +	f2fs_put_page(page, 1); + +	set_prefree_as_free_segments(sbi); +}
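+ +/* + * Note on the update scheme above (descriptive note only): the SIT area + * holds two copies of every SIT block, and sit_bitmap records which copy + * is live. get_next_sit_page() always writes the standby copy and then + * toggles the bit via set_to_next_sit(), so a checkpoint never overwrites + * the version it may still need to roll back to. + */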
+ +static int build_sit_info(struct f2fs_sb_info *sbi) +{ +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	struct sit_info *sit_i; +	unsigned int sit_segs, start; +	char *src_bitmap, *dst_bitmap; +	unsigned int bitmap_size; + +	/* allocate memory for SIT information */ +	sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); +	if (!sit_i) +		return -ENOMEM; + +	SM_I(sbi)->sit_info = sit_i; + +	sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); +	if (!sit_i->sentries) +		return -ENOMEM; + +	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); +	sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); +	if (!sit_i->dirty_sentries_bitmap) +		return -ENOMEM; + +	for (start = 0; start < TOTAL_SEGS(sbi); start++) { +		sit_i->sentries[start].cur_valid_map +			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); +		sit_i->sentries[start].ckpt_valid_map +			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); +		if (!sit_i->sentries[start].cur_valid_map +				|| !sit_i->sentries[start].ckpt_valid_map) +			return -ENOMEM; +	} + +	if (sbi->segs_per_sec > 1) { +		sit_i->sec_entries = vzalloc(sbi->total_sections * +					sizeof(struct sec_entry)); +		if (!sit_i->sec_entries) +			return -ENOMEM; +	} + +	/* get information related to SIT */ +	sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; + +	/* setup SIT bitmap from checkpoint pack */ +	bitmap_size = __bitmap_size(sbi, SIT_BITMAP); +	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); + +	dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); +	if (!dst_bitmap) +		return -ENOMEM; +	memcpy(dst_bitmap, src_bitmap, bitmap_size); + +	/* init SIT information */ +	sit_i->s_ops = &default_salloc_ops; + +	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); +	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; +	sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); +	sit_i->sit_bitmap = dst_bitmap; +	sit_i->bitmap_size = bitmap_size; +	sit_i->dirty_sentries = 0; +	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; +	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); +	sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; +	mutex_init(&sit_i->sentry_lock); +	return 0; +} + +static int build_free_segmap(struct f2fs_sb_info *sbi) +{ +	struct f2fs_sm_info *sm_info = SM_I(sbi); +	struct free_segmap_info *free_i; +	unsigned int bitmap_size, sec_bitmap_size; + +	/* allocate memory for free segmap information */ +	free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); +	if (!free_i) +		return -ENOMEM; + +	SM_I(sbi)->free_info = free_i; + +	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); +	free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); +	if (!free_i->free_segmap) +		return -ENOMEM; + +	sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); +	free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); +	if (!free_i->free_secmap) +		return -ENOMEM; + +	/* set all segments as dirty temporarily */ +	memset(free_i->free_segmap, 0xff, bitmap_size); +	memset(free_i->free_secmap, 0xff, sec_bitmap_size); + +	/* init free segmap information */ +	free_i->start_segno = +		(unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); +	free_i->free_segments = 0; +	free_i->free_sections = 0; +	rwlock_init(&free_i->segmap_lock); +	return 0; +} + +static int build_curseg(struct f2fs_sb_info *sbi) +{ +	struct curseg_info *array; +	int i; + +	array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); +	if (!array) +		return -ENOMEM; + +	SM_I(sbi)->curseg_array = array; + +	for (i = 0; i < NR_CURSEG_TYPE; i++) { +		mutex_init(&array[i].curseg_mutex); +		array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); +		if (!array[i].sum_blk) +			return -ENOMEM; +		array[i].segno = NULL_SEGNO; +		array[i].next_blkoff = 0; +	} +	return restore_curseg_summaries(sbi); +}
+ +static void build_sit_entries(struct f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); +	struct f2fs_summary_block *sum = curseg->sum_blk; +	unsigned int start; + +	for (start = 0; start < TOTAL_SEGS(sbi); start++) { +		struct seg_entry *se = &sit_i->sentries[start]; +		struct f2fs_sit_block *sit_blk; +		struct f2fs_sit_entry sit; +		struct page *page; +		int i; + +		mutex_lock(&curseg->curseg_mutex); +		for (i = 0; i < sits_in_cursum(sum); i++) { +			if (le32_to_cpu(segno_in_journal(sum, i)) == start) { +				sit = sit_in_journal(sum, i); +				mutex_unlock(&curseg->curseg_mutex); +				goto got_it; +			} +		} +		mutex_unlock(&curseg->curseg_mutex); +		page = get_current_sit_page(sbi, start); +		sit_blk = (struct f2fs_sit_block *)page_address(page); +		sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; +		f2fs_put_page(page, 1); +got_it: +		check_block_count(sbi, start, &sit); +		seg_info_from_raw_sit(se, &sit); +		if (sbi->segs_per_sec > 1) { +			struct sec_entry *e = get_sec_entry(sbi, start); +			e->valid_blocks += se->valid_blocks; +		} +	} +} + +static void init_free_segmap(struct f2fs_sb_info *sbi) +{ +	unsigned int start; +	int type; + +	for (start = 0; start < TOTAL_SEGS(sbi); start++) { +		struct seg_entry *sentry = get_seg_entry(sbi, start); +		if (!sentry->valid_blocks) +			__set_free(sbi, start); +	} + +	/* set the current segments as in-use */ +	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { +		struct curseg_info *curseg_t = CURSEG_I(sbi, type); +		__set_test_and_inuse(sbi, curseg_t->segno); +	} +} + +static void init_dirty_segmap(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int segno = 0, offset = 0; +	unsigned short valid_blocks; + +	while (segno < TOTAL_SEGS(sbi)) { +		/* find dirty segment based on free segmap */ +		segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); +		if (segno >= TOTAL_SEGS(sbi)) +			break; +		offset = segno + 1; +		valid_blocks = get_valid_blocks(sbi, segno, 0); +		if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) +			continue; +		mutex_lock(&dirty_i->seglist_lock); +		__locate_dirty_segment(sbi, segno, DIRTY); +		mutex_unlock(&dirty_i->seglist_lock); +	} +} + +static int init_victim_segmap(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + +	dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); +	dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); +	if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) +		return -ENOMEM; +	return 0; +} + +static int build_dirty_segmap(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i; +	unsigned int bitmap_size, i; + +	/* allocate memory for dirty segments list information */ +	dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); +	if (!dirty_i) +		return -ENOMEM; + +	SM_I(sbi)->dirty_info = dirty_i; +	mutex_init(&dirty_i->seglist_lock); + +	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + +	for (i = 0; i < NR_DIRTY_TYPE; i++) { +		dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); +		if (!dirty_i->dirty_segmap[i]) +			return -ENOMEM; +	} + +	init_dirty_segmap(sbi); +	return init_victim_segmap(sbi); +} + +/* + * Update min, max modified time for cost-benefit GC algorithm + */ +static void init_min_max_mtime(struct f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned int segno; + +	mutex_lock(&sit_i->sentry_lock); + +	sit_i->min_mtime = LLONG_MAX; + +	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { +		unsigned int i; +		unsigned long long mtime = 0; + +		for (i = 0; i < sbi->segs_per_sec; i++) +			mtime += get_seg_entry(sbi, segno + i)->mtime; + +		mtime = div_u64(mtime, sbi->segs_per_sec); + +		if (sit_i->min_mtime > mtime) +			sit_i->min_mtime = mtime; +	} +	sit_i->max_mtime = get_mtime(sbi); +	mutex_unlock(&sit_i->sentry_lock); +} + +int build_segment_manager(struct f2fs_sb_info *sbi) +{ +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); +	struct f2fs_sm_info *sm_info; +	int err; + +	sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); +	if (!sm_info) +		return 
-ENOMEM; + +	/* init sm info */ +	sbi->sm_info = sm_info; +	INIT_LIST_HEAD(&sm_info->wblist_head); +	spin_lock_init(&sm_info->wblist_lock); +	sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); +	sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); +	sm_info->segment_count = le32_to_cpu(raw_super->segment_count); +	sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); +	sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); +	sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); +	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + +	err = build_sit_info(sbi); +	if (err) +		return err; +	err = build_free_segmap(sbi); +	if (err) +		return err; +	err = build_curseg(sbi); +	if (err) +		return err; + +	/* reinit free segmap based on SIT */ +	build_sit_entries(sbi); + +	init_free_segmap(sbi); +	err = build_dirty_segmap(sbi); +	if (err) +		return err; + +	init_min_max_mtime(sbi); +	return 0; +} + +static void discard_dirty_segmap(struct f2fs_sb_info *sbi, +		enum dirty_type dirty_type) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + +	mutex_lock(&dirty_i->seglist_lock); +	kfree(dirty_i->dirty_segmap[dirty_type]); +	dirty_i->nr_dirty[dirty_type] = 0; +	mutex_unlock(&dirty_i->seglist_lock); +} + +void reset_victim_segmap(struct f2fs_sb_info *sbi) +{ +	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); +	memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); +} + +static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + +	kfree(dirty_i->victim_segmap[FG_GC]); +	kfree(dirty_i->victim_segmap[BG_GC]); +} + +static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) +{ +	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); +	int i; + +	if (!dirty_i) +		return; + +	/* discard pre-free/dirty segments list */ +	for (i = 0; i < NR_DIRTY_TYPE; i++) +		discard_dirty_segmap(sbi, i); + +	destroy_victim_segmap(sbi); +	SM_I(sbi)->dirty_info = NULL; +	kfree(dirty_i); +} + +static void destroy_curseg(struct f2fs_sb_info *sbi) +{ +	struct curseg_info *array = SM_I(sbi)->curseg_array; +	int i; + +	if (!array) +		return; +	SM_I(sbi)->curseg_array = NULL; +	for (i = 0; i < NR_CURSEG_TYPE; i++) +		kfree(array[i].sum_blk); +	kfree(array); +} + +static void destroy_free_segmap(struct f2fs_sb_info *sbi) +{ +	struct free_segmap_info *free_i = SM_I(sbi)->free_info; +	if (!free_i) +		return; +	SM_I(sbi)->free_info = NULL; +	kfree(free_i->free_segmap); +	kfree(free_i->free_secmap); +	kfree(free_i); +} + +static void destroy_sit_info(struct f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned int start; + +	if (!sit_i) +		return; + +	if (sit_i->sentries) { +		for (start = 0; start < TOTAL_SEGS(sbi); start++) { +			kfree(sit_i->sentries[start].cur_valid_map); +			kfree(sit_i->sentries[start].ckpt_valid_map); +		} +	} +	vfree(sit_i->sentries); +	vfree(sit_i->sec_entries); +	kfree(sit_i->dirty_sentries_bitmap); + +	SM_I(sbi)->sit_info = NULL; +	kfree(sit_i->sit_bitmap); +	kfree(sit_i); +} + +void destroy_segment_manager(struct f2fs_sb_info *sbi) +{ +	struct f2fs_sm_info *sm_info = SM_I(sbi); +	destroy_dirty_segmap(sbi); +	destroy_curseg(sbi); +	destroy_free_segmap(sbi); +	destroy_sit_info(sbi); +	sbi->sm_info = NULL; +	kfree(sm_info); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h new file mode 100644 index 00000000000..552dadbb232 --- /dev/null +++ b/fs/f2fs/segment.h @@ -0,0 +1,618 @@ +/* + * fs/f2fs/segment.h + * + * Copyright (c) 2012 Samsung 
Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* constant macro */ +#define NULL_SEGNO			((unsigned int)(~0)) + +/* V: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno)	(segno - free_i->start_segno) +#define GET_R2L_SEGNO(free_i, segno)	(segno + free_i->start_segno) + +#define IS_DATASEG(t)							\ +	((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) ||		\ +	(t == CURSEG_WARM_DATA)) + +#define IS_NODESEG(t)							\ +	((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) ||		\ +	(t == CURSEG_WARM_NODE)) + +#define IS_CURSEG(sbi, segno)						\ +	((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||	\ +	 (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||	\ +	 (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||	\ +	 (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||	\ +	 (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||	\ +	 (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + +#define IS_CURSEC(sbi, secno)						\ +	((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /		\ +	  sbi->segs_per_sec) ||	\ +	 (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /		\ +	  sbi->segs_per_sec) ||	\ +	 (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /		\ +	  sbi->segs_per_sec) ||	\ +	 (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /		\ +	  sbi->segs_per_sec) ||	\ +	 (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /		\ +	  sbi->segs_per_sec) ||	\ +	 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /		\ +	  sbi->segs_per_sec)) + +#define START_BLOCK(sbi, segno)						\ +	(SM_I(sbi)->seg0_blkaddr +					\ +	 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) +#define NEXT_FREE_BLKADDR(sbi, curseg)					\ +	(START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + +#define MAIN_BASE_BLOCK(sbi)	(SM_I(sbi)->main_blkaddr) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)				\ +	((blk_addr) - SM_I(sbi)->seg0_blkaddr) +#define GET_SEGNO_FROM_SEG0(sbi, blk_addr)				\ +	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_SEGNO(sbi, blk_addr)					\ +	(((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ?		\ +	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),			\ +		GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#define GET_SECNO(sbi, segno)					\ +	((segno) / sbi->segs_per_sec) +#define GET_ZONENO_FROM_SEGNO(sbi, segno)				\ +	((segno / sbi->segs_per_sec) / sbi->secs_per_zone) + +#define GET_SUM_BLOCK(sbi, segno)				\ +	((sbi->sm_info->ssa_blkaddr) + segno) + +#define GET_SUM_TYPE(footer) ((footer)->entry_type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) + +#define SIT_ENTRY_OFFSET(sit_i, segno)					\ +	(segno % sit_i->sents_per_block) +#define SIT_BLOCK_OFFSET(sit_i, segno)					\ +	(segno / SIT_ENTRY_PER_BLOCK) +#define	START_SEGNO(sit_i, segno)		\ +	(SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define f2fs_bitmap_size(nr)			\ +	(BITS_TO_LONGS(nr) * sizeof(unsigned long)) +#define TOTAL_SEGS(sbi)	(SM_I(sbi)->main_segments) + +#define SECTOR_FROM_BLOCK(sbi, blk_addr)				\ +	(blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + +/* during checkpoint, bio_private is used to synchronize the last bio */ +struct bio_private { +	struct f2fs_sb_info *sbi; +	bool is_sync; +	void *wait; +};
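+ +/* + * Worked example for the address macros above (illustrative values only): + * with seg0_blkaddr == 512 and log_blocks_per_seg == 9 (512 blocks per + * segment), block address 2048 yields GET_SEGOFF_FROM_SEG0 == 1536 and + * GET_SEGNO_FROM_SEG0 == 3; GET_SEGNO() then rebases this to a main-area + * segment number with GET_L2R_SEGNO(). + */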
+ +/* + * indicate a block allocation direction: RIGHT and LEFT. + * RIGHT means allocating new sections towards the end of volume. + * LEFT means the opposite direction. + */ +enum { +	ALLOC_RIGHT = 0, +	ALLOC_LEFT +}; + +/* + * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * LFS writes data sequentially with cleaning operations. + * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + */ +enum { +	LFS = 0, +	SSR +}; + +/* + * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * GC_CB is based on cost-benefit algorithm. + * GC_GREEDY is based on greedy algorithm. + */ +enum { +	GC_CB = 0, +	GC_GREEDY +}; + +/* + * BG_GC means the background cleaning job. + * FG_GC means the on-demand cleaning job. + */ +enum { +	BG_GC = 0, +	FG_GC +}; + +/* for a function parameter to select a victim segment */ +struct victim_sel_policy { +	int alloc_mode;			/* LFS or SSR */ +	int gc_mode;			/* GC_CB or GC_GREEDY */ +	unsigned long *dirty_segmap;	/* dirty segment bitmap */ +	unsigned int offset;		/* last scanned bitmap offset */ +	unsigned int ofs_unit;		/* bitmap search unit */ +	unsigned int min_cost;		/* minimum cost */ +	unsigned int min_segno;		/* segment # having min. cost */ +}; + +struct seg_entry { +	unsigned short valid_blocks;	/* # of valid blocks */ +	unsigned char *cur_valid_map;	/* validity bitmap of blocks */ +	/* +	 * # of valid blocks and the validity bitmap stored in the last +	 * checkpoint pack. This information is used by the SSR mode. +	 */ +	unsigned short ckpt_valid_blocks; +	unsigned char *ckpt_valid_map; +	unsigned char type;		/* segment type like CURSEG_XXX_TYPE */ +	unsigned long long mtime;	/* modification time of the segment */ +}; + +struct sec_entry { +	unsigned int valid_blocks;	/* # of valid blocks in a section */ +}; + +struct segment_allocation { +	void (*allocate_segment)(struct f2fs_sb_info *, int, bool); +}; + +struct sit_info { +	const struct segment_allocation *s_ops; + +	block_t sit_base_addr;		/* start block address of SIT area */ +	block_t sit_blocks;		/* # of blocks used by SIT area */ +	block_t written_valid_blocks;	/* # of valid blocks in main area */ +	char *sit_bitmap;		/* SIT bitmap pointer */ +	unsigned int bitmap_size;	/* SIT bitmap size */ + +	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */ +	unsigned int dirty_sentries;		/* # of dirty sentries */ +	unsigned int sents_per_block;		/* # of SIT entries per block */ +	struct mutex sentry_lock;		/* to protect SIT cache */ +	struct seg_entry *sentries;		/* SIT segment-level cache */ +	struct sec_entry *sec_entries;		/* SIT section-level cache */ + +	/* for cost-benefit algorithm in cleaning procedure */ +	unsigned long long elapsed_time;	/* elapsed time after mount */ +	unsigned long long mounted_time;	/* mount time */ +	unsigned long long min_mtime;		/* min. modification time */ +	unsigned long long max_mtime;		/* max. modification time */ +};
+ +struct free_segmap_info { +	unsigned int start_segno;	/* start segment number logically */ +	unsigned int free_segments;	/* # of free segments */ +	unsigned int free_sections;	/* # of free sections */ +	rwlock_t segmap_lock;		/* free segmap lock */ +	unsigned long *free_segmap;	/* free segment bitmap */ +	unsigned long *free_secmap;	/* free section bitmap */ +}; + +/* Notice: the order of dirty types matches the order of CURSEG_XXX in f2fs.h */ +enum dirty_type { +	DIRTY_HOT_DATA,		/* dirty segments assigned as hot data logs */ +	DIRTY_WARM_DATA,	/* dirty segments assigned as warm data logs */ +	DIRTY_COLD_DATA,	/* dirty segments assigned as cold data logs */ +	DIRTY_HOT_NODE,		/* dirty segments assigned as hot node logs */ +	DIRTY_WARM_NODE,	/* dirty segments assigned as warm node logs */ +	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */ +	DIRTY,			/* to count # of dirty segments */ +	PRE,			/* to count # of entirely obsolete segments */ +	NR_DIRTY_TYPE +}; + +struct dirty_seglist_info { +	const struct victim_selection *v_ops;	/* victim selection operation */ +	unsigned long *dirty_segmap[NR_DIRTY_TYPE]; +	struct mutex seglist_lock;		/* lock for segment bitmaps */ +	int nr_dirty[NR_DIRTY_TYPE];		/* # of dirty segments */ +	unsigned long *victim_segmap[2];	/* BG_GC, FG_GC */ +}; + +/* victim selection function for cleaning and SSR */ +struct victim_selection { +	int (*get_victim)(struct f2fs_sb_info *, unsigned int *, +							int, int, char); +}; + +/* for active log information */ +struct curseg_info { +	struct mutex curseg_mutex;		/* lock for consistency */ +	struct f2fs_summary_block *sum_blk;	/* cached summary block */ +	unsigned char alloc_type;		/* current allocation type */ +	unsigned int segno;			/* current segment number */ +	unsigned short next_blkoff;		/* next block offset to write */ +	unsigned int zone;			/* current zone number */ +	unsigned int next_segno;		/* preallocated segment */ +}; + +/* + * inline functions + */ +static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) +{ +	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); +} + +static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, +						unsigned int segno) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	return &sit_i->sentries[segno]; +} + +static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, +						unsigned int segno) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; +} + +static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, +				unsigned int segno, int section) +{ +	/* +	 * In order to get # of valid blocks in a section instantly from many +	 * segments, f2fs manages two counting structures separately. 
+	 */ +	if (section > 1) +		return get_sec_entry(sbi, segno)->valid_blocks; +	else +		return get_seg_entry(sbi, segno)->valid_blocks; +} + +static inline void seg_info_from_raw_sit(struct seg_entry *se, +					struct f2fs_sit_entry *rs) +{ +	se->valid_blocks = GET_SIT_VBLOCKS(rs); +	se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); +	memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +	se->type = GET_SIT_TYPE(rs); +	se->mtime = le64_to_cpu(rs->mtime); +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, +					struct f2fs_sit_entry *rs) +{ +	unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | +					se->valid_blocks; +	rs->vblocks = cpu_to_le16(raw_vblocks); +	memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); +	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +	se->ckpt_valid_blocks = se->valid_blocks; +	rs->mtime = cpu_to_le64(se->mtime); +} + +static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, +		unsigned int max, unsigned int segno) +{ +	unsigned int ret; +	read_lock(&free_i->segmap_lock); +	ret = find_next_bit(free_i->free_segmap, max, segno); +	read_unlock(&free_i->segmap_lock); +	return ret; +} + +static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int secno = segno / sbi->segs_per_sec; +	unsigned int start_segno = secno * sbi->segs_per_sec; +	unsigned int next; + +	write_lock(&free_i->segmap_lock); +	clear_bit(segno, free_i->free_segmap); +	free_i->free_segments++; + +	next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); +	if (next >= start_segno + sbi->segs_per_sec) { +		clear_bit(secno, free_i->free_secmap); +		free_i->free_sections++; +	} +	write_unlock(&free_i->segmap_lock); +} + +static inline void __set_inuse(struct f2fs_sb_info *sbi, +		unsigned int segno) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int secno = segno / sbi->segs_per_sec; +	set_bit(segno, free_i->free_segmap); +	free_i->free_segments--; +	if (!test_and_set_bit(secno, free_i->free_secmap)) +		free_i->free_sections--; +} + +static inline void __set_test_and_free(struct f2fs_sb_info *sbi, +		unsigned int segno) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int secno = segno / sbi->segs_per_sec; +	unsigned int start_segno = secno * sbi->segs_per_sec; +	unsigned int next; + +	write_lock(&free_i->segmap_lock); +	if (test_and_clear_bit(segno, free_i->free_segmap)) { +		free_i->free_segments++; + +		next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), +								start_segno); +		if (next >= start_segno + sbi->segs_per_sec) { +			if (test_and_clear_bit(secno, free_i->free_secmap)) +				free_i->free_sections++; +		} +	} +	write_unlock(&free_i->segmap_lock); +} + +static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, +		unsigned int segno) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int secno = segno / sbi->segs_per_sec; +	write_lock(&free_i->segmap_lock); +	if (!test_and_set_bit(segno, free_i->free_segmap)) { +		free_i->free_segments--; +		if (!test_and_set_bit(secno, free_i->free_secmap)) +			free_i->free_sections--; +	} +	write_unlock(&free_i->segmap_lock); +} + +static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, +		void *dst_addr) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); +} + +static inline block_t written_block_count(struct 
f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	block_t vblocks; + +	mutex_lock(&sit_i->sentry_lock); +	vblocks = sit_i->written_valid_blocks; +	mutex_unlock(&sit_i->sentry_lock); + +	return vblocks; +} + +static inline unsigned int free_segments(struct f2fs_sb_info *sbi) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int free_segs; + +	read_lock(&free_i->segmap_lock); +	free_segs = free_i->free_segments; +	read_unlock(&free_i->segmap_lock); + +	return free_segs; +} + +static inline int reserved_segments(struct f2fs_sb_info *sbi) +{ +	return SM_I(sbi)->reserved_segments; +} + +static inline unsigned int free_sections(struct f2fs_sb_info *sbi) +{ +	struct free_segmap_info *free_i = FREE_I(sbi); +	unsigned int free_secs; + +	read_lock(&free_i->segmap_lock); +	free_secs = free_i->free_sections; +	read_unlock(&free_i->segmap_lock); + +	return free_secs; +} + +static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) +{ +	return DIRTY_I(sbi)->nr_dirty[PRE]; +} + +static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) +{ +	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + +		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + +		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + +		DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + +		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + +		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; +} + +static inline int overprovision_segments(struct f2fs_sb_info *sbi) +{ +	return SM_I(sbi)->ovp_segments; +} + +static inline int overprovision_sections(struct f2fs_sb_info *sbi) +{ +	return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; +} + +static inline int reserved_sections(struct f2fs_sb_info *sbi) +{ +	return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; +} + +static inline bool need_SSR(struct f2fs_sb_info *sbi) +{ +	return (free_sections(sbi) < overprovision_sections(sbi)); +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +{ +	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); +	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + +	if (sbi->por_doing) +		return false; + +	return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + +						reserved_sections(sbi))); +} + +static inline int utilization(struct f2fs_sb_info *sbi) +{ +	return (long int)valid_user_blocks(sbi) * 100 / +			(long int)sbi->user_block_count; +}
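+ +/* + * Worked example (illustrative numbers only): with user_block_count == + * 1,000,000 and 400,000 valid user blocks, utilization() returns 40. + * Since MIN_IPU_UTIL below is 100 and utilization() can never exceed 100, + * need_inplace_update() is effectively disabled. + */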
+ +/* + * Sometimes it is better for f2fs to drop its out-of-place update policy. + * If the fs utilization is over MIN_IPU_UTIL, f2fs tries to write data in + * the original place, as other traditional file systems do. + * It is currently set to 100 percent, which means in-place updates are + * disabled. See need_inplace_update() below. + */ +#define MIN_IPU_UTIL		100 +static inline bool need_inplace_update(struct inode *inode) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	if (S_ISDIR(inode->i_mode)) +		return false; +	if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) +		return true; +	return false; +} + +static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, +		int type) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	return curseg->segno; +} + +static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, +		int type) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	return curseg->alloc_type; +} + +static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) +{ +	struct curseg_info *curseg = CURSEG_I(sbi, type); +	return curseg->next_blkoff; +} + +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ +	unsigned int end_segno = SM_I(sbi)->segment_count - 1; +	BUG_ON(segno > end_segno); +} + +/* + * This function is used only for debugging. + * NOTE: it should be removed in the future. + */ +static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +{ +	struct f2fs_sm_info *sm_info = SM_I(sbi); +	block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; +	block_t start_addr = sm_info->seg0_blkaddr; +	block_t end_addr = start_addr + total_blks - 1; +	BUG_ON(blk_addr < start_addr); +	BUG_ON(blk_addr > end_addr); +} + +/* + * A summary block is always treated as an invalid block. + */ +static inline void check_block_count(struct f2fs_sb_info *sbi, +		int segno, struct f2fs_sit_entry *raw_sit) +{ +	struct f2fs_sm_info *sm_info = SM_I(sbi); +	unsigned int end_segno = sm_info->segment_count - 1; +	int valid_blocks = 0; +	int i; + +	/* check segment usage */ +	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); + +	/* check boundary of a given segment number */ +	BUG_ON(segno > end_segno); + +	/* check bitmap with valid block count */ +	for (i = 0; i < sbi->blocks_per_seg; i++) +		if (f2fs_test_bit(i, raw_sit->valid_map)) +			valid_blocks++; +	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); +} + +static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, +						unsigned int start) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); +	block_t blk_addr = sit_i->sit_base_addr + offset; + +	check_seg_range(sbi, start); + +	/* calculate sit block address */ +	if (f2fs_test_bit(offset, sit_i->sit_bitmap)) +		blk_addr += sit_i->sit_blocks; + +	return blk_addr; +} + +static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, +						pgoff_t block_addr) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	block_addr -= sit_i->sit_base_addr; +	if (block_addr < sit_i->sit_blocks) +		block_addr += sit_i->sit_blocks; +	else +		block_addr -= sit_i->sit_blocks; + +	return block_addr + sit_i->sit_base_addr; +} + +static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) +{ +	unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); + +	if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) +		f2fs_clear_bit(block_off, sit_i->sit_bitmap); +	else +		f2fs_set_bit(block_off, sit_i->sit_bitmap); +} + +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +{ +	struct sit_info *sit_i = SIT_I(sbi); +	return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - +						sit_i->mounted_time; +} + +static inline void set_summary(struct f2fs_summary *sum, nid_t nid, +			unsigned int ofs_in_node, unsigned char version) +{ +	sum->nid = 
cpu_to_le32(nid); +	sum->ofs_in_node = cpu_to_le16(ofs_in_node); +	sum->version = version; +} + +static inline block_t start_sum_block(struct f2fs_sb_info *sbi) +{ +	return __start_cp_addr(sbi) + +		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) +{ +	return __start_cp_addr(sbi) + +		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) +				- (base + 1) + type; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c new file mode 100644 index 00000000000..8c117649a03 --- /dev/null +++ b/fs/f2fs/super.c @@ -0,0 +1,749 @@ +/* + * fs/f2fs/super.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/statfs.h> +#include <linux/proc_fs.h> +#include <linux/buffer_head.h> +#include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/random.h> +#include <linux/exportfs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include "xattr.h" + +static struct kmem_cache *f2fs_inode_cachep; + +enum { +	Opt_gc_background_off, +	Opt_disable_roll_forward, +	Opt_discard, +	Opt_noheap, +	Opt_nouser_xattr, +	Opt_noacl, +	Opt_active_logs, +	Opt_disable_ext_identify, +	Opt_err, +}; + +static match_table_t f2fs_tokens = { +	{Opt_gc_background_off, "background_gc_off"}, +	{Opt_disable_roll_forward, "disable_roll_forward"}, +	{Opt_discard, "discard"}, +	{Opt_noheap, "no_heap"}, +	{Opt_nouser_xattr, "nouser_xattr"}, +	{Opt_noacl, "noacl"}, +	{Opt_active_logs, "active_logs=%u"}, +	{Opt_disable_ext_identify, "disable_ext_identify"}, +	{Opt_err, NULL}, +}; + +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
+{ +	struct va_format vaf; +	va_list args; + +	va_start(args, fmt); +	vaf.fmt = fmt; +	vaf.va = &args; +	printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); +	va_end(args); +} + +static void init_once(void *foo) +{ +	struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; + +	inode_init_once(&fi->vfs_inode); +} + +static struct inode *f2fs_alloc_inode(struct super_block *sb) +{ +	struct f2fs_inode_info *fi; + +	fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); +	if (!fi) +		return NULL; + +	init_once((void *) fi); + +	/* Initialize f2fs-specific inode info */ +	fi->vfs_inode.i_version = 1; +	atomic_set(&fi->dirty_dents, 0); +	fi->i_current_depth = 1; +	fi->i_advise = 0; +	rwlock_init(&fi->ext.ext_lock); + +	set_inode_flag(fi, FI_NEW_INODE); + +	return &fi->vfs_inode; +} + +static void f2fs_i_callback(struct rcu_head *head) +{ +	struct inode *inode = container_of(head, struct inode, i_rcu); +	kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); +} + +static void f2fs_destroy_inode(struct inode *inode) +{ +	call_rcu(&inode->i_rcu, f2fs_i_callback); +} + +static void f2fs_put_super(struct super_block *sb) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(sb); + +	f2fs_destroy_stats(sbi); +	stop_gc_thread(sbi); + +	write_checkpoint(sbi, true); + +	iput(sbi->node_inode); +	iput(sbi->meta_inode); + +	/* destroy f2fs internal modules */ +	destroy_node_manager(sbi); +	destroy_segment_manager(sbi); + +	kfree(sbi->ckpt); + +	sb->s_fs_info = NULL; +	brelse(sbi->raw_super_buf); +	kfree(sbi); +} + +int f2fs_sync_fs(struct super_block *sb, int sync) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(sb); + +	if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) +		return 0; + +	if (sync) +		write_checkpoint(sbi, false); +	else +		f2fs_balance_fs(sbi); + +	return 0; +} + +static int f2fs_freeze(struct super_block *sb) +{ +	int err; + +	if (sb->s_flags & MS_RDONLY) +		return 0; + +	err = f2fs_sync_fs(sb, 1); +	return err; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ +	return 0; +} + +static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ +	struct super_block *sb = dentry->d_sb; +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	u64 id = huge_encode_dev(sb->s_bdev->bd_dev); +	block_t total_count, user_block_count, start_count, ovp_count; + +	total_count = le64_to_cpu(sbi->raw_super->block_count); +	user_block_count = sbi->user_block_count; +	start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); +	ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; +	buf->f_type = F2FS_SUPER_MAGIC; +	buf->f_bsize = sbi->blocksize; + +	buf->f_blocks = total_count - start_count; +	buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; +	buf->f_bavail = user_block_count - valid_user_blocks(sbi); + +	buf->f_files = sbi->total_node_count; +	buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); + +	buf->f_namelen = F2FS_MAX_NAME_LEN; +	buf->f_fsid.val[0] = (u32)id; +	buf->f_fsid.val[1] = (u32)(id >> 32); + +	return 0; +} + +static int f2fs_show_options(struct seq_file *seq, struct dentry *root) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); + +	if (test_opt(sbi, BG_GC)) +		seq_puts(seq, ",background_gc_on"); +	else +		seq_puts(seq, ",background_gc_off"); +	if (test_opt(sbi, DISABLE_ROLL_FORWARD)) +		seq_puts(seq, ",disable_roll_forward"); +	if (test_opt(sbi, DISCARD)) +		seq_puts(seq, ",discard"); +	if (test_opt(sbi, NOHEAP)) +		seq_puts(seq, ",no_heap_alloc"); +#ifdef CONFIG_F2FS_FS_XATTR +	if (test_opt(sbi, XATTR_USER)) +		seq_puts(seq, ",user_xattr"); +	else
+		seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL +	if (test_opt(sbi, POSIX_ACL)) +		seq_puts(seq, ",acl"); +	else +		seq_puts(seq, ",noacl"); +#endif +	if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) +		seq_puts(seq, ",disable_ext_identify"); + +	seq_printf(seq, ",active_logs=%u", sbi->active_logs); + +	return 0; +} + +static struct super_operations f2fs_sops = { +	.alloc_inode	= f2fs_alloc_inode, +	.destroy_inode	= f2fs_destroy_inode, +	.write_inode	= f2fs_write_inode, +	.show_options	= f2fs_show_options, +	.evict_inode	= f2fs_evict_inode, +	.put_super	= f2fs_put_super, +	.sync_fs	= f2fs_sync_fs, +	.freeze_fs	= f2fs_freeze, +	.unfreeze_fs	= f2fs_unfreeze, +	.statfs		= f2fs_statfs, +}; + +static struct inode *f2fs_nfs_get_inode(struct super_block *sb, +		u64 ino, u32 generation) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(sb); +	struct inode *inode; + +	if (ino < F2FS_ROOT_INO(sbi)) +		return ERR_PTR(-ESTALE); + +	/* +	 * f2fs_iget isn't quite right if the inode is currently unallocated! +	 * However f2fs_iget currently does appropriate checks to handle stale +	 * inodes so everything is OK. +	 */ +	inode = f2fs_iget(sb, ino); +	if (IS_ERR(inode)) +		return ERR_CAST(inode); +	if (generation && inode->i_generation != generation) { +		/* we didn't find the right inode.. */ +		iput(inode); +		return ERR_PTR(-ESTALE); +	} +	return inode; +} + +static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid, +		int fh_len, int fh_type) +{ +	return generic_fh_to_dentry(sb, fid, fh_len, fh_type, +				    f2fs_nfs_get_inode); +} + +static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, +		int fh_len, int fh_type) +{ +	return generic_fh_to_parent(sb, fid, fh_len, fh_type, +				    f2fs_nfs_get_inode); +} + +static const struct export_operations f2fs_export_ops = { +	.fh_to_dentry = f2fs_fh_to_dentry, +	.fh_to_parent = f2fs_fh_to_parent, +	.get_parent = f2fs_get_parent, +}; + +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, +				char *options) +{ +	substring_t args[MAX_OPT_ARGS]; +	char *p; +	int arg = 0; + +	if (!options) +		return 0; + +	while ((p = strsep(&options, ",")) != NULL) { +		int token; +		if (!*p) +			continue; +		/* +		 * Initialize args struct so we know whether arg was +		 * found; some options take optional arguments. 
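The loop above is the stock kernel pattern for mount-option parsing: strsep() peels one comma-separated token off the option string, match_token() maps it onto the Opt_* table, and args[] carries any "%u"-style argument such as active_logs=%u into match_int(). A minimal standalone sketch of the same walk, with a hand-rolled table and plain sscanf() standing in for the linux/parser.h helpers (the table contents here are illustrative only, not the kernel API):

#include <stdio.h>
#include <string.h>

enum { OPT_DISCARD, OPT_ACTIVE_LOGS, OPT_ERR };

struct token { int id; const char *pattern; };

static const struct token tokens[] = {
	{ OPT_DISCARD,     "discard" },
	{ OPT_ACTIVE_LOGS, "active_logs=%u" },
	{ OPT_ERR,         NULL },
};

/* Map one token to an id; patterns with '=' capture a numeric argument. */
static int match(const char *p, unsigned int *arg)
{
	const struct token *t;

	for (t = tokens; t->pattern; t++) {
		if (strchr(t->pattern, '=')) {
			if (sscanf(p, t->pattern, arg) == 1)
				return t->id;
		} else if (!strcmp(p, t->pattern)) {
			return t->id;
		}
	}
	return OPT_ERR;
}

int main(void)
{
	char opts[] = "discard,,active_logs=6";	/* empty slots are skipped */
	char *s = opts, *p;
	unsigned int arg = 0;

	while ((p = strsep(&s, ",")) != NULL) {
		if (!*p)
			continue;	/* mirrors the kernel loop's empty-token check */
		printf("token %d (arg %u)\n", match(p, &arg), arg);
	}
	return 0;
}

Each recognized token then drives a switch like the one that follows, setting or clearing the corresponding option bit in sbi.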
+		 */ +		args[0].to = args[0].from = NULL; +		token = match_token(p, f2fs_tokens, args); + +		switch (token) { +		case Opt_gc_background_off: +			clear_opt(sbi, BG_GC); +			break; +		case Opt_disable_roll_forward: +			set_opt(sbi, DISABLE_ROLL_FORWARD); +			break; +		case Opt_discard: +			set_opt(sbi, DISCARD); +			break; +		case Opt_noheap: +			set_opt(sbi, NOHEAP); +			break; +#ifdef CONFIG_F2FS_FS_XATTR +		case Opt_nouser_xattr: +			clear_opt(sbi, XATTR_USER); +			break; +#else +		case Opt_nouser_xattr: +			f2fs_msg(sb, KERN_INFO, +				"nouser_xattr options not supported"); +			break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL +		case Opt_noacl: +			clear_opt(sbi, POSIX_ACL); +			break; +#else +		case Opt_noacl: +			f2fs_msg(sb, KERN_INFO, "noacl options not supported"); +			break; +#endif +		case Opt_active_logs: +			if (args->from && match_int(args, &arg)) +				return -EINVAL; +			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) +				return -EINVAL; +			sbi->active_logs = arg; +			break; +		case Opt_disable_ext_identify: +			set_opt(sbi, DISABLE_EXT_IDENTIFY); +			break; +		default: +			f2fs_msg(sb, KERN_ERR, +				"Unrecognized mount option \"%s\" or missing value", +				p); +			return -EINVAL; +		} +	} +	return 0; +} + +static loff_t max_file_size(unsigned bits) +{ +	loff_t result = ADDRS_PER_INODE; +	loff_t leaf_count = ADDRS_PER_BLOCK; + +	/* two direct node blocks */ +	result += (leaf_count * 2); + +	/* two indirect node blocks */ +	leaf_count *= NIDS_PER_BLOCK; +	result += (leaf_count * 2); + +	/* one double indirect node block */ +	leaf_count *= NIDS_PER_BLOCK; +	result += leaf_count; + +	result <<= bits; +	return result; +} + +static int sanity_check_raw_super(struct super_block *sb, +			struct f2fs_super_block *raw_super) +{ +	unsigned int blocksize; + +	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { +		f2fs_msg(sb, KERN_INFO, +			"Magic Mismatch, valid(0x%x) - read(0x%x)", +			F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); +		return 1; +	} + +	/* Currently, support only 4KB page cache size */ +	if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { +		f2fs_msg(sb, KERN_INFO, +			"Invalid page_cache_size (%lu), supports only 4KB\n", +			PAGE_CACHE_SIZE); +		return 1; +	} + +	/* Currently, support only 4KB block size */ +	blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); +	if (blocksize != F2FS_BLKSIZE) { +		f2fs_msg(sb, KERN_INFO, +			"Invalid blocksize (%u), supports only 4KB\n", +			blocksize); +		return 1; +	} + +	if (le32_to_cpu(raw_super->log_sectorsize) != +					F2FS_LOG_SECTOR_SIZE) { +		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); +		return 1; +	} +	if (le32_to_cpu(raw_super->log_sectors_per_block) != +					F2FS_LOG_SECTORS_PER_BLOCK) { +		f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); +		return 1; +	} +	return 0; +} + +static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +{ +	unsigned int total, fsmeta; +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + +	total = le32_to_cpu(raw_super->segment_count); +	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); +	fsmeta += le32_to_cpu(raw_super->segment_count_sit); +	fsmeta += le32_to_cpu(raw_super->segment_count_nat); +	fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); +	fsmeta += le32_to_cpu(raw_super->segment_count_ssa); + +	if (fsmeta >= total) +		return 1; + +	if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { +		f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); +		return 1; +	} +	return 0; +} + +static void init_sb_info(struct 
f2fs_sb_info *sbi) +{ +	struct f2fs_super_block *raw_super = sbi->raw_super; +	int i; + +	sbi->log_sectors_per_block = +		le32_to_cpu(raw_super->log_sectors_per_block); +	sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); +	sbi->blocksize = 1 << sbi->log_blocksize; +	sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); +	sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; +	sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); +	sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); +	sbi->total_sections = le32_to_cpu(raw_super->section_count); +	sbi->total_node_count = +		(le32_to_cpu(raw_super->segment_count_nat) / 2) +			* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; +	sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); +	sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); +	sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + +	for (i = 0; i < NR_COUNT_TYPE; i++) +		atomic_set(&sbi->nr_pages[i], 0); +} + +static int validate_superblock(struct super_block *sb, +		struct f2fs_super_block **raw_super, +		struct buffer_head **raw_super_buf, sector_t block) +{ +	const char *super = (block == 0 ? "first" : "second"); + +	/* read f2fs raw super block */ +	*raw_super_buf = sb_bread(sb, block); +	if (!*raw_super_buf) { +		f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", +				super); +		return 1; +	} + +	*raw_super = (struct f2fs_super_block *) +		((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + +	/* sanity checking of raw super */ +	if (!sanity_check_raw_super(sb, *raw_super)) +		return 0; + +	f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " +				"in %s superblock", super); +	return 1; +} + +static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +{ +	struct f2fs_sb_info *sbi; +	struct f2fs_super_block *raw_super; +	struct buffer_head *raw_super_buf; +	struct inode *root; +	long err = -EINVAL; +	int i; + +	/* allocate memory for f2fs-specific super block info */ +	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); +	if (!sbi) +		return -ENOMEM; + +	/* set a block size */ +	if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { +		f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); +		goto free_sbi; +	} + +	if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { +		brelse(raw_super_buf); +		if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) +			goto free_sb_buf; +	} +	/* init some FS parameters */ +	sbi->active_logs = NR_CURSEG_TYPE; + +	set_opt(sbi, BG_GC); + +#ifdef CONFIG_F2FS_FS_XATTR +	set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL +	set_opt(sbi, POSIX_ACL); +#endif +	/* parse mount options */ +	if (parse_options(sb, sbi, (char *)data)) +		goto free_sb_buf; + +	sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); +	sb->s_max_links = F2FS_LINK_MAX; +	get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + +	sb->s_op = &f2fs_sops; +	sb->s_xattr = f2fs_xattr_handlers; +	sb->s_export_op = &f2fs_export_ops; +	sb->s_magic = F2FS_SUPER_MAGIC; +	sb->s_fs_info = sbi; +	sb->s_time_gran = 1; +	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +		(test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); +	memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + +	/* init f2fs-specific super block info */ +	sbi->sb = sb; +	sbi->raw_super = raw_super; +	sbi->raw_super_buf = raw_super_buf; +	mutex_init(&sbi->gc_mutex); +	mutex_init(&sbi->write_inode); +	mutex_init(&sbi->writepages); +	mutex_init(&sbi->cp_mutex); +	for (i = 0; i < NR_LOCK_TYPE; i++) +		mutex_init(&sbi->fs_lock[i]); +	sbi->por_doing = 0; +	spin_lock_init(&sbi->stat_lock); +	init_rwsem(&sbi->bio_sem); +	init_sb_info(sbi); + +	/* get an inode for meta space */ +	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); +	if (IS_ERR(sbi->meta_inode)) { +		f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); +		err = PTR_ERR(sbi->meta_inode); +		goto free_sb_buf; +	} + +	err = get_valid_checkpoint(sbi); +	if (err) { +		f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); +		goto free_meta_inode; +	} + +	/* sanity checking of checkpoint */ +	err = -EINVAL; +	if (sanity_check_ckpt(sbi)) { +		f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); +		goto free_cp; +	} + +	sbi->total_valid_node_count = +				le32_to_cpu(sbi->ckpt->valid_node_count); +	sbi->total_valid_inode_count = +				le32_to_cpu(sbi->ckpt->valid_inode_count); +	sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); +	sbi->total_valid_block_count = +				le64_to_cpu(sbi->ckpt->valid_block_count); +	sbi->last_valid_block_count = sbi->total_valid_block_count; +	sbi->alloc_valid_block_count = 0; +	INIT_LIST_HEAD(&sbi->dir_inode_list); +	spin_lock_init(&sbi->dir_inode_lock); + +	init_orphan_info(sbi); + +	/* setup f2fs internal modules */ +	err = build_segment_manager(sbi); +	if (err) { +		f2fs_msg(sb, KERN_ERR, +			"Failed to initialize F2FS segment manager"); +		goto free_sm; +	} +	err = build_node_manager(sbi); +	if (err) { +		f2fs_msg(sb, KERN_ERR, +			"Failed to initialize F2FS node manager"); +		goto free_nm; +	} + +	build_gc_manager(sbi); + +	/* get an inode for node space */ +	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); +	if (IS_ERR(sbi->node_inode)) { +		f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); +		err = PTR_ERR(sbi->node_inode); +		goto free_nm; +	} + +	/* if there are any orphan nodes, free them */ +	err = -EINVAL; +	if (recover_orphan_inodes(sbi)) +		goto free_node_inode; + +	/* read root inode and dentry */ +	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); +	if (IS_ERR(root)) { +		f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); +		err = PTR_ERR(root); +		goto free_node_inode; +	} +	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) +		goto free_root_inode; + +	sb->s_root = d_make_root(root); /* allocate root dentry */ +	if (!sb->s_root) { +		err = -ENOMEM; +		goto free_root_inode; +	} + +	/* recover fsynced data */ +	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) +		recover_fsync_data(sbi); + +	/* After POR, we can run background GC thread */ +	err = start_gc_thread(sbi); +	if (err) +		goto fail; + +	err = f2fs_build_stats(sbi); +	if (err) +		goto fail; + +	return 0; +fail: +	stop_gc_thread(sbi); +free_root_inode: +	dput(sb->s_root); +	sb->s_root = NULL; +free_node_inode: +	iput(sbi->node_inode); +free_nm: +	destroy_node_manager(sbi); +free_sm: +	destroy_segment_manager(sbi); +free_cp: +	kfree(sbi->ckpt); +free_meta_inode: +	make_bad_inode(sbi->meta_inode); +	iput(sbi->meta_inode); +free_sb_buf: +	brelse(raw_super_buf); +free_sbi: +	kfree(sbi); +	return err; +} + +static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, +			const char *dev_name, void *data) +{ +	return
mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); +} + +static struct file_system_type f2fs_fs_type = { +	.owner		= THIS_MODULE, +	.name		= "f2fs", +	.mount		= f2fs_mount, +	.kill_sb	= kill_block_super, +	.fs_flags	= FS_REQUIRES_DEV, +}; + +static int __init init_inodecache(void) +{ +	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", +			sizeof(struct f2fs_inode_info), NULL); +	if (f2fs_inode_cachep == NULL) +		return -ENOMEM; +	return 0; +} + +static void destroy_inodecache(void) +{ +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier(); +	kmem_cache_destroy(f2fs_inode_cachep); +} + +static int __init init_f2fs_fs(void) +{ +	int err; + +	err = init_inodecache(); +	if (err) +		goto fail; +	err = create_node_manager_caches(); +	if (err) +		goto fail; +	err = create_gc_caches(); +	if (err) +		goto fail; +	err = create_checkpoint_caches(); +	if (err) +		goto fail; +	err = register_filesystem(&f2fs_fs_type); +	if (err) +		goto fail; +	f2fs_create_root_stats(); +fail: +	return err; +} + +static void __exit exit_f2fs_fs(void) +{ +	f2fs_destroy_root_stats(); +	unregister_filesystem(&f2fs_fs_type); +	destroy_checkpoint_caches(); +	destroy_gc_caches(); +	destroy_node_manager_caches(); +	destroy_inodecache(); +} + +module_init(init_f2fs_fs) +module_exit(exit_f2fs_fs) + +MODULE_AUTHOR("Samsung Electronics's Praesto Team"); +MODULE_DESCRIPTION("Flash Friendly File System"); +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c new file mode 100644 index 00000000000..8038c049650 --- /dev/null +++ b/fs/f2fs/xattr.c @@ -0,0 +1,443 @@ +/* + * fs/f2fs/xattr.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de> + * + * Fix by Harrison Xing <harrison@mountainviewdata.com>. + * Extended attributes for symlinks and special files added per + *  suggestion of Luka Renko <luka.renko@hermes.si>. + * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, + *  Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
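The handlers defined below route each xattr namespace through a common ->list/->get/->set contract. The ->list callback appends "prefix.name" plus a trailing NUL to the caller's buffer, and reports the length the entry needs even when no buffer is supplied, which is how listxattr() buffer sizing works. A userspace sketch of that contract (list_one() is a hypothetical helper for illustration, not a kernel function):

#include <stdio.h>
#include <string.h>

/* Emit "prefixname\0" into list if it fits; always return the length the
 * entry needs, so a NULL list can be used as a pure sizing pass. */
static size_t list_one(char *list, size_t list_size,
		       const char *prefix, const char *name)
{
	size_t prefix_len = strlen(prefix);
	size_t name_len = strlen(name);
	size_t total_len = prefix_len + name_len + 1;

	if (list && total_len <= list_size) {
		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len);
		list[prefix_len + name_len] = '\0';
	}
	return total_len;
}

int main(void)
{
	char buf[64];
	size_t need = list_one(NULL, 0, "user.", "foo");	/* sizing pass */

	list_one(buf, sizeof(buf), "user.", "foo");
	printf("need %zu bytes, wrote \"%s\"\n", need, buf);
	return 0;
}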
+ */ +#include <linux/rwsem.h> +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "xattr.h" + +static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); +	int total_len, prefix_len = 0; +	const char *prefix = NULL; + +	switch (type) { +	case F2FS_XATTR_INDEX_USER: +		if (!test_opt(sbi, XATTR_USER)) +			return -EOPNOTSUPP; +		prefix = XATTR_USER_PREFIX; +		prefix_len = XATTR_USER_PREFIX_LEN; +		break; +	case F2FS_XATTR_INDEX_TRUSTED: +		if (!capable(CAP_SYS_ADMIN)) +			return -EPERM; +		prefix = XATTR_TRUSTED_PREFIX; +		prefix_len = XATTR_TRUSTED_PREFIX_LEN; +		break; +	default: +		return -EINVAL; +	} + +	total_len = prefix_len + name_len + 1; +	if (list && total_len <= list_size) { +		memcpy(list, prefix, prefix_len); +		memcpy(list+prefix_len, name, name_len); +		list[prefix_len + name_len] = '\0'; +	} +	return total_len; +} + +static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, +		void *buffer, size_t size, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + +	switch (type) { +	case F2FS_XATTR_INDEX_USER: +		if (!test_opt(sbi, XATTR_USER)) +			return -EOPNOTSUPP; +		break; +	case F2FS_XATTR_INDEX_TRUSTED: +		if (!capable(CAP_SYS_ADMIN)) +			return -EPERM; +		break; +	default: +		return -EINVAL; +	} +	if (strcmp(name, "") == 0) +		return -EINVAL; +	return f2fs_getxattr(dentry->d_inode, type, name, +			buffer, size); +} + +static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, +		const void *value, size_t size, int flags, int type) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + +	switch (type) { +	case F2FS_XATTR_INDEX_USER: +		if (!test_opt(sbi, XATTR_USER)) +			return -EOPNOTSUPP; +		break; +	case F2FS_XATTR_INDEX_TRUSTED: +		if (!capable(CAP_SYS_ADMIN)) +			return -EPERM; +		break; +	default: +		return -EINVAL; +	} +	if (strcmp(name, "") == 0) +		return -EINVAL; + +	return f2fs_setxattr(dentry->d_inode, type, name, value, size); +} + +static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; +	size_t size; + +	if (type != F2FS_XATTR_INDEX_ADVISE) +		return 0; + +	size = strlen(xname) + 1; +	if (list && size <= list_size) +		memcpy(list, xname, size); +	return size; +} + +static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, +		void *buffer, size_t size, int type) +{ +	struct inode *inode = dentry->d_inode; + +	if (strcmp(name, "") != 0) +		return -EINVAL; + +	*((char *)buffer) = F2FS_I(inode)->i_advise; +	return sizeof(char); +} + +static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, +		const void *value, size_t size, int flags, int type) +{ +	struct inode *inode = dentry->d_inode; + +	if (strcmp(name, "") != 0) +		return -EINVAL; +	if (!inode_owner_or_capable(inode)) +		return -EPERM; +	if (value == NULL) +		return -EINVAL; + +	F2FS_I(inode)->i_advise |= *(char *)value; +	return 0; +} + +const struct xattr_handler f2fs_xattr_user_handler = { +	.prefix	= XATTR_USER_PREFIX, +	.flags	= F2FS_XATTR_INDEX_USER, +	.list	= f2fs_xattr_generic_list, +	.get	= f2fs_xattr_generic_get, +	.set	= f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_trusted_handler = { +	.prefix	= XATTR_TRUSTED_PREFIX, +	.flags	= F2FS_XATTR_INDEX_TRUSTED, +	.list	= f2fs_xattr_generic_list, +	.get	= f2fs_xattr_generic_get, +	
.set	= f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_advise_handler = { +	.prefix = F2FS_SYSTEM_ADVISE_PREFIX, +	.flags	= F2FS_XATTR_INDEX_ADVISE, +	.list   = f2fs_xattr_advise_list, +	.get    = f2fs_xattr_advise_get, +	.set    = f2fs_xattr_advise_set, +}; + +static const struct xattr_handler *f2fs_xattr_handler_map[] = { +	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL +	[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, +	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, +#endif +	[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +	[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, +}; + +const struct xattr_handler *f2fs_xattr_handlers[] = { +	&f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL +	&f2fs_xattr_acl_access_handler, +	&f2fs_xattr_acl_default_handler, +#endif +	&f2fs_xattr_trusted_handler, +	&f2fs_xattr_advise_handler, +	NULL, +}; + +static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) +{ +	const struct xattr_handler *handler = NULL; + +	if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) +		handler = f2fs_xattr_handler_map[name_index]; +	return handler; +} + +int f2fs_getxattr(struct inode *inode, int name_index, const char *name, +		void *buffer, size_t buffer_size) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct f2fs_inode_info *fi = F2FS_I(inode); +	struct f2fs_xattr_entry *entry; +	struct page *page; +	void *base_addr; +	int error = 0, found = 0; +	size_t value_len, name_len; + +	if (name == NULL) +		return -EINVAL; +	name_len = strlen(name); + +	if (!fi->i_xattr_nid) +		return -ENODATA; + +	page = get_node_page(sbi, fi->i_xattr_nid); +	base_addr = page_address(page); + +	list_for_each_xattr(entry, base_addr) { +		if (entry->e_name_index != name_index) +			continue; +		if (entry->e_name_len != name_len) +			continue; +		if (!memcmp(entry->e_name, name, name_len)) { +			found = 1; +			break; +		} +	} +	if (!found) { +		error = -ENODATA; +		goto cleanup; +	} + +	value_len = le16_to_cpu(entry->e_value_size); + +	if (buffer && value_len > buffer_size) { +		error = -ERANGE; +		goto cleanup; +	} + +	if (buffer) { +		char *pval = entry->e_name + entry->e_name_len; +		memcpy(buffer, pval, value_len); +	} +	error = value_len; + +cleanup: +	f2fs_put_page(page, 1); +	return error; +} + +ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ +	struct inode *inode = dentry->d_inode; +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	struct f2fs_inode_info *fi = F2FS_I(inode); +	struct f2fs_xattr_entry *entry; +	struct page *page; +	void *base_addr; +	int error = 0; +	size_t rest = buffer_size; + +	if (!fi->i_xattr_nid) +		return 0; + +	page = get_node_page(sbi, fi->i_xattr_nid); +	base_addr = page_address(page); + +	list_for_each_xattr(entry, base_addr) { +		const struct xattr_handler *handler = +			f2fs_xattr_handler(entry->e_name_index); +		size_t size; + +		if (!handler) +			continue; + +		size = handler->list(dentry, buffer, rest, entry->e_name, +				entry->e_name_len, handler->flags); +		if (buffer && size > rest) { +			error = -ERANGE; +			goto cleanup; +		} + +		if (buffer) +			buffer += size; +		rest -= size; +	} +	error = buffer_size - rest; +cleanup: +	f2fs_put_page(page, 1); +	return error; +} + +int f2fs_setxattr(struct inode *inode, int name_index, const char *name, +					const void *value, size_t value_len) +{ +	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +	
struct f2fs_inode_info *fi = F2FS_I(inode); +	struct f2fs_xattr_header *header = NULL; +	struct f2fs_xattr_entry *here, *last; +	struct page *page; +	void *base_addr; +	int error, found, free, newsize; +	size_t name_len; +	char *pval; + +	if (name == NULL) +		return -EINVAL; +	name_len = strlen(name); + +	if (value == NULL) +		value_len = 0; + +	if (name_len > 255 || value_len > MAX_VALUE_LEN) +		return -ERANGE; + +	f2fs_balance_fs(sbi); + +	mutex_lock_op(sbi, NODE_NEW); +	if (!fi->i_xattr_nid) { +		/* Allocate new attribute block */ +		struct dnode_of_data dn; + +		if (!alloc_nid(sbi, &fi->i_xattr_nid)) { +			mutex_unlock_op(sbi, NODE_NEW); +			return -ENOSPC; +		} +		set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); +		mark_inode_dirty(inode); + +		page = new_node_page(&dn, XATTR_NODE_OFFSET); +		if (IS_ERR(page)) { +			alloc_nid_failed(sbi, fi->i_xattr_nid); +			fi->i_xattr_nid = 0; +			mutex_unlock_op(sbi, NODE_NEW); +			return PTR_ERR(page); +		} + +		alloc_nid_done(sbi, fi->i_xattr_nid); +		base_addr = page_address(page); +		header = XATTR_HDR(base_addr); +		header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); +		header->h_refcount = cpu_to_le32(1); +	} else { +		/* The inode already has an extended attribute block. */ +		page = get_node_page(sbi, fi->i_xattr_nid); +		if (IS_ERR(page)) { +			mutex_unlock_op(sbi, NODE_NEW); +			return PTR_ERR(page); +		} + +		base_addr = page_address(page); +		header = XATTR_HDR(base_addr); +	} + +	if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { +		error = -EIO; +		goto cleanup; +	} + +	/* find the entry with the wanted name. */ +	found = 0; +	list_for_each_xattr(here, base_addr) { +		if (here->e_name_index != name_index) +			continue; +		if (here->e_name_len != name_len) +			continue; +		if (!memcmp(here->e_name, name, name_len)) { +			found = 1; +			break; +		} +	} + +	last = here; + +	while (!IS_XATTR_LAST_ENTRY(last)) +		last = XATTR_NEXT_ENTRY(last); + +	newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + +			name_len + value_len); + +	/* 1. Check space */ +	if (value) { +		/* If value is NULL, this is a remove operation. +		 * In case of an update operation, we calculate the free space. +		 */ +		free = MIN_OFFSET - ((char *)last - (char *)header); +		if (found) +			free = free - ENTRY_SIZE(here); + +		if (free < newsize) { +			error = -ENOSPC; +			goto cleanup; +		} +	} + +	/* 2. Remove old entry */ +	if (found) { +		/* If the entry is found, remove the old entry. +		 * If not found, no removal is needed. +		 */ +		struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); +		int oldsize = ENTRY_SIZE(here); + +		memmove(here, next, (char *)last - (char *)next); +		last = (struct f2fs_xattr_entry *)((char *)last - oldsize); +		memset(last, 0, oldsize); +	} + +	/* 3. Write new entry */ +	if (value) { +		/* Before we come here, the old entry has been removed. +		 * We just write the new entry. 
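Step 2 above compacts the attribute block in place: every entry after the victim slides down by the victim's size, and the vacated tail is zeroed so the all-zero terminator that IS_XATTR_LAST_ENTRY() looks for stays valid. The same move on a toy packed-record buffer (the one-byte-length record format is purely illustrative):

#include <stdio.h>
#include <string.h>

/* Records are [len byte][payload]; a zero length byte terminates the
 * list, much like an all-zero f2fs_xattr_entry ends the xattr list. */
static void remove_entry(unsigned char *here, unsigned char *last)
{
	size_t oldsize = 1 + here[0];
	unsigned char *next = here + oldsize;

	memmove(here, next, (size_t)(last - next));	/* slide tail down */
	memset(last - oldsize, 0, oldsize);		/* rezero the gap */
}

int main(void)
{
	/* "ab", "cde", "f", then the terminator */
	unsigned char buf[16] = { 2, 'a', 'b', 3, 'c', 'd', 'e', 1, 'f', 0 };

	remove_entry(buf + 3, buf + 9);		/* drop the middle record */
	printf("%c%c %c\n", buf[1], buf[2], buf[4]);	/* prints "ab f" */
	return 0;
}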
*/ +		memset(last, 0, newsize); +		last->e_name_index = name_index; +		last->e_name_len = name_len; +		memcpy(last->e_name, name, name_len); +		pval = last->e_name + name_len; +		memcpy(pval, value, value_len); +		last->e_value_size = cpu_to_le16(value_len); +	} + +	set_page_dirty(page); +	f2fs_put_page(page, 1); + +	if (is_inode_flag_set(fi, FI_ACL_MODE)) { +		inode->i_mode = fi->i_acl_mode; +		inode->i_ctime = CURRENT_TIME; +		clear_inode_flag(fi, FI_ACL_MODE); +	} +	f2fs_write_inode(inode, NULL); +	mutex_unlock_op(sbi, NODE_NEW); + +	return 0; +cleanup: +	f2fs_put_page(page, 1); +	mutex_unlock_op(sbi, NODE_NEW); +	return error; +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h new file mode 100644 index 00000000000..49c9558305e --- /dev/null +++ b/fs/f2fs/xattr.h @@ -0,0 +1,145 @@ +/* + * fs/f2fs/xattr.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + *             http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.h + * + * On-disk format of extended attributes for the ext2 filesystem. + * + * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_XATTR_H__ +#define __F2FS_XATTR_H__ + +#include <linux/init.h> +#include <linux/xattr.h> + +/* Magic value in attribute blocks */ +#define F2FS_XATTR_MAGIC                0xF2F52011 + +/* Maximum number of references to one attribute block */ +#define F2FS_XATTR_REFCOUNT_MAX         1024 + +/* Name indexes */ +#define F2FS_SYSTEM_ADVISE_PREFIX		"system.advise" +#define F2FS_XATTR_INDEX_USER			1 +#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS	2 +#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT	3 +#define F2FS_XATTR_INDEX_TRUSTED		4 +#define F2FS_XATTR_INDEX_LUSTRE			5 +#define F2FS_XATTR_INDEX_SECURITY		6 +#define F2FS_XATTR_INDEX_ADVISE			7 + +struct f2fs_xattr_header { +	__le32  h_magic;        /* magic number for identification */ +	__le32  h_refcount;     /* reference count */ +	__u32   h_reserved[4];  /* zero right now */ +}; + +struct f2fs_xattr_entry { +	__u8    e_name_index; +	__u8    e_name_len; +	__le16  e_value_size;   /* size of attribute value */ +	char    e_name[0];      /* attribute name */ +}; + +#define XATTR_HDR(ptr)		((struct f2fs_xattr_header *)(ptr)) +#define XATTR_ENTRY(ptr)	((struct f2fs_xattr_entry *)(ptr)) +#define XATTR_FIRST_ENTRY(ptr)	(XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_ROUND		(3) + +#define XATTR_ALIGN(size)	((size + XATTR_ROUND) & ~XATTR_ROUND) + +#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ +			entry->e_name_len + le16_to_cpu(entry->e_value_size))) + +#define XATTR_NEXT_ENTRY(entry)	((struct f2fs_xattr_entry *)((char *)(entry) +\ +			ENTRY_SIZE(entry))) + +#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define list_for_each_xattr(entry, addr) \ +		for (entry = XATTR_FIRST_ENTRY(addr);\ +				!IS_XATTR_LAST_ENTRY(entry);\ +				entry = XATTR_NEXT_ENTRY(entry)) + + +#define MIN_OFFSET	XATTR_ALIGN(PAGE_SIZE - \ +			sizeof(struct node_footer) - \ +			sizeof(__u32)) + +#define MAX_VALUE_LEN	(MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ +			sizeof(struct f2fs_xattr_entry)) + +/* + * On-disk structure of f2fs_xattr + * We use only 1 block for xattr. 
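The macros above fully determine the traversal: an entry occupies its fixed header plus name plus value, rounded up to 4 bytes, and a 4-byte zero word marks the end of the list. A standalone sketch of that walk, with the struct trimmed to the fields the loop touches and host-endian fields instead of the on-disk __le16 (assumptions for illustration, not the kernel definitions):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct entry {
	uint8_t  name_index;
	uint8_t  name_len;
	uint16_t value_size;
	char     name[];	/* name bytes, then value bytes */
};

#define ALIGN4(x)   (((x) + 3) & ~(size_t)3)
#define ENTRY_SZ(e) ALIGN4(sizeof(struct entry) + (e)->name_len + (e)->value_size)
#define NEXT(e)     ((struct entry *)((char *)(e) + ENTRY_SZ(e)))
#define IS_LAST(e)  (*(uint32_t *)(e) == 0)	/* 4-byte zero terminator */

int main(void)
{
	unsigned char blk[64] = { 0 };
	struct entry *e = (struct entry *)blk;

	e->name_index = 1;
	e->name_len = 3;
	e->value_size = 5;
	memcpy(e->name, "foovalue", 8);	/* "foo" plus a 5-byte value */

	for (e = (struct entry *)blk; !IS_LAST(e); e = NEXT(e))
		printf("index %u, name %.*s, %u value bytes\n",
		       e->name_index, e->name_len, e->name, e->value_size);
	return 0;
}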
+ * + * +--------------------+ + * | f2fs_xattr_header  | + * |                    | + * +--------------------+ + * | f2fs_xattr_entry   | + * | .e_name_index = 1  | + * | .e_name_len = 3    | + * | .e_value_size = 14 | + * | .e_name = "foo"    | + * | "value_of_xattr"   |<- value_offs = e_name + e_name_len + * +--------------------+ + * | f2fs_xattr_entry   | + * | .e_name_index = 4  | + * | .e_name = "bar"    | + * +--------------------+ + * |                    | + * |        Free        | + * |                    | + * +--------------------+<- MIN_OFFSET + * |   node_footer      | + * | (nid, ino, offset) | + * +--------------------+ + * + **/ + +#ifdef CONFIG_F2FS_FS_XATTR +extern const struct xattr_handler f2fs_xattr_user_handler; +extern const struct xattr_handler f2fs_xattr_trusted_handler; +extern const struct xattr_handler f2fs_xattr_acl_access_handler; +extern const struct xattr_handler f2fs_xattr_acl_default_handler; +extern const struct xattr_handler f2fs_xattr_advise_handler; + +extern const struct xattr_handler *f2fs_xattr_handlers[]; + +extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, +		const void *value, size_t value_len); +extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, +		void *buffer, size_t buffer_size); +extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, +		size_t buffer_size); + +#else + +#define f2fs_xattr_handlers	NULL +static inline int f2fs_setxattr(struct inode *inode, int name_index, +	const char *name, const void *value, size_t value_len) +{ +	return -EOPNOTSUPP; +} +static inline int f2fs_getxattr(struct inode *inode, int name_index, +		const char *name, void *buffer, size_t buffer_size) +{ +	return -EOPNOTSUPP; +} +static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, +		size_t buffer_size) +{ +	return -EOPNOTSUPP; +} +#endif + +#endif /* __F2FS_XATTR_H__ */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 2a182342442..165012ef363 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -461,8 +461,7 @@ static int fat_parse_short(struct super_block *sb,  }  /* - * Return values: negative -> error, 0 -> not found, positive -> found, - * value is the total amount of slots, including the shortname entry. + * Return values: negative -> error/not found, 0 -> found.   
*/  int fat_search_long(struct inode *inode, const unsigned char *name,  		    int name_len, struct fat_slot_info *sinfo) @@ -699,7 +698,7 @@ out:  static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return __fat_readdir(inode, filp, dirent, filldir, 0, 0);  } @@ -780,7 +779,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,  static long fat_dir_ioctl(struct file *filp, unsigned int cmd,  			  unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;  	int short_only, both; @@ -820,7 +819,7 @@ FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)  static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,  				 unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct compat_dirent __user *d1 = compat_ptr(arg);  	int short_only, both; @@ -1255,7 +1254,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,  	sinfo->nr_slots = nr_slots; -	/* First stage: search free direcotry entries */ +	/* First stage: search free directory entries */  	free_slots = nr_bhs = 0;  	bh = prev = NULL;  	pos = 0; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 12701a56775..e9cc3f0d58e 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -95,6 +95,8 @@ struct msdos_sb_info {  	spinlock_t dir_hash_lock;  	struct hlist_head dir_hashtable[FAT_HASH_SIZE]; + +	unsigned int dirty;           /* fs state before mount */  };  #define FAT_CACHE_VALID	0	/* special case for valid cache */ diff --git a/fs/fat/file.c b/fs/fat/file.c index a62e0ecbe2d..3978f8ca182 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -32,7 +32,7 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)  static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);  	int is_dir = S_ISDIR(inode->i_mode);  	u32 attr, oldattr; @@ -116,7 +116,7 @@ out:  long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	u32 __user *user_attr = (u32 __user *)arg;  	switch (cmd) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 35806813ea4..acf6e479b44 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -341,12 +341,11 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos)  {  	struct msdos_sb_info *sbi = MSDOS_SB(sb);  	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); -	struct hlist_node *_p;  	struct msdos_inode_info *i;  	struct inode *inode = NULL;  	spin_lock(&sbi->inode_hash_lock); -	hlist_for_each_entry(i, _p, head, i_fat_hash) { +	hlist_for_each_entry(i, head, i_fat_hash) {  		BUG_ON(i->vfs_inode.i_sb != sb);  		if (i->i_pos != i_pos)  			continue; @@ -488,10 +487,59 @@ static void fat_evict_inode(struct inode *inode)  	fat_detach(inode);  } +static void fat_set_state(struct super_block *sb, +			unsigned int set, unsigned int force) +{ +	struct buffer_head *bh; +	struct fat_boot_sector *b; +	struct msdos_sb_info *sbi = sb->s_fs_info; + +	/* do not change anything if mounted read only */ +	if ((sb->s_flags & MS_RDONLY) && !force) +		return; + +	/* do 
not change state if fs was dirty */ +	if (sbi->dirty) { +		/* warn only on set (mount). */ +		if (set) +			fat_msg(sb, KERN_WARNING, "Volume was not properly " +				"unmounted. Some data may be corrupt. " +				"Please run fsck."); +		return; +	} + +	bh = sb_bread(sb, 0); +	if (bh == NULL) { +		fat_msg(sb, KERN_ERR, "unable to read boot sector " +			"to mark fs as dirty"); +		return; +	} + +	b = (struct fat_boot_sector *) bh->b_data; + +	if (sbi->fat_bits == 32) { +		if (set) +			b->fat32.state |= FAT_STATE_DIRTY; +		else +			b->fat32.state &= ~FAT_STATE_DIRTY; +	} else /* fat 16 and 12 */ { +		if (set) +			b->fat16.state |= FAT_STATE_DIRTY; +		else +			b->fat16.state &= ~FAT_STATE_DIRTY; +	} + +	mark_buffer_dirty(bh); +	sync_dirty_buffer(bh); +	brelse(bh); +} +  static void fat_put_super(struct super_block *sb)  {  	struct msdos_sb_info *sbi = MSDOS_SB(sb); +	fat_set_state(sb, 0, 0); +  	iput(sbi->fsinfo_inode);  	iput(sbi->fat_inode); @@ -566,8 +614,18 @@ static void __exit fat_destroy_inodecache(void)  static int fat_remount(struct super_block *sb, int *flags, char *data)  { +	int new_rdonly;  	struct msdos_sb_info *sbi = MSDOS_SB(sb);  	*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + +	/* make sure we update state on remount. */ +	new_rdonly = *flags & MS_RDONLY; +	if (new_rdonly != (sb->s_flags & MS_RDONLY)) { +		if (new_rdonly) +			fat_set_state(sb, 0, 0); +		else +			fat_set_state(sb, 1, 1); +	}  	return 0;  } @@ -1298,17 +1356,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,  	sbi->prev_free = FAT_START_ENT;  	sb->s_maxbytes = 0xffffffff; -	if (!sbi->fat_length && b->fat32_length) { +	if (!sbi->fat_length && b->fat32.length) {  		struct fat_boot_fsinfo *fsinfo;  		struct buffer_head *fsinfo_bh;  		/* Must be FAT32 */  		sbi->fat_bits = 32; -		sbi->fat_length = le32_to_cpu(b->fat32_length); -		sbi->root_cluster = le32_to_cpu(b->root_cluster); +		sbi->fat_length = le32_to_cpu(b->fat32.length); +		sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);  		/* MC - if info_sector is 0, don't multiply by 0 */ -		sbi->fsinfo_sector = le16_to_cpu(b->info_sector); +		sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);  		if (sbi->fsinfo_sector == 0)  			sbi->fsinfo_sector = 1; @@ -1344,7 +1402,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,  	sbi->dir_entries = get_unaligned_le16(&b->dir_entries);  	if (sbi->dir_entries & (sbi->dir_per_block - 1)) {  		if (!silent) -			fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" +			fat_msg(sb, KERN_ERR, "bogus directory-entries per block"  			       " (%u)", sbi->dir_entries);  		brelse(bh);  		goto out_invalid; @@ -1362,6 +1420,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,  	if (sbi->fat_bits != 32)  		sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; +	/* some OSes set FAT_STATE_DIRTY and clean it on unmount. 
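fat_set_state() earlier in this file flips a single FAT_STATE_DIRTY bit in the boot sector's state byte, and fat_fill_super() samples that byte here before deciding whether to warn about an unclean volume. A userspace sketch of the same bit handling; the 37/65 byte offsets are the commonly documented BPB reserved-byte positions for FAT12/16 and FAT32, assumed here rather than taken from this patch:

#include <stdio.h>
#include <stdint.h>

#define FAT_STATE_DIRTY 0x01

/* Assumed boot-sector offsets of the state byte (BS_Reserved1). */
#define FAT16_STATE_OFF 37	/* FAT12/16 layout */
#define FAT32_STATE_OFF 65	/* FAT32 layout */

static void set_state(uint8_t *bootsec, int fat_bits, int set)
{
	size_t off = (fat_bits == 32) ? FAT32_STATE_OFF : FAT16_STATE_OFF;

	if (set)
		bootsec[off] |= FAT_STATE_DIRTY;	/* mount: mark dirty */
	else
		bootsec[off] &= ~FAT_STATE_DIRTY;	/* clean unmount */
}

int main(void)
{
	uint8_t bootsec[512] = { 0 };

	set_state(bootsec, 32, 1);
	printf("mounted, state %#x\n", bootsec[FAT32_STATE_OFF]);
	set_state(bootsec, 32, 0);
	printf("unmounted, state %#x\n", bootsec[FAT32_STATE_OFF]);
	return 0;
}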
*/ +	if (sbi->fat_bits == 32) +		sbi->dirty = b->fat32.state & FAT_STATE_DIRTY; +	else /* fat 16 or 12 */ +		sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; +  	/* check that FAT table does not overflow */  	fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;  	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); @@ -1456,6 +1520,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,  					"the device does not support discard");  	} +	fat_set_state(sb, 1, 0);  	return 0;  out_invalid: diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 5eb600dc43a..359d307b550 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -135,6 +135,10 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)  		}  		if (ret < 0)  			return ret; +		/* +		 * FIXME:Although we can add this cache, fat_cache_add() is +		 * assuming to be called after linear search with fat_cache_id. +		 */  //		fat_cache_add(inode, new_fclus, new_dclus);  	} else {  		MSDOS_I(inode)->i_start = new_dclus; diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index ef4b5faba87..499c10438ca 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -21,13 +21,12 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart)  {  	struct msdos_sb_info *sbi = MSDOS_SB(sb);  	struct hlist_head *head; -	struct hlist_node *_p;  	struct msdos_inode_info *i;  	struct inode *inode = NULL;  	head = sbi->dir_hashtable + fat_dir_hash(i_logstart);  	spin_lock(&sbi->dir_hash_lock); -	hlist_for_each_entry(i, _p, head, i_dir_hash) { +	hlist_for_each_entry(i, head, i_dir_hash) {  		BUG_ON(i->vfs_inode.i_sb != sb);  		if (i->i_logstart != i_logstart)  			continue; diff --git a/fs/fcntl.c b/fs/fcntl.c index 71a600a19f0..6599222536e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -30,7 +30,7 @@  static int setfl(int fd, struct file * filp, unsigned long arg)  { -	struct inode * inode = filp->f_path.dentry->d_inode; +	struct inode * inode = file_inode(filp);  	int error = 0;  	/* diff --git a/fs/fhandle.c b/fs/fhandle.c index cccdc874bb5..999ff5c3cab 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,  	handle_bytes = handle_dwords * sizeof(u32);  	handle->handle_bytes = handle_bytes;  	if ((handle->handle_bytes > f_handle.handle_bytes) || -	    (retval == 255) || (retval == -ENOSPC)) { +	    (retval == FILEID_INVALID) || (retval == -ENOSPC)) {  		/* As per old exportfs_encode_fh documentation  		 * we could return ENOSPC to indicate overflow  		 * But file system returned 255 always. So handle diff --git a/fs/file.c b/fs/file.c index 15cb8618e95..3906d9577a1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk)  	}  } -static void __devinit fdtable_defer_list_init(int cpu) +static void fdtable_defer_list_init(int cpu)  {  	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);  	spin_lock_init(&fddef->lock); @@ -516,7 +516,7 @@ struct files_struct init_files = {  		.close_on_exec	= init_files.close_on_exec_init,  		.open_fds	= init_files.open_fds_init,  	}, -	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock), +	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),  };  /* diff --git a/fs/file_table.c b/fs/file_table.c index a72bf9ddd0d..cd4d87a8295 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -94,8 +94,8 @@ int proc_nr_files(ctl_table *table, int write,  #endif  /* Find an unused file structure and return a pointer to it. 
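The fs/file_table.c hunk below moves get_empty_filp() from returning NULL on failure to returning an ERR_PTR() code, so callers such as alloc_file() test IS_ERR(file) instead of !file and can propagate the real errno. A pared-down userspace rendering of the idiom (these helpers approximate the kernel's err.h; they are stand-ins, not the kernel definitions):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095	/* error codes live in the top page of the address space */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Toy allocator: fails with -ENFILE instead of returning NULL. */
static void *get_slot(int fail)
{
	static char slot[32];

	return fail ? ERR_PTR(-ENFILE) : (void *)slot;
}

int main(void)
{
	void *f = get_slot(1);

	if (IS_ERR(f))
		printf("allocation failed: errno %ld\n", -PTR_ERR(f));
	return 0;
}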
- * Returns NULL, if there are no more free file structures or - * we run out of memory. + * Returns an error pointer if some error happened, e.g. we are over the + * file structures limit, run out of memory or the operation is not permitted.   *   * Be very careful using this.  You are responsible for   * getting write access to any mount that you might assign @@ -107,7 +107,8 @@ struct file *get_empty_filp(void)  {  	const struct cred *cred = current_cred();  	static long old_max; -	struct file * f; +	struct file *f; +	int error;  	/*  	 * Privileged users can go above max_files @@ -122,13 +123,16 @@ struct file *get_empty_filp(void)  	}  	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); -	if (f == NULL) -		goto fail; +	if (unlikely(!f)) +		return ERR_PTR(-ENOMEM);  	percpu_counter_inc(&nr_files);  	f->f_cred = get_cred(cred); -	if (security_file_alloc(f)) -		goto fail_sec; +	error = security_file_alloc(f); +	if (unlikely(error)) { +		file_free(f); +		return ERR_PTR(error); +	}  	INIT_LIST_HEAD(&f->f_u.fu_list);  	atomic_long_set(&f->f_count, 1); @@ -144,12 +148,7 @@ over:  		pr_info("VFS: file-max limit %lu reached\n", get_max_files());  		old_max = get_nr_files();  	} -	goto fail; - -fail_sec: -	file_free(f); -fail: -	return NULL; +	return ERR_PTR(-ENFILE);  }  /** @@ -173,10 +172,11 @@ struct file *alloc_file(struct path *path, fmode_t mode,  	struct file *file;  	file = get_empty_filp(); -	if (!file) -		return NULL; +	if (IS_ERR(file)) +		return file;  	file->f_path = *path; +	file->f_inode = path->dentry->d_inode;  	file->f_mapping = path->dentry->d_inode->i_mapping;  	file->f_mode = mode;  	file->f_op = fop; @@ -259,6 +259,7 @@ static void __fput(struct file *file)  		drop_file_write_access(file);  	file->f_path.dentry = NULL;  	file->f_path.mnt = NULL; +	file->f_inode = NULL;  	file_free(file);  	dput(dentry);  	mntput(mnt); @@ -447,7 +448,7 @@ void mark_files_ro(struct super_block *sb)  	lg_global_lock(&files_lglock);  	do_file_list_for_each_entry(sb, f) { -		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) +		if (!S_ISREG(file_inode(f)->i_mode))  		       continue;  		if (!file_count(f))  			continue; @@ -458,8 +459,8 @@ void mark_files_ro(struct super_block *sb)  		spin_unlock(&f->f_lock);  		if (file_check_writeable(f) != 0)  			continue; +		__mnt_drop_write(f->f_path.mnt);  		file_release_write(f); -		mnt_drop_write_file(f);  	} while_file_list_for_each_entry;  	lg_global_unlock(&files_lglock);  } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index bd447e88f20..664b07a5387 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -237,7 +237,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)  static int  vxfs_readdir(struct file *fp, void *retp, filldir_t filler)  { -	struct inode		*ip = fp->f_path.dentry->d_inode; +	struct inode		*ip = file_inode(fp);  	struct super_block	*sbp = ip->i_sb;  	u_long			bsize = sbp->s_blocksize;  	u_long			page, npages, block, pblocks, nblocks, offset; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 310972b72a6..21f46fb3a10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)  static int write_inode(struct inode *inode, struct writeback_control *wbc)  { -	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) -		return inode->i_sb->s_op->write_inode(inode, wbc); +	int ret; + +	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { +		trace_writeback_write_inode_start(inode, wbc); 
+		ret = inode->i_sb->s_op->write_inode(inode, wbc); +		trace_writeback_write_inode(inode, wbc); +		return ret; +	}  	return 0;  } @@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)  	WARN_ON(!(inode->i_state & I_SYNC)); +	trace_writeback_single_inode_start(inode, wbc, nr_to_write); +  	ret = do_writepages(mapping, wbc);  	/* @@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)  	 * dirty the inode itself  	 */  	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { +		trace_writeback_dirty_inode_start(inode, flags); +  		if (sb->s_op->dirty_inode)  			sb->s_op->dirty_inode(inode, flags); + +		trace_writeback_dirty_inode(inode, flags);  	}  	/* @@ -1332,47 +1344,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)  EXPORT_SYMBOL(writeback_inodes_sb);  /** - * writeback_inodes_sb_if_idle	-	start writeback if none underway + * try_to_writeback_inodes_sb_nr - try to start writeback if none underway   * @sb: the superblock - * @reason: reason why some writeback work was initiated + * @nr: the number of pages to write + * @reason: the reason of writeback   * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Invoke writeback_inodes_sb_nr if no writeback is currently underway.   * Returns 1 if writeback was started, 0 if not.   */ -int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) +int try_to_writeback_inodes_sb_nr(struct super_block *sb, +				  unsigned long nr, +				  enum wb_reason reason)  { -	if (!writeback_in_progress(sb->s_bdi)) { -		down_read(&sb->s_umount); -		writeback_inodes_sb(sb, reason); -		up_read(&sb->s_umount); +	if (writeback_in_progress(sb->s_bdi))  		return 1; -	} else + +	if (!down_read_trylock(&sb->s_umount))  		return 0; + +	writeback_inodes_sb_nr(sb, nr, reason); +	up_read(&sb->s_umount); +	return 1;  } -EXPORT_SYMBOL(writeback_inodes_sb_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);  /** - * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway   * @sb: the superblock - * @nr: the number of pages to write   * @reason: reason why some writeback work was initiated   * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Implement by try_to_writeback_inodes_sb_nr()   * Returns 1 if writeback was started, 0 if not.   */ -int writeback_inodes_sb_nr_if_idle(struct super_block *sb, -				   unsigned long nr, -				   enum wb_reason reason) +int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)  { -	if (!writeback_in_progress(sb->s_bdi)) { -		down_read(&sb->s_umount); -		writeback_inodes_sb_nr(sb, nr, reason); -		up_read(&sb->s_umount); -		return 1; -	} else -		return 0; +	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);  } -EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb);  /**   * sync_inodes_sb	-	sync sb inode pages diff --git a/fs/fs_struct.c b/fs/fs_struct.c index fe6ca583bbc..d8ac61d0c93 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -10,7 +10,7 @@   * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.   * It can block.   */ -void set_fs_root(struct fs_struct *fs, struct path *path) +void set_fs_root(struct fs_struct *fs, const struct path *path)  {  	struct path old_root; @@ -29,7 +29,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path)   * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. 
Put the old values.   * It can block.   */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) +void set_fs_pwd(struct fs_struct *fs, const struct path *path)  {  	struct path old_pwd; @@ -53,7 +53,7 @@ static inline int replace_path(struct path *p, const struct path *old, const str  	return 1;  } -void chroot_fs_refs(struct path *old_root, struct path *new_root) +void chroot_fs_refs(const struct path *old_root, const struct path *new_root)  {  	struct task_struct *g, *p;  	struct fs_struct *fs; diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 6a3c48abd67..b52aed1dca9 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);   */  void fscache_io_error(struct fscache_cache *cache)  { -	set_bit(FSCACHE_IOERROR, &cache->flags); - -	printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", -	       cache->ops->name); +	if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) +		printk(KERN_ERR "FS-Cache:" +		       " Cache '%s' stopped due to I/O error\n", +		       cache->ops->name);  }  EXPORT_SYMBOL(fscache_io_error); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 990535071a8..e2cba1f60c2 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -237,13 +237,12 @@ static int fscache_alloc_object(struct fscache_cache *cache,  				struct fscache_cookie *cookie)  {  	struct fscache_object *object; -	struct hlist_node *_n;  	int ret;  	_enter("%p,%p{%s}", cache, cookie, cookie->def->name);  	spin_lock(&cookie->lock); -	hlist_for_each_entry(object, _n, &cookie->backing_objects, +	hlist_for_each_entry(object, &cookie->backing_objects,  			     cookie_link) {  		if (object->cache == cache)  			goto object_already_extant; @@ -311,7 +310,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie,  {  	struct fscache_object *p;  	struct fscache_cache *cache = object->cache; -	struct hlist_node *_n;  	int ret;  	_enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); @@ -321,7 +319,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,  	/* there may be multiple initial creations of this object, but we only  	 * want one */  	ret = -EEXIST; -	hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { +	hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {  		if (p->cache == object->cache) {  			if (p->state >= FSCACHE_OBJECT_DYING)  				ret = -ENOBUFS; @@ -331,7 +329,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,  	/* pin the parent object */  	spin_lock_nested(&cookie->parent->lock, 1); -	hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, +	hlist_for_each_entry(p, &cookie->parent->backing_objects,  			     cookie_link) {  		if (p->cache == object->cache) {  			if (p->state >= FSCACHE_OBJECT_DYING) { @@ -370,12 +368,71 @@ cant_attach_object:  }  /* + * Invalidate an object.  Callable with spinlocks held. + */ +void __fscache_invalidate(struct fscache_cookie *cookie) +{ +	struct fscache_object *object; + +	_enter("{%s}", cookie->def->name); + +	fscache_stat(&fscache_n_invalidates); + +	/* Only permit invalidation of data files.  Invalidating an index will +	 * require the caller to release all its attachments to the tree rooted +	 * there, and if it's doing that, it may as well just retire the +	 * cookie. +	 */ +	ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + +	/* We will be updating the cookie too. 
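__fscache_invalidate() above leans on test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, ...) so that only the first caller actually raises FSCACHE_OBJECT_EV_INVALIDATE; anyone racing in afterwards sees the bit already set and returns without queuing duplicate work. A C11-atomics sketch of that run-once guard (atomic_flag stands in for the kernel's bitops, and a real implementation would clear the flag again once invalidation completes):

#include <stdio.h>
#include <stdatomic.h>

static atomic_flag invalidating = ATOMIC_FLAG_INIT;

/* Only the first caller wins the flag and raises the event. */
static void invalidate(void)
{
	if (!atomic_flag_test_and_set(&invalidating))
		printf("raising INVALIDATE event\n");
	else
		printf("invalidation already in flight\n");
}

int main(void)
{
	invalidate();	/* raises the event */
	invalidate();	/* no-op: flag already set */
	return 0;
}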
*/ +	BUG_ON(!cookie->def->get_aux); + +	/* If there's an object, we tell the object state machine to handle the +	 * invalidation on our behalf, otherwise there's nothing to do. +	 */ +	if (!hlist_empty(&cookie->backing_objects)) { +		spin_lock(&cookie->lock); + +		if (!hlist_empty(&cookie->backing_objects) && +		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, +				      &cookie->flags)) { +			object = hlist_entry(cookie->backing_objects.first, +					     struct fscache_object, +					     cookie_link); +			if (object->state < FSCACHE_OBJECT_DYING) +				fscache_raise_event( +					object, FSCACHE_OBJECT_EV_INVALIDATE); +		} + +		spin_unlock(&cookie->lock); +	} + +	_leave(""); +} +EXPORT_SYMBOL(__fscache_invalidate); + +/* + * Wait for object invalidation to complete. + */ +void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) +{ +	_enter("%p", cookie); + +	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, +		    fscache_wait_bit_interruptible, +		    TASK_UNINTERRUPTIBLE); + +	_leave(""); +} +EXPORT_SYMBOL(__fscache_wait_on_invalidate); + +/*   * update the index entries backing a cookie   */  void __fscache_update_cookie(struct fscache_cookie *cookie)  {  	struct fscache_object *object; -	struct hlist_node *_p;  	fscache_stat(&fscache_n_updates); @@ -392,7 +449,7 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)  	spin_lock(&cookie->lock);  	/* update the index entry on disk in each cache backing this cookie */ -	hlist_for_each_entry(object, _p, +	hlist_for_each_entry(object,  			     &cookie->backing_objects, cookie_link) {  		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);  	} @@ -442,16 +499,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)  	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; +try_again:  	spin_lock(&cookie->lock);  	/* break links with all the active objects */  	while (!hlist_empty(&cookie->backing_objects)) { +		int n_reads;  		object = hlist_entry(cookie->backing_objects.first,  				     struct fscache_object,  				     cookie_link);  		_debug("RELEASE OBJ%x", object->debug_id); +		set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags); +		n_reads = atomic_read(&object->n_reads); +		if (n_reads) { +			int n_ops = object->n_ops; +			int n_in_progress = object->n_in_progress; +			spin_unlock(&cookie->lock); +			printk(KERN_ERR "FS-Cache:" +			       " Cookie '%s' still has %d outstanding reads (%d,%d)\n", +			       cookie->def->name, +			       n_reads, n_ops, n_in_progress); +			wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS, +				    fscache_wait_bit, TASK_UNINTERRUPTIBLE); +			printk("Wait finished\n"); +			goto try_again; +		} +  		/* detach each cache object from the object cookie */  		spin_lock(&object->lock);  		hlist_del_init(&object->cookie_link); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f6aad48d38a..ee38fef4be5 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,  				       struct fscache_operation *);  extern int fscache_submit_op(struct fscache_object *,  			     struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *, +			     void (*)(struct fscache_operation *)); +extern void fscache_cancel_all_ops(struct fscache_object *);  extern void fscache_abort_object(struct fscache_object *);  extern void fscache_start_operations(struct 
fscache_object *);  extern void fscache_operation_gc(struct work_struct *);  /* + * page.c + */ +extern void fscache_invalidate_writes(struct fscache_cookie *); + +/*   * proc.c   */  #ifdef CONFIG_PROC_FS @@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;  extern atomic_t fscache_n_store_vmscan_gone;  extern atomic_t fscache_n_store_vmscan_busy;  extern atomic_t fscache_n_store_vmscan_cancelled; +extern atomic_t fscache_n_store_vmscan_wait;  extern atomic_t fscache_n_marks;  extern atomic_t fscache_n_uncaches; @@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;  extern atomic_t fscache_n_acquires_nobufs;  extern atomic_t fscache_n_acquires_oom; +extern atomic_t fscache_n_invalidates; +extern atomic_t fscache_n_invalidates_run; +  extern atomic_t fscache_n_updates;  extern atomic_t fscache_n_updates_null;  extern atomic_t fscache_n_updates_run; @@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;  extern atomic_t fscache_n_cop_lookup_object;  extern atomic_t fscache_n_cop_lookup_complete;  extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_invalidate_object;  extern atomic_t fscache_n_cop_update_object;  extern atomic_t fscache_n_cop_drop_object;  extern atomic_t fscache_n_cop_put_object; @@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;  static inline void fscache_raise_event(struct fscache_object *object,  				       unsigned event)  { +	BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);  	if (!test_and_set_bit(event, &object->events) &&  	    test_bit(event, &object->event_mask))  		fscache_enqueue_object(object); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index ebe29c58138..f27c89d1788 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)  		   obj->n_in_progress,  		   obj->n_exclusive,  		   atomic_read(&obj->n_reads), -		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, +		   obj->event_mask,  		   obj->events,  		   obj->flags,  		   work_busy(&obj->work)); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index b6b897c550a..50d41c18021 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,6 +14,7 @@  #define FSCACHE_DEBUG_LEVEL COOKIE  #include <linux/module.h> +#include <linux/slab.h>  #include "internal.h"  const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {  	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",  	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",  	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE", +	[FSCACHE_OBJECT_INVALIDATING]	= "OBJECT_INVALIDATING",  	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",  	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",  	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING", @@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {  	[FSCACHE_OBJECT_CREATING]	= "CRTN",  	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",  	[FSCACHE_OBJECT_ACTIVE]		= "ACTV", +	[FSCACHE_OBJECT_INVALIDATING]	= "INVL",  	[FSCACHE_OBJECT_UPDATING]	= "UPDT",  	[FSCACHE_OBJECT_DYING]		= "DYNG",  	[FSCACHE_OBJECT_LC_DYING]	= "LCDY", @@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);  static void fscache_initialise_object(struct fscache_object *);  static void fscache_lookup_object(struct fscache_object *);  static void fscache_object_available(struct fscache_object *); +static void fscache_invalidate_object(struct fscache_object *);  static 
void fscache_release_object(struct fscache_object *);  static void fscache_withdraw_object(struct fscache_object *);  static void fscache_enqueue_dependents(struct fscache_object *); @@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)  }  /* + * Notify netfs of invalidation completion. + */ +static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) +{ +	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) +		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); +} + +/*   * process events that have been sent to an object's state machine   * - initiates parent lookup   * - does object lookup @@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)  {  	enum fscache_object_state new_state;  	struct fscache_cookie *cookie; +	int event;  	ASSERT(object != NULL); @@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)  		/* wait for the parent object to become ready */  	case FSCACHE_OBJECT_INIT:  		object->event_mask = -			ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); +			FSCACHE_OBJECT_EVENTS_MASK & +			~(1 << FSCACHE_OBJECT_EV_CLEARED);  		fscache_initialise_object(object);  		goto done; @@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)  	case FSCACHE_OBJECT_ACTIVE:  		goto active_transit; +		/* Invalidate an object on disk */ +	case FSCACHE_OBJECT_INVALIDATING: +		clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events); +		fscache_stat(&fscache_n_invalidates_run); +		fscache_stat(&fscache_n_cop_invalidate_object); +		fscache_invalidate_object(object); +		fscache_stat_d(&fscache_n_cop_invalidate_object); +		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); +		goto active_transit; +  		/* update the object metadata on disk */  	case FSCACHE_OBJECT_UPDATING:  		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); @@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)  	/* determine the transition from a lookup state */  lookup_transit: -	switch (fls(object->events & object->event_mask) - 1) { +	event = fls(object->events & object->event_mask) - 1; +	switch (event) {  	case FSCACHE_OBJECT_EV_WITHDRAW:  	case FSCACHE_OBJECT_EV_RETIRE:  	case FSCACHE_OBJECT_EV_RELEASE:  	case FSCACHE_OBJECT_EV_ERROR:  		new_state = FSCACHE_OBJECT_LC_DYING;  		goto change_state; +	case FSCACHE_OBJECT_EV_INVALIDATE: +		new_state = FSCACHE_OBJECT_INVALIDATING; +		goto change_state;  	case FSCACHE_OBJECT_EV_REQUEUE:  		goto done;  	case -1: @@ -268,13 +297,17 @@ lookup_transit:  	/* determine the transition from an active state */  active_transit: -	switch (fls(object->events & object->event_mask) - 1) { +	event = fls(object->events & object->event_mask) - 1; +	switch (event) {  	case FSCACHE_OBJECT_EV_WITHDRAW:  	case FSCACHE_OBJECT_EV_RETIRE:  	case FSCACHE_OBJECT_EV_RELEASE:  	case FSCACHE_OBJECT_EV_ERROR:  		new_state = FSCACHE_OBJECT_DYING;  		goto change_state; +	case FSCACHE_OBJECT_EV_INVALIDATE: +		new_state = FSCACHE_OBJECT_INVALIDATING; +		goto change_state;  	case FSCACHE_OBJECT_EV_UPDATE:  		new_state = FSCACHE_OBJECT_UPDATING;  		goto change_state; @@ -287,7 +320,8 @@ active_transit:  	/* determine the transition from a terminal state */  terminal_transit: -	switch (fls(object->events & object->event_mask) - 1) { +	event = fls(object->events & object->event_mask) - 1; +	switch (event) {  	case FSCACHE_OBJECT_EV_WITHDRAW:  		new_state = FSCACHE_OBJECT_WITHDRAWING;  		
goto change_state; @@ -320,8 +354,8 @@ done:  unsupported_event:  	printk(KERN_ERR "FS-Cache:" -	       " Unsupported event %lx [mask %lx] in state %s\n", -	       object->events, object->event_mask, +	       " Unsupported event %d [%lx/%lx] in state %s\n", +	       event, object->events, object->event_mask,  	       fscache_object_states[object->state]);  	BUG();  } @@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)  	if (object->n_in_progress == 0) {  		if (object->n_ops > 0) {  			ASSERTCMP(object->n_ops, >=, object->n_obj_ops); -			ASSERTIF(object->n_ops > object->n_obj_ops, -				 !list_empty(&object->pending_ops));  			fscache_start_operations(object);  		} else {  			ASSERT(list_empty(&object->pending_ops)); @@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)  		if (object->cookie == cookie) {  			hlist_del_init(&object->cookie_link);  			object->cookie = NULL; +			fscache_invalidation_complete(cookie);  			detached = true;  		}  		spin_unlock(&cookie->lock); @@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,  	return result;  }  EXPORT_SYMBOL(fscache_check_aux); + +/* + * Asynchronously invalidate an object. + */ +static void fscache_invalidate_object(struct fscache_object *object) +{ +	struct fscache_operation *op; +	struct fscache_cookie *cookie = object->cookie; + +	_enter("{OBJ%x}", object->debug_id); + +	/* Reject any new read/write ops and abort any that are pending. */ +	fscache_invalidate_writes(cookie); +	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); +	fscache_cancel_all_ops(object); + +	/* Now we have to wait for in-progress reads and writes */ +	op = kzalloc(sizeof(*op), GFP_KERNEL); +	if (!op) { +		fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +		_leave(" [ENOMEM]"); +		return; +	} + +	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); +	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + +	spin_lock(&cookie->lock); +	if (fscache_submit_exclusive_op(object, op) < 0) +		goto submit_op_failed; +	spin_unlock(&cookie->lock); +	fscache_put_operation(op); + +	/* Once we've completed the invalidation, we know there will be no data +	 * stored in the cache and thus we can reinstate the data-check-skip +	 * optimisation. +	 */ +	set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + +	/* We can allow read and write requests to come in once again.  They'll +	 * queue up behind our exclusive invalidation operation. 
+	 */ +	fscache_invalidation_complete(cookie); +	_leave(""); +	return; + +submit_op_failed: +	spin_unlock(&cookie->lock); +	kfree(op); +	fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +	_leave(" [EIO]"); +} diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 30afdfa7aec..762a9ec4ffa 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)  	ASSERT(op->processor != NULL);  	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);  	ASSERTCMP(atomic_read(&op->usage), >, 0); +	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);  	fscache_stat(&fscache_n_op_enqueue);  	switch (op->flags & FSCACHE_OP_TYPE) { @@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);  static void fscache_run_op(struct fscache_object *object,  			   struct fscache_operation *op)  { +	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + +	op->state = FSCACHE_OP_ST_IN_PROGRESS;  	object->n_in_progress++;  	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))  		wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,  	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); +	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); +	ASSERTCMP(atomic_read(&op->usage), >, 0); +  	spin_lock(&object->lock);  	ASSERTCMP(object->n_ops, >=, object->n_in_progress);  	ASSERTCMP(object->n_ops, >=, object->n_exclusive);  	ASSERT(list_empty(&op->pend_link)); -	ret = -ENOBUFS; +	op->state = FSCACHE_OP_ST_PENDING;  	if (fscache_object_is_active(object)) {  		op->object = object;  		object->n_ops++;  		object->n_exclusive++;	/* reads and writes must wait */ -		if (object->n_ops > 1) { +		if (object->n_in_progress > 0) {  			atomic_inc(&op->usage);  			list_add_tail(&op->pend_link, &object->pending_ops);  			fscache_stat(&fscache_n_op_pend); @@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,  		fscache_stat(&fscache_n_op_pend);  		ret = 0;  	} else { -		/* not allowed to submit ops in any other state */ -		BUG(); +		/* If we're in any other state, there must have been an I/O +		 * error of some nature. 
+		 */ +		ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); +		ret = -EIO;  	}  	spin_unlock(&object->lock); @@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,  	_enter("{OBJ%x OP%x},{%u}",  	       object->debug_id, op->debug_id, atomic_read(&op->usage)); +	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);  	ASSERTCMP(atomic_read(&op->usage), >, 0);  	spin_lock(&object->lock); @@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,  	ostate = object->state;  	smp_rmb(); +	op->state = FSCACHE_OP_ST_PENDING;  	if (fscache_object_is_active(object)) {  		op->object = object;  		object->n_ops++; @@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,  		   object->state == FSCACHE_OBJECT_LC_DYING ||  		   object->state == FSCACHE_OBJECT_WITHDRAWING) {  		fscache_stat(&fscache_n_op_rejected); +		op->state = FSCACHE_OP_ST_CANCELLED;  		ret = -ENOBUFS;  	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {  		fscache_report_unexpected_submission(object, op, ostate);  		ASSERT(!fscache_object_is_active(object)); +		op->state = FSCACHE_OP_ST_CANCELLED;  		ret = -ENOBUFS;  	} else { +		op->state = FSCACHE_OP_ST_CANCELLED;  		ret = -ENOBUFS;  	} @@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)  /*   * cancel an operation that's pending on an object   */ -int fscache_cancel_op(struct fscache_operation *op) +int fscache_cancel_op(struct fscache_operation *op, +		      void (*do_cancel)(struct fscache_operation *))  {  	struct fscache_object *object = op->object;  	int ret;  	_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); +	ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); +	ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); +	ASSERTCMP(atomic_read(&op->usage), >, 0); +  	spin_lock(&object->lock);  	ret = -EBUSY; -	if (!list_empty(&op->pend_link)) { +	if (op->state == FSCACHE_OP_ST_PENDING) { +		ASSERT(!list_empty(&op->pend_link));  		fscache_stat(&fscache_n_op_cancelled);  		list_del_init(&op->pend_link); -		object->n_ops--; +		if (do_cancel) +			do_cancel(op); +		op->state = FSCACHE_OP_ST_CANCELLED;  		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))  			object->n_exclusive--;  		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) @@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)  }  /* + * Cancel all pending operations on an object + */ +void fscache_cancel_all_ops(struct fscache_object *object) +{ +	struct fscache_operation *op; + +	_enter("OBJ%x", object->debug_id); + +	spin_lock(&object->lock); + +	while (!list_empty(&object->pending_ops)) { +		op = list_entry(object->pending_ops.next, +				struct fscache_operation, pend_link); +		fscache_stat(&fscache_n_op_cancelled); +		list_del_init(&op->pend_link); + +		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); +		op->state = FSCACHE_OP_ST_CANCELLED; + +		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) +			object->n_exclusive--; +		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) +			wake_up_bit(&op->flags, FSCACHE_OP_WAITING); +		fscache_put_operation(op); +		cond_resched_lock(&object->lock); +	} + +	spin_unlock(&object->lock); +	_leave(""); +} + +/* + * Record the completion or cancellation of an in-progress operation. 
+ */ +void fscache_op_complete(struct fscache_operation *op, bool cancelled) +{ +	struct fscache_object *object = op->object; + +	_enter("OBJ%x", object->debug_id); + +	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); +	ASSERTCMP(object->n_in_progress, >, 0); +	ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), +		    object->n_exclusive, >, 0); +	ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), +		    object->n_in_progress, ==, 1); + +	spin_lock(&object->lock); + +	op->state = cancelled ? +		FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + +	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) +		object->n_exclusive--; +	object->n_in_progress--; +	if (object->n_in_progress == 0) +		fscache_start_operations(object); + +	spin_unlock(&object->lock); +	_leave(""); +} +EXPORT_SYMBOL(fscache_op_complete); + +/*   * release an operation   * - queues pending ops if this is the last in-progress op   */ @@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)  		return;  	_debug("PUT OP"); -	if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) -		BUG(); +	ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, +		    op->state, ==, FSCACHE_OP_ST_CANCELLED); +	op->state = FSCACHE_OP_ST_DEAD;  	fscache_stat(&fscache_n_op_release); @@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)  	object = op->object; -	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) -		atomic_dec(&object->n_reads); +	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { +		if (atomic_dec_and_test(&object->n_reads)) { +			clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, +				  &object->cookie->flags); +			wake_up_bit(&object->cookie->flags, +				    FSCACHE_COOKIE_WAITING_ON_READS); +		} +	}  	/* now... we may get called with the object spinlock held, so we  	 * complete the cleanup here only if we can immediately acquire the @@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)  		return;  	} -	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { -		ASSERTCMP(object->n_exclusive, >, 0); -		object->n_exclusive--; -	} - -	ASSERTCMP(object->n_in_progress, >, 0); -	object->n_in_progress--; -	if (object->n_in_progress == 0) -		fscache_start_operations(object); -  	ASSERTCMP(object->n_ops, >, 0);  	object->n_ops--;  	if (object->n_ops == 0) @@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)  		spin_unlock(&cache->op_gc_list_lock);  		object = op->object; +		spin_lock(&object->lock);  		_debug("GC DEFERRED REL OBJ%x OP%x",  		       object->debug_id, op->debug_id);  		fscache_stat(&fscache_n_op_gc);  		ASSERTCMP(atomic_read(&op->usage), ==, 0); - -		spin_lock(&object->lock); -		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { -			ASSERTCMP(object->n_exclusive, >, 0); -			object->n_exclusive--; -		} - -		ASSERTCMP(object->n_in_progress, >, 0); -		object->n_in_progress--; -		if (object->n_in_progress == 0) -			fscache_start_operations(object); +		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);  		ASSERTCMP(object->n_ops, >, 0);  		object->n_ops--; @@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)  			fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);  		spin_unlock(&object->lock); +		kfree(op);  	} while (count++ < 20); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3f7a59bfa7a..ff000e52072 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,  	_enter("%p,%p,%x", cookie, page, gfp); +try_again:  	rcu_read_lock();  	val = 
radix_tree_lookup(&cookie->stores, page->index);  	if (!val) { @@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,  	return true;  page_busy: -	/* we might want to wait here, but that could deadlock the allocator as -	 * the work threads writing to the cache may all end up sleeping -	 * on memory allocation */ -	fscache_stat(&fscache_n_store_vmscan_busy); -	return false; +	/* We will wait here if we're allowed to, but that could deadlock the +	 * allocator as the work threads writing to the cache may all end up +	 * sleeping on memory allocation, so we may need to impose a timeout +	 * too. */ +	if (!(gfp & __GFP_WAIT)) { +		fscache_stat(&fscache_n_store_vmscan_busy); +		return false; +	} + +	fscache_stat(&fscache_n_store_vmscan_wait); +	__fscache_wait_on_page_write(cookie, page); +	gfp &= ~__GFP_WAIT; +	goto try_again;  }  EXPORT_SYMBOL(__fscache_maybe_release_page); @@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)  			fscache_abort_object(object);  	} +	fscache_op_complete(op, true);  	_leave("");  } @@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)  	_enter("{OP%x}", op->op.debug_id); +	ASSERTCMP(op->n_pages, ==, 0); +  	fscache_hist(fscache_retrieval_histogram, op->start_time);  	if (op->context)  		fscache_put_context(op->op.object->cookie, op->context); @@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)  }  /* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ +	struct fscache_retrieval *op = +		container_of(_op, struct fscache_retrieval, op); + +	op->n_pages = 0; +} + +/*   * wait for an object to become active (or dead)   */  static int fscache_wait_for_retrieval_activation(struct fscache_object *object, @@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,  	fscache_stat(stat_op_waits);  	if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,  			fscache_wait_bit_interruptible, -			TASK_INTERRUPTIBLE) < 0) { -		ret = fscache_cancel_op(&op->op); +			TASK_INTERRUPTIBLE) != 0) { +		ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);  		if (ret == 0)  			return -ERESTARTSYS; @@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,  	_debug("<<< GO");  check_if_dead: +	if (op->op.state == FSCACHE_OP_ST_CANCELLED) { +		fscache_stat(stat_object_dead); +		_leave(" = -ENOBUFS [cancelled]"); +		return -ENOBUFS; +	}  	if (unlikely(fscache_object_is_dead(object))) { +		pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state); +		fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);  		fscache_stat(stat_object_dead);  		return -ENOBUFS;  	} @@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,  	if (hlist_empty(&cookie->backing_objects))  		goto nobufs; +	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { +		_leave(" = -ENOBUFS [invalidating]"); +		return -ENOBUFS; +	} +  	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);  	ASSERTCMP(page, !=, NULL); @@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,  		_leave(" = -ENOMEM");  		return -ENOMEM;  	} +	op->n_pages = 1;  	spin_lock(&cookie->lock); @@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,  	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);  	atomic_inc(&object->n_reads); -	
set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); +	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);  	if (fscache_submit_op(object, &op->op) < 0) -		goto nobufs_unlock; +		goto nobufs_unlock_dec;  	spin_unlock(&cookie->lock);  	fscache_stat(&fscache_n_retrieval_ops); @@ -425,6 +461,8 @@ error:  	_leave(" = %d", ret);  	return ret; +nobufs_unlock_dec: +	atomic_dec(&object->n_reads);  nobufs_unlock:  	spin_unlock(&cookie->lock);  	kfree(op); @@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,  	if (hlist_empty(&cookie->backing_objects))  		goto nobufs; +	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { +		_leave(" = -ENOBUFS [invalidating]"); +		return -ENOBUFS; +	} +  	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);  	ASSERTCMP(*nr_pages, >, 0);  	ASSERT(!list_empty(pages)); @@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,  	op = fscache_alloc_retrieval(mapping, end_io_func, context);  	if (!op)  		return -ENOMEM; +	op->n_pages = *nr_pages;  	spin_lock(&cookie->lock); @@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,  			     struct fscache_object, cookie_link);  	atomic_inc(&object->n_reads); -	set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); +	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);  	if (fscache_submit_op(object, &op->op) < 0) -		goto nobufs_unlock; +		goto nobufs_unlock_dec;  	spin_unlock(&cookie->lock);  	fscache_stat(&fscache_n_retrieval_ops); @@ -541,6 +585,8 @@ error:  	_leave(" = %d", ret);  	return ret; +nobufs_unlock_dec: +	atomic_dec(&object->n_reads);  nobufs_unlock:  	spin_unlock(&cookie->lock);  	kfree(op); @@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,  	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);  	ASSERTCMP(page, !=, NULL); +	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { +		_leave(" = -ENOBUFS [invalidating]"); +		return -ENOBUFS; +	} +  	if (fscache_wait_for_deferred_lookup(cookie) < 0)  		return -ERESTARTSYS;  	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);  	if (!op)  		return -ENOMEM; +	op->n_pages = 1;  	spin_lock(&cookie->lock); @@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)  	spin_lock(&object->lock);  	cookie = object->cookie; -	if (!fscache_object_is_active(object) || !cookie) { +	if (!fscache_object_is_active(object)) { +		/* If we get here, then the on-disk cache object likely no longer +		 * exists, so we should just cancel this write operation. +		 */ +		spin_unlock(&object->lock); +		fscache_op_complete(&op->op, false); +		_leave(" [inactive]"); +		return; +	} + +	if (!cookie) { +		/* If we get here, then the cookie belonging to the object was +		 * detached, probably by the cookie being withdrawn due to +		 * memory pressure, which means that the pages from which we might +		 * write to the cache no longer exist - therefore, we can just +		 * cancel this write operation. 
+		 */  		spin_unlock(&object->lock); -		_leave(""); +		fscache_op_complete(&op->op, false); +		_leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", +		       _op->flags, _op->state, object->state, object->flags);  		return;  	} @@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)  	fscache_end_page_write(object, page);  	if (ret < 0) {  		fscache_abort_object(object); +		fscache_op_complete(&op->op, true);  	} else {  		fscache_enqueue_operation(&op->op);  	} @@ -710,6 +781,38 @@ superseded:  	spin_unlock(&cookie->stores_lock);  	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);  	spin_unlock(&object->lock); +	fscache_op_complete(&op->op, true); +	_leave(""); +} + +/* + * Clear the pages pending writing for invalidation + */ +void fscache_invalidate_writes(struct fscache_cookie *cookie) +{ +	struct page *page; +	void *results[16]; +	int n, i; + +	_enter(""); + +	while (spin_lock(&cookie->stores_lock), +	       n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, +					      ARRAY_SIZE(results), +					      FSCACHE_COOKIE_PENDING_TAG), +	       n > 0) { +		for (i = n - 1; i >= 0; i--) { +			page = results[i]; +			radix_tree_delete(&cookie->stores, page->index); +		} + +		spin_unlock(&cookie->stores_lock); + +		for (i = n - 1; i >= 0; i--) +			page_cache_release(results[i]); +	} + +	spin_unlock(&cookie->stores_lock);  	_leave("");  } @@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,  	fscache_stat(&fscache_n_stores); -	op = kzalloc(sizeof(*op), GFP_NOIO); +	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { +		_leave(" = -ENOBUFS [invalidating]"); +		return -ENOBUFS; +	} + +	op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);  	if (!op)  		goto nomem; @@ -915,6 +1023,40 @@ done:  EXPORT_SYMBOL(__fscache_uncache_page);  /** + * fscache_mark_page_cached - Mark a page as being cached + * @op: The retrieval op pages are being marked for + * @page: The page to be marked + * + * Mark a netfs page as being cached.  After this is called, the netfs + * must call fscache_uncache_page() to remove the mark. 
+ */ +void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) +{ +	struct fscache_cookie *cookie = op->op.object->cookie; + +#ifdef CONFIG_FSCACHE_STATS +	atomic_inc(&fscache_n_marks); +#endif + +	_debug("- mark %p{%lx}", page, page->index); +	if (TestSetPageFsCache(page)) { +		static bool once_only; +		if (!once_only) { +			once_only = true; +			printk(KERN_WARNING "FS-Cache:" +			       " Cookie type %s marked page %lx" +			       " multiple times\n", +			       cookie->def->name, page->index); +		} +	} + +	if (cookie->def->mark_page_cached) +		cookie->def->mark_page_cached(cookie->netfs_data, +					      op->mapping, page); +} +EXPORT_SYMBOL(fscache_mark_page_cached); + +/**   * fscache_mark_pages_cached - Mark pages as being cached   * @op: The retrieval op pages are being marked for   * @pagevec: The pages to be marked @@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);  void fscache_mark_pages_cached(struct fscache_retrieval *op,  			       struct pagevec *pagevec)  { -	struct fscache_cookie *cookie = op->op.object->cookie;  	unsigned long loop; -#ifdef CONFIG_FSCACHE_STATS -	atomic_add(pagevec->nr, &fscache_n_marks); -#endif - -	for (loop = 0; loop < pagevec->nr; loop++) { -		struct page *page = pagevec->pages[loop]; - -		_debug("- mark %p{%lx}", page, page->index); -		if (TestSetPageFsCache(page)) { -			static bool once_only; -			if (!once_only) { -				once_only = true; -				printk(KERN_WARNING "FS-Cache:" -				       " Cookie type %s marked page %lx" -				       " multiple times\n", -				       cookie->def->name, page->index); -			} -		} -	} +	for (loop = 0; loop < pagevec->nr; loop++) +		fscache_mark_page_cached(op, pagevec->pages[loop]); -	if (cookie->def->mark_pages_cached) -		cookie->def->mark_pages_cached(cookie->netfs_data, -					       op->mapping, pagevec);  	pagevec_reinit(pagevec);  }  EXPORT_SYMBOL(fscache_mark_pages_cached); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 4765190d537..8179e8bc4a3 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;  atomic_t fscache_n_store_vmscan_gone;  atomic_t fscache_n_store_vmscan_busy;  atomic_t fscache_n_store_vmscan_cancelled; +atomic_t fscache_n_store_vmscan_wait;  atomic_t fscache_n_marks;  atomic_t fscache_n_uncaches; @@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;  atomic_t fscache_n_acquires_nobufs;  atomic_t fscache_n_acquires_oom; +atomic_t fscache_n_invalidates; +atomic_t fscache_n_invalidates_run; +  atomic_t fscache_n_updates;  atomic_t fscache_n_updates_null;  atomic_t fscache_n_updates_run; @@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;  atomic_t fscache_n_cop_lookup_object;  atomic_t fscache_n_cop_lookup_complete;  atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_invalidate_object;  atomic_t fscache_n_cop_update_object;  atomic_t fscache_n_cop_drop_object;  atomic_t fscache_n_cop_put_object; @@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)  		   atomic_read(&fscache_n_object_created),  		   atomic_read(&fscache_n_object_lookups_timed_out)); +	seq_printf(m, "Invals : n=%u run=%u\n", +		   atomic_read(&fscache_n_invalidates), +		   atomic_read(&fscache_n_invalidates_run)); +  	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",  		   atomic_read(&fscache_n_updates),  		   atomic_read(&fscache_n_updates_null), @@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)  		   atomic_read(&fscache_n_store_radix_deletes),  		   
atomic_read(&fscache_n_store_pages_over_limit)); -	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", +	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",  		   atomic_read(&fscache_n_store_vmscan_not_storing),  		   atomic_read(&fscache_n_store_vmscan_gone),  		   atomic_read(&fscache_n_store_vmscan_busy), -		   atomic_read(&fscache_n_store_vmscan_cancelled)); +		   atomic_read(&fscache_n_store_vmscan_cancelled), +		   atomic_read(&fscache_n_store_vmscan_wait));  	seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",  		   atomic_read(&fscache_n_op_pend), @@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)  		   atomic_read(&fscache_n_cop_lookup_object),  		   atomic_read(&fscache_n_cop_lookup_complete),  		   atomic_read(&fscache_n_cop_grab_object)); -	seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", +	seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n", +		   atomic_read(&fscache_n_cop_invalidate_object),  		   atomic_read(&fscache_n_cop_update_object),  		   atomic_read(&fscache_n_cop_drop_object),  		   atomic_read(&fscache_n_cop_put_object), diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94ed..1b2f6c2c3aa 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS  	  With FUSE it is possible to implement a fully functional filesystem  	  in a userspace program. -	  There's also companion library: libfuse.  This library along with -	  utilities is available from the FUSE homepage: +	  There's also a companion library: libfuse2.  This library is available +	  from the FUSE homepage:  	  <http://fuse.sourceforge.net/> +	  although chances are your distribution already has that library +	  installed if you've installed the "fuse" package itself.  	  See <file:Documentation/filesystems/fuse.txt> for more information.  	  See <file:Documentation/Changes> for needed library/utility version.  	  If you want to develop a userspace FS, or if you want to use  	  a filesystem based on FUSE, answer Y or M. + +config CUSE +	tristate "Character device in Userspace support" +	depends on FUSE_FS +	help +	  This FUSE extension allows character devices to be +	  implemented in userspace. + +	  If you want to develop or use a userspace character device +	  based on CUSE, answer Y or M. 
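
For context on what the CUSE option above enables: the kernel side only relays requests, while the character device's behaviour lives in a userspace program written against libfuse's cuse_lowlevel API. Below is a minimal sketch of such a driver, modelled on libfuse 2's cusexmp example; the header name, FUSE_USE_VERSION value and callback signatures are taken from that library and may differ between libfuse releases, and the device name "echo" is invented for illustration.

#define FUSE_USE_VERSION 29
#include <cuse_lowlevel.h>
#include <string.h>

static const char msg[] = "hello from userspace\n";

/* Answer open() on the device node. */
static void echo_open(fuse_req_t req, struct fuse_file_info *fi)
{
	fuse_reply_open(req, fi);
}

/* Answer read() by serving a slice of the static message. */
static void echo_read(fuse_req_t req, size_t size, off_t off,
		      struct fuse_file_info *fi)
{
	(void)fi;
	if ((size_t)off >= sizeof(msg))
		size = 0;			/* EOF */
	else if (size > sizeof(msg) - off)
		size = sizeof(msg) - off;
	fuse_reply_buf(req, msg + off, size);
}

static const struct cuse_lowlevel_ops echo_clop = {
	.open = echo_open,
	.read = echo_read,
};

int main(int argc, char **argv)
{
	/* DEVNAME= tells the kernel which /dev node to create. */
	const char *dev_info_argv[] = { "DEVNAME=echo" };
	struct cuse_info ci;

	memset(&ci, 0, sizeof(ci));
	ci.dev_info_argc = 1;
	ci.dev_info_argv = dev_info_argv;

	return cuse_lowlevel_main(argc, argv, &ci, &echo_clop, NULL);
}

Built against libfuse (pkg-config fuse) and run with sufficient privileges, the process connects through /dev/cuse, the kernel creates /dev/echo, and reads on that node are then serviced by echo_read() above.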
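
Stepping back to the fscache side of this series: the __fscache_invalidate()/__fscache_wait_on_invalidate() entry points added above are meant to be driven by a netfs through the cookie-validity-checking inline wrappers in include/linux/fscache.h. A rough sketch of the intended call pattern follows; "struct example_inode" and its ->fscache member are invented here for illustration.

#include <linux/fscache.h>

/* Hypothetical netfs inode wrapper holding a cache cookie. */
struct example_inode {
	struct inode		vfs_inode;
	struct fscache_cookie	*fscache;
};

/* Called when the server reports that the file changed under us. */
static void example_invalidate_cache(struct example_inode *ei)
{
	if (!ei->fscache)
		return;

	/* Raises FSCACHE_OBJECT_EV_INVALIDATE on the backing object;
	 * while FSCACHE_COOKIE_INVALIDATING is set, new reads and
	 * writes are refused with -ENOBUFS (see the page.c hunks).
	 */
	fscache_invalidate(ei->fscache);

	/* Optionally block until the cache object has been emptied. */
	fscache_wait_on_invalidate(ei->fscache);
}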
diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 75a20c092dd..b7978b9f75e 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)  {  	struct fuse_conn *fc;  	mutex_lock(&fuse_mutex); -	fc = file->f_path.dentry->d_inode->i_private; +	fc = file_inode(file)->i_private;  	if (fc)  		fc = fuse_conn_get(fc);  	mutex_unlock(&fuse_mutex); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d5504229..6f96a8def14 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@  #include <linux/miscdevice.h>  #include <linux/mutex.h>  #include <linux/slab.h> -#include <linux/spinlock.h>  #include <linux/stat.h>  #include <linux/module.h> @@ -63,7 +62,7 @@ struct cuse_conn {  	bool			unrestricted_ioctl;  }; -static DEFINE_SPINLOCK(cuse_lock);		/* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock);		/* protects registration */  static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];  static struct class *cuse_class; @@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,  			 loff_t *ppos)  {  	loff_t pos = 0; +	struct iovec iov = { .iov_base = buf, .iov_len = count }; -	return fuse_direct_io(file, buf, count, &pos, 0); +	return fuse_direct_io(file, &iov, 1, count, &pos, 0);  }  static ssize_t cuse_write(struct file *file, const char __user *buf,  			  size_t count, loff_t *ppos)  {  	loff_t pos = 0; +	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; +  	/*  	 * No locking or generic_write_checks(), the server is  	 * responsible for locking and sanity checks.  	 */ -	return fuse_direct_io(file, buf, count, &pos, 1); +	return fuse_direct_io(file, &iov, 1, count, &pos, 1);  }  static int cuse_open(struct inode *inode, struct file *file) @@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file)  	int rc;  	/* look up and get the connection */ -	spin_lock(&cuse_lock); +	mutex_lock(&cuse_lock);  	list_for_each_entry(pos, cuse_conntbl_head(devt), list)  		if (pos->dev->devt == devt) {  			fuse_conn_get(&pos->fc);  			cc = pos;  			break;  		} -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* dead? 
*/  	if (!cc) @@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)  static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)  {  	char *end = p + len; -	char *key, *val; +	char *uninitialized_var(key), *uninitialized_var(val);  	int rc;  	while (true) { @@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev)   */  static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  { -	struct cuse_conn *cc = fc_to_cc(fc); +	struct cuse_conn *cc = fc_to_cc(fc), *pos;  	struct cuse_init_out *arg = req->out.args[0].value;  	struct page *page = req->pages[0];  	struct cuse_devinfo devinfo = { };  	struct device *dev;  	struct cdev *cdev;  	dev_t devt; -	int rc; +	int rc, i;  	if (req->out.h.error ||  	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  	dev_set_drvdata(dev, cc);  	dev_set_name(dev, "%s", devinfo.name); +	mutex_lock(&cuse_lock); + +	/* make sure the device-name is unique */ +	for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { +		list_for_each_entry(pos, &cuse_conntbl[i], list) +			if (!strcmp(dev_name(pos->dev), dev_name(dev))) +				goto err_unlock; +	} +  	rc = device_add(dev);  	if (rc) -		goto err_device; +		goto err_unlock;  	/* register cdev */  	rc = -ENOMEM;  	cdev = cdev_alloc();  	if (!cdev) -		goto err_device; +		goto err_unlock;  	cdev->owner = THIS_MODULE;  	cdev->ops = &cuse_frontend_fops; @@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  	cc->cdev = cdev;  	/* make the device available */ -	spin_lock(&cuse_lock);  	list_add(&cc->list, cuse_conntbl_head(devt)); -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* announce device availability */  	dev_set_uevent_suppress(dev, 0); @@ -391,7 +401,8 @@ out:  err_cdev:  	cdev_del(cdev); -err_device: +err_unlock: +	mutex_unlock(&cuse_lock);  	put_device(dev);  err_region:  	unregister_chrdev_region(devt, 1); @@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc)  	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, 1);  	if (IS_ERR(req)) {  		rc = PTR_ERR(req);  		goto err; @@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc)  	req->out.argvar = 1;  	req->out.argpages = 1;  	req->pages[0] = page; +	req->page_descs[0].length = req->out.args[1].size;  	req->num_pages = 1;  	req->end = cuse_process_init_reply;  	fuse_request_send_background(fc, req); @@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file)  	int rc;  	/* remove from the conntbl, no more access from this point on */ -	spin_lock(&cuse_lock); +	mutex_lock(&cuse_lock);  	list_del_init(&cc->list); -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* remove device */  	if (cc->dev) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5..11dfa0c3fb4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file)  	return file->private_data;  } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, +			      struct fuse_page_desc *page_descs, +			      unsigned npages)  {  	memset(req, 0, sizeof(*req)); +	memset(pages, 0, sizeof(*pages) * npages); +	memset(page_descs, 0, sizeof(*page_descs) * npages);  	INIT_LIST_HEAD(&req->list);  	INIT_LIST_HEAD(&req->intr_entry);  
	init_waitqueue_head(&req->waitq);  	atomic_set(&req->count, 1); +	req->pages = pages; +	req->page_descs = page_descs; +	req->max_pages = npages;  } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)  { -	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); -	if (req) -		fuse_request_init(req); +	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); +	if (req) { +		struct page **pages; +		struct fuse_page_desc *page_descs; + +		if (npages <= FUSE_REQ_INLINE_PAGES) { +			pages = req->inline_pages; +			page_descs = req->inline_page_descs; +		} else { +			pages = kmalloc(sizeof(struct page *) * npages, flags); +			page_descs = kmalloc(sizeof(struct fuse_page_desc) * +					     npages, flags); +		} + +		if (!pages || !page_descs) { +			kfree(pages); +			kfree(page_descs); +			kmem_cache_free(fuse_req_cachep, req); +			return NULL; +		} + +		fuse_request_init(req, pages, page_descs, npages); +	}  	return req;  } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ +	return __fuse_request_alloc(npages, GFP_KERNEL); +}  EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages)  { -	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); -	if (req) -		fuse_request_init(req); -	return req; +	return __fuse_request_alloc(npages, GFP_NOFS);  }  void fuse_request_free(struct fuse_req *req)  { +	if (req->pages != req->inline_pages) { +		kfree(req->pages); +		kfree(req->page_descs); +	}  	kmem_cache_free(fuse_req_cachep, req);  } @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req)  	req->in.h.pid = current->pid;  } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)  {  	struct fuse_req *req;  	sigset_t oldset; @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)  	if (!fc->connected)  		goto out; -	req = fuse_request_alloc(); +	req = fuse_request_alloc(npages);  	err = -ENOMEM;  	if (!req)  		goto out; @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)  	struct fuse_file *ff = file->private_data;  	spin_lock(&fc->lock); -	fuse_request_init(req); +	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);  	BUG_ON(ff->reserved_req);  	ff->reserved_req = req;  	wake_up_all(&fc->reserved_req_waitq); @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)   * filesystem should not have its own file open.  If deadlock is   * intentional, it can still be broken by "aborting" the filesystem.   
*/ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, +					     struct file *file)  {  	struct fuse_req *req;  	atomic_inc(&fc->num_waiting);  	wait_event(fc->blocked_waitq, !fc->blocked); -	req = fuse_request_alloc(); +	req = fuse_request_alloc(0);  	if (!req)  		req = get_reserved_req(fc, file); @@ -406,9 +440,8 @@ __acquires(fc->lock)  	}  } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)  { -	req->isreply = 1;  	spin_lock(&fc->lock);  	if (!fc->connected)  		req->out.h.error = -ENOTCONN; @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)  	}  	spin_unlock(&fc->lock);  } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ +	req->isreply = 1; +	__fuse_request_send(fc, req); +}  EXPORT_SYMBOL_GPL(fuse_request_send);  static void fuse_request_send_nowait_locked(struct fuse_conn *fc, @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,  	fuse_request_send_nowait_locked(fc, req);  } +void fuse_force_forget(struct file *file, u64 nodeid) +{ +	struct inode *inode = file_inode(file); +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_req *req; +	struct fuse_forget_in inarg; + +	memset(&inarg, 0, sizeof(inarg)); +	inarg.nlookup = 1; +	req = fuse_get_req_nofail_nopages(fc, file); +	req->in.h.opcode = FUSE_FORGET; +	req->in.h.nodeid = nodeid; +	req->in.numargs = 1; +	req->in.args[0].size = sizeof(inarg); +	req->in.args[0].value = &inarg; +	req->isreply = 0; +	__fuse_request_send(fc, req); +	/* ignore errors */ +	fuse_put_request(fc, req); +} +  /*   * Lock the request.  Up to the next unlock_request() there mustn't be   * anything that could cause a page-fault.  
If the request was already @@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  	struct page *oldpage = *pagep;  	struct page *newpage;  	struct pipe_buffer *buf = cs->pipebufs; -	struct address_space *mapping; -	pgoff_t index;  	unlock_request(cs->fc, cs->req);  	fuse_copy_finish(cs); @@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  	if (fuse_check_page(newpage) != 0)  		goto out_fallback_unlock; -	mapping = oldpage->mapping; -	index = oldpage->index; -  	/*  	 * This is a new and locked page, it shouldn't be mapped or  	 * have any special flags on it @@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,  {  	unsigned i;  	struct fuse_req *req = cs->req; -	unsigned offset = req->page_offset; -	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);  	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {  		int err; +		unsigned offset = req->page_descs[i].offset; +		unsigned count = min(nbytes, req->page_descs[i].length);  		err = fuse_copy_page(cs, &req->pages[i], offset, count,  				     zeroing); @@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,  			return err;  		nbytes -= count; -		count = min(nbytes, (unsigned) PAGE_SIZE); -		offset = 0;  	}  	return 0;  } @@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,  	unsigned int num;  	unsigned int offset;  	size_t total_len = 0; +	int num_pages; + +	offset = outarg->offset & ~PAGE_CACHE_MASK; +	file_size = i_size_read(inode); + +	num = outarg->size; +	if (outarg->offset > file_size) +		num = 0; +	else if (outarg->offset + num > file_size) +		num = file_size - outarg->offset; -	req = fuse_get_req(fc); +	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; +	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + +	req = fuse_get_req(fc, num_pages);  	if (IS_ERR(req))  		return PTR_ERR(req); -	offset = outarg->offset & ~PAGE_CACHE_MASK; -  	req->in.h.opcode = FUSE_NOTIFY_REPLY;  	req->in.h.nodeid = outarg->nodeid;  	req->in.numargs = 2;  	req->in.argpages = 1; -	req->page_offset = offset; +	req->page_descs[0].offset = offset;  	req->end = fuse_retrieve_end;  	index = outarg->offset >> PAGE_CACHE_SHIFT; -	file_size = i_size_read(inode); -	num = outarg->size; -	if (outarg->offset > file_size) -		num = 0; -	else if (outarg->offset + num > file_size) -		num = file_size - outarg->offset; -	while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { +	while (num && req->num_pages < num_pages) {  		struct page *page;  		unsigned int this_num; @@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,  		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);  		req->pages[req->num_pages] = page; +		req->page_descs[req->num_pages].length = this_num;  		req->num_pages++;  		offset = 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7c09f9eb40..ff15522481d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,29 @@  #include <linux/namei.h>  #include <linux/slab.h> +static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +{ +	struct fuse_conn *fc = get_fuse_conn(dir); +	struct fuse_inode *fi = get_fuse_inode(dir); + +	if (!fc->do_readdirplus) +		return false; +	if (!fc->readdirplus_auto) +		return true; +	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) +		return true; +	if (filp->f_pos == 0) +		return true; +	return false; +} + +static void 
fuse_advise_use_readdirplus(struct inode *dir) +{ +	struct fuse_inode *fi = get_fuse_inode(dir); + +	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} +  #if BITS_PER_LONG >= 64  static inline void fuse_dentry_settime(struct dentry *entry, u64 time)  { @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)  			return -ECHILD;  		fc = get_fuse_conn(inode); -		req = fuse_get_req(fc); +		req = fuse_get_req_nopages(fc);  		if (IS_ERR(req))  			return 0; @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)  				       attr_version);  		fuse_change_entry_timeout(entry, &outarg);  	} +	fuse_advise_use_readdirplus(inode);  	return 1;  } @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,  	if (name->len > FUSE_NAME_MAX)  		goto out; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	err = PTR_ERR(req);  	if (IS_ERR(req))  		goto out; @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,  	else  		fuse_invalidate_entry_cache(entry); +	fuse_advise_use_readdirplus(dir);  	return newent;   out_iput: @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,  	if (!forget)  		goto out_err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	err = PTR_ERR(req);  	if (IS_ERR(req))  		goto out_put_forget_req; @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,  {  	struct fuse_mknod_in inarg;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)  {  	struct fuse_mkdir_in inarg;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,  {  	struct fuse_conn *fc = get_fuse_conn(dir);  	unsigned len = strlen(link) + 1; -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)  {  	int err;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)  		spin_lock(&fc->lock);  		fi->attr_version = ++fc->attr_version; -		drop_nlink(inode); +		/* +		 * If i_nlink == 0 then unlink doesn't make sense, yet this can +		 * happen if userspace filesystem is careless.  
It would be +		 * difficult to enforce correct nlink usage so just ignore this +		 * condition here +		 */ +		if (inode->i_nlink > 0) +			drop_nlink(inode);  		spin_unlock(&fc->lock);  		fuse_invalidate_attr(inode);  		fuse_invalidate_attr(dir); @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)  {  	int err;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,  	int err;  	struct fuse_rename_in inarg;  	struct fuse_conn *fc = get_fuse_conn(olddir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,  	struct fuse_link_in inarg;  	struct inode *inode = entry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,  	struct fuse_req *req;  	u64 attr_version; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,  /*   * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process.  This + * daemon ptrace-like capabilities over the current process.  This   * means, that the filesystem daemon is able to record the exact   * filesystem operations performed, and can also control the behavior   * of the requester process in otherwise impossible ways.  For example @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,   * for which the owner of the mount has ptrace privilege.  This   * excludes processes started by other users, suid or sgid processes.   
*/ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc)  {  	const struct cred *cred; -	int ret;  	if (fc->flags & FUSE_ALLOW_OTHER)  		return 1; -	rcu_read_lock(); -	ret = 0; -	cred = __task_cred(task); +	cred = current_cred();  	if (uid_eq(cred->euid, fc->user_id) &&  	    uid_eq(cred->suid, fc->user_id) &&  	    uid_eq(cred->uid,  fc->user_id) &&  	    gid_eq(cred->egid, fc->group_id) &&  	    gid_eq(cred->sgid, fc->group_id) &&  	    gid_eq(cred->gid,  fc->group_id)) -		ret = 1; -	rcu_read_unlock(); +		return 1; -	return ret; +	return 0;  }  static int fuse_access(struct inode *inode, int mask) @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask)  	if (fc->no_access)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask)  	bool refreshed = false;  	int err = 0; -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	/* @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,  	return 0;  } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, +				struct fuse_direntplus *direntplus, +				u64 attr_version)  {  	int err; +	struct fuse_entry_out *o = &direntplus->entry_out; +	struct fuse_dirent *dirent = &direntplus->dirent; +	struct dentry *parent = file->f_path.dentry; +	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); +	struct dentry *dentry; +	struct dentry *alias; +	struct inode *dir = parent->d_inode; +	struct fuse_conn *fc; +	struct inode *inode; + +	if (!o->nodeid) { +		/* +		 * Unlike in the case of fuse_lookup, zero nodeid does not mean +		 * ENOENT. Instead, it only means the userspace filesystem did +		 * not want to return attributes/handle for this entry. +		 * +		 * So do nothing. +		 */ +		return 0; +	} + +	if (name.name[0] == '.') { +		/* +		 * We could potentially refresh the attributes of the directory +		 * and its parent? +		 */ +		if (name.len == 1) +			return 0; +		if (name.name[1] == '.' 
&& name.len == 2) +			return 0; +	} +	fc = get_fuse_conn(dir); + +	name.hash = full_name_hash(name.name, name.len); +	dentry = d_lookup(parent, &name); +	if (dentry && dentry->d_inode) { +		inode = dentry->d_inode; +		if (get_node_id(inode) == o->nodeid) { +			struct fuse_inode *fi; +			fi = get_fuse_inode(inode); +			spin_lock(&fc->lock); +			fi->nlookup++; +			spin_unlock(&fc->lock); + +			/* +			 * The other branch to 'found' comes via fuse_iget() +			 * which bumps nlookup inside +			 */ +			goto found; +		} +		err = d_invalidate(dentry); +		if (err) +			goto out; +		dput(dentry); +		dentry = NULL; +	} + +	dentry = d_alloc(parent, &name); +	err = -ENOMEM; +	if (!dentry) +		goto out; + +	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, +			  &o->attr, entry_attr_timeout(o), attr_version); +	if (!inode) +		goto out; + +	alias = d_materialise_unique(dentry, inode); +	err = PTR_ERR(alias); +	if (IS_ERR(alias)) +		goto out; +	if (alias) { +		dput(dentry); +		dentry = alias; +	} + +found: +	fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), +			       attr_version); + +	fuse_change_entry_timeout(dentry, o); + +	err = 0; +out: +	if (dentry) +		dput(dentry); +	return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, +			     void *dstbuf, filldir_t filldir, u64 attr_version) +{ +	struct fuse_direntplus *direntplus; +	struct fuse_dirent *dirent; +	size_t reclen; +	int over = 0; +	int ret; + +	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { +		direntplus = (struct fuse_direntplus *) buf; +		dirent = &direntplus->dirent; +		reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + +		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) +			return -EIO; +		if (reclen > nbytes) +			break; + +		if (!over) { +			/* We fill entries into dstbuf only as much as +			   it can hold. But we still continue iterating +			   over remaining entries to link them. If not, +			   we need to send a FORGET for each of those +			   which we did not link. 
+			*/ +			over = filldir(dstbuf, dirent->name, dirent->namelen, +				       file->f_pos, dirent->ino, +				       dirent->type); +			file->f_pos = dirent->off; +		} + +		buf += reclen; +		nbytes -= reclen; + +		ret = fuse_direntplus_link(file, direntplus, attr_version); +		if (ret) +			fuse_force_forget(file, direntplus->entry_out.nodeid); +	} + +	return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ +	int plus, err;  	size_t nbytes;  	struct page *page; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req; +	u64 attr_version = 0;  	if (is_bad_inode(inode))  		return -EIO; -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, 1);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)  		fuse_put_request(fc, req);  		return -ENOMEM;  	} + +	plus = fuse_use_readdirplus(inode, file);  	req->out.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = page; -	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); +	req->page_descs[0].length = PAGE_SIZE; +	if (plus) { +		attr_version = fuse_get_attr_version(fc); +		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, +			       FUSE_READDIRPLUS); +	} else { +		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, +			       FUSE_READDIR); +	}  	fuse_request_send(fc, req);  	nbytes = req->out.args[0].size;  	err = req->out.h.error;  	fuse_put_request(fc, req); -	if (!err) -		err = parse_dirfile(page_address(page), nbytes, file, dstbuf, -				    filldir); +	if (!err) { +		if (plus) { +			err = parse_dirplusfile(page_address(page), nbytes, +						file, dstbuf, filldir, +						attr_version); +		} else { +			err = parse_dirfile(page_address(page), nbytes, file, +					    dstbuf, filldir); +		} +	}  	__free_page(page);  	fuse_invalidate_attr(inode); /* atime changed */ @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry)  {  	struct inode *inode = dentry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	char *link;  	if (IS_ERR(req)) @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  	loff_t oldsize;  	int err; -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  	if (attr->ia_valid & ATTR_SIZE)  		is_truncate = true; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,  	struct inode *inode = entry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,  	if (fc->no_setxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,  	if (fc->no_getxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if 
(IS_ERR(req))  		return PTR_ERR(req); @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)  	struct fuse_getxattr_out outarg;  	ssize_t ret; -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	if (fc->no_listxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)  	if (fc->no_removexattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e..34b80ba95ba 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,  	struct fuse_req *req;  	int err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)  		return NULL;  	ff->fc = fc; -	ff->reserved_req = fuse_request_alloc(); +	ff->reserved_req = fuse_request_alloc(0);  	if (unlikely(!ff->reserved_req)) {  		kfree(ff);  		return NULL; @@ -355,7 +355,7 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)  static int fuse_flush(struct file *file, fl_owner_t id)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_file *ff = file->private_data;  	struct fuse_req *req; @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)  	if (fc->no_flush)  		return 0; -	req = fuse_get_req_nofail(fc, file); +	req = fuse_get_req_nofail_nopages(fc, file);  	memset(&inarg, 0, sizeof(inarg));  	inarg.fh = ff->fh;  	inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,  	fuse_sync_writes(inode); -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req)) {  		err = PTR_ERR(req);  		goto out; @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page)  	 */  	fuse_wait_on_page_writeback(inode, page->index); -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, 1);  	err = PTR_ERR(req);  	if (IS_ERR(req))  		goto out; @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page)  	req->out.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = page; +	req->page_descs[0].length = count;  	num_read = fuse_send_read(req, file, pos, count, NULL);  	err = req->out.h.error;  	fuse_put_request(fc, req); @@ -641,6 +642,7 @@ struct fuse_fill_data {  	struct fuse_req *req;  	struct file *file;  	struct inode *inode; +	unsigned nr_pages;  };  static int fuse_readpages_fill(void *_data, struct page *page) @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page)  	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||  	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||  	     req->pages[req->num_pages - 1]->index + 1 != page->index)) { +		int nr_alloc = min_t(unsigned, data->nr_pages, +				     FUSE_MAX_PAGES_PER_REQ);  		fuse_send_readpages(req, data->file); -		data->req = req = fuse_get_req(fc); +		data->req = req = fuse_get_req(fc, nr_alloc);  		if (IS_ERR(req)) {  			unlock_page(page);  			return PTR_ERR(req);  		}  	} + +	if (WARN_ON(req->num_pages >= 
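+		   /* defensive: the request was sized via nr_alloc above,
+		      so the page vector should never overflow here */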
req->max_pages)) { +		fuse_put_request(fc, req); +		return -EIO; +	} +  	page_cache_get(page);  	req->pages[req->num_pages] = page; +	req->page_descs[req->num_pages].length = PAGE_SIZE;  	req->num_pages++; +	data->nr_pages--;  	return 0;  } @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_fill_data data;  	int err; +	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);  	err = -EIO;  	if (is_bad_inode(inode)) @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,  	data.file = file;  	data.inode = inode; -	data.req = fuse_get_req(fc); +	data.req = fuse_get_req(fc, nr_alloc); +	data.nr_pages = nr_pages;  	err = PTR_ERR(data.req);  	if (IS_ERR(data.req))  		goto out; @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,  	res = fuse_send_write(req, file, pos, count, NULL); -	offset = req->page_offset; +	offset = req->page_descs[0].offset;  	count = res;  	for (i = 0; i < req->num_pages; i++) {  		struct page *page = req->pages[i]; @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  	int err;  	req->in.argpages = 1; -	req->page_offset = offset; +	req->page_descs[0].offset = offset;  	do {  		size_t tmp; @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  		err = 0;  		req->pages[req->num_pages] = page; +		req->page_descs[req->num_pages].length = tmp;  		req->num_pages++;  		iov_iter_advance(ii, tmp); @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  		if (!fc->big_writes)  			break;  	} while (iov_iter_count(ii) && count < fc->max_write && -		 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); +		 req->num_pages < req->max_pages && offset == 0);  	return count > 0 ? 
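	/* report partial progress when any bytes were copied; only a
	   zero-progress failure returns the error */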
count : err;  } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ +	return min_t(unsigned, +		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) - +		     (pos >> PAGE_CACHE_SHIFT) + 1, +		     FUSE_MAX_PAGES_PER_REQ); +} +  static ssize_t fuse_perform_write(struct file *file,  				  struct address_space *mapping,  				  struct iov_iter *ii, loff_t pos) @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file,  	do {  		struct fuse_req *req;  		ssize_t count; +		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); -		req = fuse_get_req(fc); +		req = fuse_get_req(fc, nr_pages);  		if (IS_ERR(req)) {  			err = PTR_ERR(req);  			break; @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)  	}  } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static inline void fuse_page_descs_length_init(struct fuse_req *req, +		unsigned index, unsigned nr_pages) +{ +	int i; + +	for (i = index; i < index + nr_pages; i++) +		req->page_descs[i].length = PAGE_SIZE - +			req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ +	return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, +					size_t max_size) +{ +	return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,  			       size_t *nbytesp, int write)  { -	size_t nbytes = *nbytesp; -	unsigned long user_addr = (unsigned long) buf; -	unsigned offset = user_addr & ~PAGE_MASK; -	int npages; +	size_t nbytes = 0;  /* # bytes already packed in req */  	/* Special case for kernel I/O: can copy directly into the buffer */  	if (segment_eq(get_fs(), KERNEL_DS)) { +		unsigned long user_addr = fuse_get_user_addr(ii); +		size_t frag_size = fuse_get_frag_size(ii, *nbytesp); +  		if (write)  			req->in.args[1].value = (void *) user_addr;  		else  			req->out.args[0].value = (void *) user_addr; +		iov_iter_advance(ii, frag_size); +		*nbytesp = frag_size;  		return 0;  	} -	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); -	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; -	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); -	npages = get_user_pages_fast(user_addr, npages, !write, req->pages); -	if (npages < 0) -		return npages; +	while (nbytes < *nbytesp && req->num_pages < req->max_pages) { +		unsigned npages; +		unsigned long user_addr = fuse_get_user_addr(ii); +		unsigned offset = user_addr & ~PAGE_MASK; +		size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); +		int ret; + +		unsigned n = req->max_pages - req->num_pages; +		frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); + +		npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; +		npages = clamp(npages, 1U, n); + +		ret = get_user_pages_fast(user_addr, npages, !write, +					  &req->pages[req->num_pages]); +		if (ret < 0) +			return ret; -	req->num_pages = npages; -	req->page_offset = offset; +		npages = ret; +		frag_size = min_t(size_t, frag_size, +				  (npages << PAGE_SHIFT) - offset); +		iov_iter_advance(ii, frag_size); + +		req->page_descs[req->num_pages].offset = offset; +		fuse_page_descs_length_init(req, req->num_pages, npages); + +		req->num_pages += npages; +		req->page_descs[req->num_pages - 1].length -= +			(npages << PAGE_SHIFT) - offset - frag_size; + +		nbytes += frag_size; +	}  	if (write)  		req->in.argpages = 1;  	else  		req->out.argpages = 1; -	nbytes 
= (req->num_pages << PAGE_SHIFT) - req->page_offset; -	*nbytesp = min(*nbytesp, nbytes); +	*nbytesp = nbytes;  	return 0;  } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, -		       size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ +	struct iov_iter ii = *ii_p; +	int npages = 0; + +	while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { +		unsigned long user_addr = fuse_get_user_addr(&ii); +		unsigned offset = user_addr & ~PAGE_MASK; +		size_t frag_size = iov_iter_single_seg_count(&ii); + +		npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; +		iov_iter_advance(&ii, frag_size); +	} + +	return min(npages, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +		       unsigned long nr_segs, size_t count, loff_t *ppos, +		       int write)  {  	struct fuse_file *ff = file->private_data;  	struct fuse_conn *fc = ff->fc; @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  	loff_t pos = *ppos;  	ssize_t res = 0;  	struct fuse_req *req; +	struct iov_iter ii; + +	iov_iter_init(&ii, iov, nr_segs, count, 0); -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, fuse_iter_npages(&ii));  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  		size_t nres;  		fl_owner_t owner = current->files;  		size_t nbytes = min(count, nmax); -		int err = fuse_get_user_pages(req, buf, &nbytes, write); +		int err = fuse_get_user_pages(req, &ii, &nbytes, write);  		if (err) {  			res = err;  			break; @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  		count -= nres;  		res += nres;  		pos += nres; -		buf += nres;  		if (nres != nbytes)  			break;  		if (count) {  			fuse_put_request(fc, req); -			req = fuse_get_req(fc); +			req = fuse_get_req(fc, fuse_iter_npages(&ii));  			if (IS_ERR(req))  				break;  		} @@ -1122,31 +1211,40 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  }  EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, -				     size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, +				  unsigned long nr_segs, loff_t *ppos)  {  	ssize_t res; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	if (is_bad_inode(inode))  		return -EIO; -	res = fuse_direct_io(file, buf, count, ppos, 0); +	res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), +			     ppos, 0);  	fuse_invalidate_attr(inode);  	return res;  } -static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, -				   size_t count, loff_t *ppos) +static ssize_t fuse_direct_read(struct file *file, char __user *buf, +				     size_t count, loff_t *ppos)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct iovec iov = { .iov_base = buf, .iov_len = count }; +	return __fuse_direct_read(file, &iov, 1, ppos); +} + +static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, +				   unsigned long nr_segs, loff_t *ppos) +{ +	struct inode *inode = file_inode(file); +	size_t count = iov_length(iov, nr_segs);  	ssize_t res;  	res = generic_write_checks(file, ppos, &count, 0);  	if (!res) { -		res = fuse_direct_io(file, buf, count, ppos, 1); +		res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);  		if (res > 0)  			
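			/* direct writes bypass the page cache, so i_size
			   has to be brought up to date by hand */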
fuse_write_update_size(inode, *ppos);  	} @@ -1159,7 +1257,8 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,  static ssize_t fuse_direct_write(struct file *file, const char __user *buf,  				 size_t count, loff_t *ppos)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; +	struct inode *inode = file_inode(file);  	ssize_t res;  	if (is_bad_inode(inode)) @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,  	/* Don't allow parallel writes to the same file */  	mutex_lock(&inode->i_mutex); -	res = __fuse_direct_write(file, buf, count, ppos); +	res = __fuse_direct_write(file, &iov, 1, ppos);  	mutex_unlock(&inode->i_mutex);  	return res; @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page)  	set_page_writeback(page); -	req = fuse_request_alloc_nofs(); +	req = fuse_request_alloc_nofs(1);  	if (!req)  		goto err; @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page)  	req->in.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = tmp_page; -	req->page_offset = 0; +	req->page_descs[0].offset = 0; +	req->page_descs[0].length = PAGE_SIZE;  	req->end = fuse_writepage_end;  	req->inode = inode; @@ -1385,7 +1485,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)  {  	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { -		struct inode *inode = file->f_dentry->d_inode; +		struct inode *inode = file_inode(file);  		struct fuse_conn *fc = get_fuse_conn(inode);  		struct fuse_inode *fi = get_fuse_inode(inode);  		struct fuse_file *ff = file->private_data; @@ -1443,7 +1543,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,  			 const struct file_lock *fl, int opcode, pid_t pid,  			 int flock)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_file *ff = file->private_data;  	struct fuse_lk_in *arg = &req->misc.lk_in; @@ -1465,13 +1565,13 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,  static int fuse_getlk(struct file *file, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req;  	struct fuse_lk_out outarg;  	int err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1490,7 +1590,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)  static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req;  	int opcode = (fl->fl_flags & FL_SLEEP) ? 
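	/* FL_SLEEP means the caller is willing to block, so ask the
	   server for the waiting variant of the lock operation */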
FUSE_SETLKW : FUSE_SETLK; @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  	if (fl->fl_flags & FL_CLOSE)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1522,7 +1622,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	int err; @@ -1545,7 +1645,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)  static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	int err; @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)  	if (!inode->i_sb->s_bdev || fc->no_bmap)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return 0; @@ -1602,7 +1702,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)  static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)  {  	loff_t retval; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */  	if (whence == SEEK_CUR || whence == SEEK_SET) @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		num_pages++;  	} -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, num_pages);  	if (IS_ERR(req)) {  		err = PTR_ERR(req);  		req = NULL; @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  	}  	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);  	req->num_pages = num_pages; +	fuse_page_descs_length_init(req, 0, req->num_pages);  	/* okay, let's send it to the client */  	req->in.h.opcode = FUSE_IOCTL; @@ -1978,10 +2079,10 @@ EXPORT_SYMBOL_GPL(fuse_do_ioctl);  long fuse_ioctl_common(struct file *file, unsigned int cmd,  		       unsigned long arg, unsigned int flags)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode); -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	if (is_bad_inode(inode)) @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)  		return DEFAULT_POLLMASK;  	poll_wait(file, &ff->poll_wait, wait); +	inarg.events = (__u32)poll_requested_events(wait);  	/*  	 * Ask for notification iff there's someone waiting for it. 
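 	 * The events mask filled in above tells the server which events
 	 * the caller actually polled for, so it does not have to assume
 	 * that every event is of interest.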
@@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)  		fuse_register_polled_file(fc, ff);  	} -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return POLLERR; @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,  	return 0;  } -static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, -			     unsigned long nr_segs, loff_t *ppos, int rw) -{ -	const struct iovec *vector = iov; -	ssize_t ret = 0; - -	while (nr_segs > 0) { -		void __user *base; -		size_t len; -		ssize_t nr; - -		base = vector->iov_base; -		len = vector->iov_len; -		vector++; -		nr_segs--; - -		if (rw == WRITE) -			nr = __fuse_direct_write(filp, base, len, ppos); -		else -			nr = fuse_direct_read(filp, base, len, ppos); - -		if (nr < 0) { -			if (!ret) -				ret = nr; -			break; -		} -		ret += nr; -		if (nr != len) -			break; -	} - -	return ret; -} - -  static ssize_t  fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 			loff_t offset, unsigned long nr_segs) @@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,  	file = iocb->ki_filp;  	pos = offset; -	ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); +	if (rw == WRITE) +		ret = __fuse_direct_write(file, iov, nr_segs, &pos); +	else +		ret = __fuse_direct_read(file, iov, nr_segs, &pos);  	return ret;  } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, -			    loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, +				loff_t length)  {  	struct fuse_file *ff = file->private_data;  	struct fuse_conn *fc = ff->fc; @@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,  	if (fc->no_fallocate)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,  	return err;  } -EXPORT_SYMBOL_GPL(fuse_file_fallocate);  static const struct file_operations fuse_file_operations = {  	.llseek		= fuse_file_llseek, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53fc72..6aeba864f07 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -44,6 +44,9 @@      doing the mount will be allowed to access the filesystem */  #define FUSE_ALLOW_OTHER         (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 +  /** List of active connections */  extern struct list_head fuse_conn_list; @@ -103,6 +106,15 @@ struct fuse_inode {  	/** List of writepage requests (pending or sent) */  	struct list_head writepages; + +	/** Miscellaneous bits describing inode state */ +	unsigned long state; +}; + +/** FUSE inode state bits */ +enum { +	/** Advise readdirplus  */ +	FUSE_I_ADVISE_RDPLUS,  };  struct fuse_conn; @@ -200,6 +212,12 @@ struct fuse_out {  	struct fuse_arg args[3];  }; +/** FUSE page descriptor */ +struct fuse_page_desc { +	unsigned int length; +	unsigned int offset; +}; +  /** The request state */  enum fuse_req_state {  	FUSE_REQ_INIT = 0, @@ -291,14 +309,23 @@ struct fuse_req {  	} misc;  	/** page vector */ -	struct page *pages[FUSE_MAX_PAGES_PER_REQ]; +	struct page **pages; + +	/** page-descriptor vector */ +	struct fuse_page_desc *page_descs; + +	/** size of the 'pages' array */ +	unsigned max_pages; + +	/** inline page vector */ +	struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + +	/** inline page-descriptor vector */ +	struct fuse_page_desc 
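+	/* (requests spanning no more than FUSE_REQ_INLINE_PAGES pages
+	   are meant to use these inline vectors, avoiding a separate
+	   allocation) */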
inline_page_descs[FUSE_REQ_INLINE_PAGES];  /** number of pages in vector */  unsigned num_pages; -	/** offset of data on first page */ -	unsigned page_offset; -  	/** File used in the request (or NULL) */  	struct fuse_file *ff; @@ -487,6 +514,12 @@ struct fuse_conn {  	/** Use enhanced/automatic page cache invalidation. */  	unsigned auto_inval_data:1; +	/** Does the filesystem support readdirplus? */ +	unsigned do_readdirplus:1; + +	/** Does the filesystem want adaptive readdirplus? */ +	unsigned readdirplus_auto:1; +  	/** The number of requests waiting for completion */  	atomic_t num_waiting; @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,  struct fuse_forget_link *fuse_alloc_forget(void); +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); +  /**   * Initialize READ or READDIR request   */ @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void);  /**   * Allocate a request   */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages);  /**   * Free a request @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void);  void fuse_request_free(struct fuse_req *req);  /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly   */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who don't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ +	return fuse_get_req(fc, 0); +}  /**   * Gets a request for a file operation, always succeeds   */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, +					     struct file *file);  /**   * Decrement reference count of a request.  If count goes to zero free @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);  int fuse_valid_type(int m);  /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation?   
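+ * (Only the mounting user may access the filesystem unless the
+ * "allow_other" mount option was given.)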
*/ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc);  u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,  int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,  		 bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, -		       size_t count, loff_t *ppos, int write); +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +		       unsigned long nr_segs, size_t count, loff_t *ppos, +		       int write);  long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		   unsigned int flags);  long fuse_ioctl_common(struct file *file, unsigned int cmd, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b72bea..df00993ed10 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)  	fi->attr_version = 0;  	fi->writectr = 0;  	fi->orig_ino = 0; +	fi->state = 0;  	INIT_LIST_HEAD(&fi->write_files);  	INIT_LIST_HEAD(&fi->queued_writes);  	INIT_LIST_HEAD(&fi->writepages); @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)  	struct fuse_statfs_out outarg;  	int err; -	if (!fuse_allow_task(fc, current)) { +	if (!fuse_allow_current_process(fc)) {  		buf->f_type = FUSE_SUPER_MAGIC;  		return 0;  	} -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -678,7 +679,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,  	if (*max_len < len) {  		*max_len = len; -		return  255; +		return  FILEID_INVALID;  	}  	nodeid = get_fuse_inode(inode)->nodeid; @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  				fc->dont_mask = 1;  			if (arg->flags & FUSE_AUTO_INVAL_DATA)  				fc->auto_inval_data = 1; +			if (arg->flags & FUSE_DO_READDIRPLUS) +				fc->do_readdirplus = 1; +			if (arg->flags & FUSE_READDIRPLUS_AUTO) +				fc->readdirplus_auto = 1;  		} else {  			ra_pages = fc->max_read / PAGE_CACHE_SIZE;  			fc->no_lock = 1; @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)  	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |  		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |  		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | -		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; +		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | +		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO;  	req->in.h.opcode = FUSE_INIT;  	req->in.numargs = 1;  	req->in.args[0].size = sizeof(*arg); @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	/* only now - we want root dentry with NULL ->d_op */  	sb->s_d_op = &fuse_dentry_operations; -	init_req = fuse_request_alloc(); +	init_req = fuse_request_alloc(0);  	if (!init_req)  		goto err_put_root;  	if (is_bdev) { -		fc->destroy_req = fuse_request_alloc(); +		fc->destroy_req = fuse_request_alloc(0);  		if (!fc->destroy_req)  			goto err_free_init_req;  	} diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f850020ad90..f69ac0af549 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,  		return -EINVAL;  	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))  		return value ? 
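		/* a default ACL only makes sense on a directory: setting
		   one is refused, removing one (NULL value) is a no-op */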
-EACCES : 0; -	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) +	if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))  		return -EPERM;  	if (S_ISLNK(inode->i_mode))  		return -EOPNOTSUPP; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 30de4f2a2ea..24f414f0ce6 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,  			continue;  		if (gfs2_is_jdata(ip))  			set_buffer_uptodate(bh); -		gfs2_trans_add_bh(ip->i_gl, bh, 0); +		gfs2_trans_add_data(ip->i_gl, bh);  	}  } @@ -230,16 +230,14 @@ out_ignore:  }  /** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * gfs2_writepages - Write a bunch of dirty pages back to disk   * @mapping: The mapping to write   * @wbc: Write-back control   * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. + * Used for both ordered and writeback modes.   */ -static int gfs2_writeback_writepages(struct address_space *mapping, -				     struct writeback_control *wbc) +static int gfs2_writepages(struct address_space *mapping, +			   struct writeback_control *wbc)  {  	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);  } @@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,  		goto failed;  	} -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	if (gfs2_is_stuffed(ip))  		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -1102,7 +1100,7 @@ cannot_release:  static const struct address_space_operations gfs2_writeback_aops = {  	.writepage = gfs2_writeback_writepage, -	.writepages = gfs2_writeback_writepages, +	.writepages = gfs2_writepages,  	.readpage = gfs2_readpage,  	.readpages = gfs2_readpages,  	.write_begin = gfs2_write_begin, @@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = {  static const struct address_space_operations gfs2_ordered_aops = {  	.writepage = gfs2_ordered_writepage, +	.writepages = gfs2_writepages,  	.readpage = gfs2_readpage,  	.readpages = gfs2_readpages,  	.write_begin = gfs2_write_begin, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a68e91bcef3..5e83657f046 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -22,6 +22,7 @@  #include "meta_io.h"  #include "quota.h"  #include "rgrp.h" +#include "log.h"  #include "super.h"  #include "trans.h"  #include "dir.h" @@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,  	if (!gfs2_is_jdata(ip))  		mark_buffer_dirty(bh);  	if (!gfs2_is_writeback(ip)) -		gfs2_trans_add_bh(ip->i_gl, bh, 0); +		gfs2_trans_add_data(ip->i_gl, bh);  	if (release) {  		unlock_page(page); @@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)  	/*  Set up the pointer to the new block  */ -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	di = (struct gfs2_dinode *)dibh->b_data;  	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,  	BUG_ON(i < 1);  	BUG_ON(mp->mp_bh[i] != NULL);  	mp->mp_bh[i] = gfs2_meta_new(gl, bn); -	gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); +	gfs2_trans_add_meta(gl, mp->mp_bh[i]);  	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);  	
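	/* newly allocated indirect blocks are journalled as metadata;
	   gfs2_trans_add_meta() replaces gfs2_trans_add_bh(gl, bh, 1) */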
gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));  	ptr += offset; @@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  	BUG_ON(sheight < 1);  	BUG_ON(dibh == NULL); -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	if (height == sheight) {  		struct buffer_head *bh; @@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  		/* Branching from existing tree */  		case ALLOC_GROW_DEPTH:  			if (i > 1 && i < height) -				gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); +				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);  			for (; i < height && n > 0; i++, n--)  				gfs2_indirect_init(mp, ip->i_gl, i,  						   mp->mp_list[i-1], bn++); @@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  		case ALLOC_DATA:  			BUG_ON(n > dblks);  			BUG_ON(mp->mp_bh[end_of_metadata] == NULL); -			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); +			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);  			dblks = n;  			ptr = metapointer(end_of_metadata, mp);  			dblock = bn; @@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,  	down_write(&ip->i_rw_mutex); -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh); +	gfs2_trans_add_meta(ip->i_gl, bh);  	bstart = 0;  	blen = 0; @@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)  	}  	if (!gfs2_is_writeback(ip)) -		gfs2_trans_add_bh(ip->i_gl, bh, 0); +		gfs2_trans_add_data(ip->i_gl, bh);  	zero_user(page, offset, length);  	mark_buffer_dirty(bh); @@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)  	if (error)  		goto out; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	if (gfs2_is_stuffed(ip)) {  		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); @@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)  	if (error)  		return error; -	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		return error; @@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip)  		ip->i_height = 0;  		ip->i_goal = ip->i_no_addr;  		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); +		gfs2_ordered_del_inode(ip);  	}  	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;  	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_dinode_out(ip, dibh->b_data);  	brelse(dibh); @@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size)  	i_size_write(inode, size);  	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_dinode_out(ip, dibh->b_data);  	brelse(dibh); @@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)  	inode_dio_wait(inode); +	ret = gfs2_rs_alloc(GFS2_I(inode)); +	if (ret) +		return ret; +  	oldsize = inode->i_size;  	if (newsize >= oldsize)  		return do_grow(inode, newsize); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9a35670fdc3..c3e82bd2317 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,  	struct buffer_head *bh;  	bh = 
gfs2_meta_new(ip->i_gl, block); -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);  	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));  	*bhp = bh; @@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,  	if (error)  		return error; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);  	if (ip->i_inode.i_size < offset + size)  		i_size_write(&ip->i_inode, offset + size); @@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,  		if (error)  			goto fail; -		gfs2_trans_add_bh(ip->i_gl, bh, 1); +		gfs2_trans_add_meta(ip->i_gl, bh);  		memcpy(bh->b_data + o, buf, amount);  		brelse(bh); @@ -231,7 +231,7 @@ out:  		i_size_write(&ip->i_inode, offset + copied);  	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_dinode_out(ip, dibh->b_data);  	brelse(dibh); @@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,  		return;  	} -	gfs2_trans_add_bh(dip->i_gl, bh, 1); +	gfs2_trans_add_meta(dip->i_gl, bh);  	/* If there is no prev entry, this is the first entry in the block.  	   The de_rec_len is already as big as it needs to be.  Just zero @@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,  		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));  	totlen = be16_to_cpu(dent->de_rec_len);  	BUG_ON(offset + name->len > totlen); -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	ndent = (struct gfs2_dirent *)((char *)dent + offset);  	dent->de_rec_len = cpu_to_be16(offset);  	gfs2_qstr2dirent(name, totlen - offset, ndent); @@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,  		return NULL;  	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);  	leaf = (struct gfs2_leaf *)bh->b_data;  	leaf->lf_depth = cpu_to_be16(depth); @@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode)  	/*  We're done with the new leaf block, now setup the new  	    hash table.  
*/ -	gfs2_trans_add_bh(dip->i_gl, dibh, 1); +	gfs2_trans_add_meta(dip->i_gl, dibh);  	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));  	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)  		return 1; /* can't split */  	} -	gfs2_trans_add_bh(dip->i_gl, obh, 1); +	gfs2_trans_add_meta(dip->i_gl, obh);  	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);  	if (!nleaf) { @@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)  	error = gfs2_meta_inode_buffer(dip, &dibh);  	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { -		gfs2_trans_add_bh(dip->i_gl, dibh, 1); +		gfs2_trans_add_meta(dip->i_gl, dibh);  		gfs2_add_inode_blocks(&dip->i_inode, 1);  		gfs2_dinode_out(dip, dibh->b_data);  		brelse(dibh); @@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)  			return error;  	} while(1); -	gfs2_trans_add_bh(ip->i_gl, obh, 1); +	gfs2_trans_add_meta(ip->i_gl, obh);  	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));  	if (!leaf) { @@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)  	error = gfs2_meta_inode_buffer(ip, &bh);  	if (error)  		return error; -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	gfs2_add_inode_blocks(&ip->i_inode, 1);  	gfs2_dinode_out(ip, bh->b_data);  	brelse(bh); @@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,  	if (IS_ERR(dent))  		return PTR_ERR(dent); -	gfs2_trans_add_bh(dip->i_gl, bh, 1); +	gfs2_trans_add_meta(dip->i_gl, bh);  	gfs2_inum_out(nip, dent);  	dent->de_type = cpu_to_be16(new_type); @@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,  		error = gfs2_meta_inode_buffer(dip, &bh);  		if (error)  			return error; -		gfs2_trans_add_bh(dip->i_gl, bh, 1); +		gfs2_trans_add_meta(dip->i_gl, bh);  	}  	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; @@ -1849,7 +1849,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,  	if (!ht)  		return -ENOMEM; -	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		goto out; @@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,  	if (error)  		goto out_end_trans; -	gfs2_trans_add_bh(dip->i_gl, dibh, 1); +	gfs2_trans_add_meta(dip->i_gl, dibh);  	/* On the last dealloc, make this a regular file in case we crash.  	   (We don't want to free these blocks a second time.)  
*/  	if (last_dealloc) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 4767774a5f3..9973df4ff56 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -37,10 +37,10 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,  	if (parent && (*len < GFS2_LARGE_FH_SIZE)) {  		*len = GFS2_LARGE_FH_SIZE; -		return 255; +		return FILEID_INVALID;  	} else if (*len < GFS2_SMALL_FH_SIZE) {  		*len = GFS2_SMALL_FH_SIZE; -		return 255; +		return FILEID_INVALID;  	}  	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 991ab2d484d..019f45e4509 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -157,7 +157,7 @@ static const u32 gfs2_to_fsflags[32] = {  static int gfs2_get_flags(struct file *filp, u32 __user *ptr)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct gfs2_inode *ip = GFS2_I(inode);  	struct gfs2_holder gh;  	int error; @@ -217,7 +217,7 @@ void gfs2_set_inode_flags(struct inode *inode)   */  static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct gfs2_inode *ip = GFS2_I(inode);  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	struct buffer_head *bh; @@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)  	error = gfs2_meta_inode_buffer(ip, &bh);  	if (error)  		goto out_trans_end; -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	ip->i_diskflags = new_flags;  	gfs2_dinode_out(ip, bh->b_data);  	brelse(bh); @@ -293,7 +293,7 @@ out_drop_write:  static int gfs2_set_flags(struct file *filp, u32 __user *ptr)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	u32 fsflags, gfsflags;  	if (get_user(fsflags, ptr)) @@ -336,7 +336,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)  { -	struct inode *inode = filep->f_dentry->d_inode; +	struct inode *inode = file_inode(filep);  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	struct gfs2_inode *ip = GFS2_I(inode);  	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; @@ -386,7 +386,7 @@ static int gfs2_allocate_page_backing(struct page *page)  static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct gfs2_inode *ip = GFS2_I(inode);  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	unsigned long last_index; @@ -483,7 +483,7 @@ out:  	gfs2_holder_uninit(&gh);  	if (ret == 0) {  		set_page_dirty(page); -		wait_on_page_writeback(page); +		wait_for_stable_page(page);  	}  	sb_end_pagefault(inode->i_sb);  	return block_page_mkwrite_return(ret); @@ -673,8 +673,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  {  	struct file *file = iocb->ki_filp;  	size_t writesize = iov_length(iov, nr_segs); -	struct dentry *dentry = file->f_dentry; -	struct gfs2_inode *ip = GFS2_I(dentry->d_inode); +	struct gfs2_inode *ip = GFS2_I(file_inode(file));  	int ret;  	ret = gfs2_rs_alloc(ip); @@ -709,7 +708,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,  	if (unlikely(error))  		return error; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  
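	/* the dinode buffer is added to the transaction before any of
	   its fields are modified below */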
	if (gfs2_is_stuffed(ip)) {  		error = gfs2_unstuff_dinode(ip, NULL); @@ -772,7 +771,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,  static long gfs2_fallocate(struct file *file, int mode, loff_t offset,  			   loff_t len)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	struct gfs2_inode *ip = GFS2_I(inode);  	unsigned int data_blocks = 0, ind_blocks = 0, rblocks; @@ -938,7 +937,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)  {  	struct gfs2_file *fp = file->private_data;  	struct gfs2_holder *fl_gh = &fp->f_fl_gh; -	struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); +	struct gfs2_inode *ip = GFS2_I(file_inode(file));  	struct gfs2_glock *gl;  	unsigned int state;  	int flags; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 992c5c0cb50..cf351554673 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -30,6 +30,7 @@  #include <linux/rculist_bl.h>  #include <linux/bit_spinlock.h>  #include <linux/percpu.h> +#include <linux/list_sort.h>  #include "gfs2.h"  #include "incore.h" @@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)  		gfs2_glock_put(gl);  } +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct gfs2_glock *gla, *glb; -static int gfs2_shrink_glock_memory(struct shrinker *shrink, -				    struct shrink_control *sc) +	gla = list_entry(a, struct gfs2_glock, gl_lru); +	glb = list_entry(b, struct gfs2_glock, gl_lru); + +	if (gla->gl_name.ln_number > glb->gl_name.ln_number) +		return 1; +	if (gla->gl_name.ln_number < glb->gl_name.ln_number) +		return -1; + +	return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock)  {  	struct gfs2_glock *gl; -	int may_demote; -	int nr_skipped = 0; -	int nr = sc->nr_to_scan; -	gfp_t gfp_mask = sc->gfp_mask; -	LIST_HEAD(skipped); -	if (nr == 0) -		goto out; +	list_sort(NULL, list, glock_cmp); -	if (!(gfp_mask & __GFP_FS)) -		return -1; +	while(!list_empty(list)) { +		gl = list_entry(list->next, struct gfs2_glock, gl_lru); +		list_del_init(&gl->gl_lru); +		clear_bit(GLF_LRU, &gl->gl_flags); +		gfs2_glock_hold(gl); +		spin_unlock(&lru_lock); +		spin_lock(&gl->gl_spin); +		if (demote_ok(gl)) +			handle_callback(gl, LM_ST_UNLOCKED, 0); +		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); +		smp_mb__after_clear_bit(); +		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) +			gfs2_glock_put_nolock(gl); +		spin_unlock(&gl->gl_spin); +		spin_lock(&lru_lock); +	} +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. 
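+ * Glocks which cannot be demoted immediately (GLF_LOCK already set)
+ * are skipped and spliced back onto the LRU list.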
+ */ + +static void gfs2_scan_glock_lru(int nr) +{ +	struct gfs2_glock *gl; +	LIST_HEAD(skipped); +	LIST_HEAD(dispose);  	spin_lock(&lru_lock);  	while(nr && !list_empty(&lru_list)) {  		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); -		list_del_init(&gl->gl_lru); -		clear_bit(GLF_LRU, &gl->gl_flags); -		atomic_dec(&lru_count);  		/* Test for being demotable */  		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { -			gfs2_glock_hold(gl); -			spin_unlock(&lru_lock); -			spin_lock(&gl->gl_spin); -			may_demote = demote_ok(gl); -			if (may_demote) { -				handle_callback(gl, LM_ST_UNLOCKED, 0); -				nr--; -			} -			clear_bit(GLF_LOCK, &gl->gl_flags); -			smp_mb__after_clear_bit(); -			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) -				gfs2_glock_put_nolock(gl); -			spin_unlock(&gl->gl_spin); -			spin_lock(&lru_lock); +			list_move(&gl->gl_lru, &dispose); +			atomic_dec(&lru_count); +			nr--;  			continue;  		} -		nr_skipped++; -		list_add(&gl->gl_lru, &skipped); -		set_bit(GLF_LRU, &gl->gl_flags); + +		list_move(&gl->gl_lru, &skipped);  	}  	list_splice(&skipped, &lru_list); -	atomic_add(nr_skipped, &lru_count); +	if (!list_empty(&dispose)) +		gfs2_dispose_glock_lru(&dispose);  	spin_unlock(&lru_lock); -out: +} + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, +				    struct shrink_control *sc) +{ +	if (sc->nr_to_scan) { +		if (!(sc->gfp_mask & __GFP_FS)) +			return -1; +		gfs2_scan_glock_lru(sc->nr_to_scan); +	} +  	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;  } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78d4184ffc7..444b6503ebc 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -322,8 +322,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)  		break;  	}; -	ip->i_inode.i_uid = be32_to_cpu(str->di_uid); -	ip->i_inode.i_gid = be32_to_cpu(str->di_gid); +	i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); +	i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));  	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));  	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));  	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c373a24fedd..156e42ec84e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -52,7 +52,6 @@ struct gfs2_log_header_host {   */  struct gfs2_log_operations { -	void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);  	void (*lo_before_commit) (struct gfs2_sbd *sdp);  	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);  	void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -341,6 +340,7 @@ enum {  	GIF_QD_LOCKED		= 1,  	GIF_ALLOC_FAILED	= 2,  	GIF_SW_PAGED		= 3, +	GIF_ORDERED		= 4,  };  struct gfs2_inode { @@ -357,6 +357,7 @@ struct gfs2_inode {  	struct gfs2_rgrpd *i_rgd;  	u64 i_goal;	/* goal block for allocations */  	struct rw_semaphore i_rw_mutex; +	struct list_head i_ordered;  	struct list_head i_trunc_list;  	__be64 *i_hash_cache;  	u32 i_entries; @@ -391,7 +392,6 @@ struct gfs2_revoke_replay {  };  enum { -	QDF_USER		= 0,  	QDF_CHANGE		= 1,  	QDF_LOCKED		= 2,  	QDF_REFRESH		= 3, @@ -403,7 +403,7 @@ struct gfs2_quota_data {  	atomic_t qd_count; -	u32 qd_id; +	struct kqid qd_id;  	unsigned long qd_flags;		/* QDF_... 
*/  	s64 qd_change; @@ -641,6 +641,7 @@ struct gfs2_sbd {  	wait_queue_head_t sd_glock_wait;  	atomic_t sd_glock_disposal;  	struct completion sd_locking_init; +	struct completion sd_wdack;  	struct delayed_work sd_control_work;  	/* Inode Stuff */ @@ -723,6 +724,7 @@ struct gfs2_sbd {  	struct list_head sd_log_le_revoke;  	struct list_head sd_log_le_databuf;  	struct list_head sd_log_le_ordered; +	spinlock_t sd_ordered_lock;  	atomic_t sd_log_thresh1;  	atomic_t sd_log_thresh2; @@ -758,10 +760,7 @@ struct gfs2_sbd {  	unsigned int sd_replayed_blocks;  	/* For quiescing the filesystem */ -  	struct gfs2_holder sd_freeze_gh; -	struct mutex sd_freeze_lock; -	unsigned int sd_freeze_count;  	char sd_fsname[GFS2_FSNAME_LEN];  	char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2b6f5698ef1..cc00bd1d1f8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -368,10 +368,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,  			       struct inode *inode)  {  	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && -	    (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { +	    (dip->i_inode.i_mode & S_ISUID) && +	    !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) {  		if (S_ISDIR(inode->i_mode))  			inode->i_mode |= S_ISUID; -		else if (dip->i_inode.i_uid != current_fsuid()) +		else if (!uid_eq(dip->i_inode.i_uid, current_fsuid()))  			inode->i_mode &= ~07111;  		inode->i_uid = dip->i_inode.i_uid;  	} else @@ -447,7 +448,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,  	struct timespec tv = CURRENT_TIME;  	dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);  	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));  	di = (struct gfs2_dinode *)dibh->b_data; @@ -455,8 +456,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,  	di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);  	di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);  	di->di_mode = cpu_to_be32(ip->i_inode.i_mode); -	di->di_uid = cpu_to_be32(ip->i_inode.i_uid); -	di->di_gid = cpu_to_be32(ip->i_inode.i_gid); +	di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); +	di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));  	di->di_nlink = 0;  	di->di_size = cpu_to_be64(ip->i_inode.i_size);  	di->di_blocks = cpu_to_be64(1); @@ -548,7 +549,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,  	if (error)  		return error; -	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		goto fail; @@ -584,7 +585,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,  	if (error)  		goto fail_end_trans;  	set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 
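	/* a new directory carries two links: the entry in its parent
	   plus its own "." entry */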
2 : 1); -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_dinode_out(ip, dibh->b_data);  	brelse(dibh);  	return 0; @@ -931,7 +932,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,  	if (error)  		goto out_brelse; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	inc_nlink(&ip->i_inode);  	ip->i_inode.i_ctime = CURRENT_TIME;  	ihold(inode); @@ -978,8 +979,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,  		return -EPERM;  	if ((dip->i_inode.i_mode & S_ISVTX) && -	    dip->i_inode.i_uid != current_fsuid() && -	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) +	    !uid_eq(dip->i_inode.i_uid, current_fsuid()) && +	    !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER))  		return -EPERM;  	if (IS_APPEND(&dip->i_inode)) @@ -1412,7 +1413,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,  		if (error)  			goto out_end_trans;  		ip->i_inode.i_ctime = CURRENT_TIME; -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -1580,7 +1581,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)  {  	struct gfs2_inode *ip = GFS2_I(inode);  	struct gfs2_sbd *sdp = GFS2_SB(inode); -	u32 ouid, ogid, nuid, ngid; +	kuid_t ouid, nuid; +	kgid_t ogid, ngid;  	int error;  	ouid = inode->i_uid; @@ -1588,16 +1590,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)  	nuid = attr->ia_uid;  	ngid = attr->ia_gid; -	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) -		ouid = nuid = NO_QUOTA_CHANGE; -	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) -		ogid = ngid = NO_QUOTA_CHANGE; +	if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) +		ouid = nuid = NO_UID_QUOTA_CHANGE; +	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) +		ogid = ngid = NO_GID_QUOTA_CHANGE;  	error = gfs2_quota_lock(ip, nuid, ngid);  	if (error)  		return error; -	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { +	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || +	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {  		error = gfs2_quota_check(ip, nuid, ngid);  		if (error)  			goto out_gunlock_q; @@ -1611,7 +1614,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)  	if (error)  		goto out_end_trans; -	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { +	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || +	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {  		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);  		gfs2_quota_change(ip, -blocks, ouid, ogid);  		gfs2_quota_change(ip, blocks, nuid, ngid); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8dad6b09371..9802de0f85e 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -241,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,  static void gfs2_reverse_hex(char *c, u64 value)  { +	*c = '0';  	while (value) {  		*c-- = hex_asc[value & 0x0f];  		value >>= 4; @@ -280,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)  {  	struct gfs2_sbd *sdp = gl->gl_sbd;  	struct lm_lockstruct *ls = &sdp->sd_lockstruct; +	int lvb_needs_unlock = 0;  	int error;  	if (gl->gl_lksb.sb_lkid == 0) { @@ -293,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)  	gfs2_update_request_times(gl);  	/* don't want to skip dlm_unlock writing the lvb when lock is ex */ + +	if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) +		lvb_needs_unlock = 
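+		/* dlm writes the lock value block back only when an EX
+		   lock is released, so the unlock must not be skipped */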
1; +  	if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && -	    gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { +	    !lvb_needs_unlock) {  		gfs2_glock_free(gl);  		return;  	} diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f4beeb9c81c..9a2ca8be764 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp)  	}  } -static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)  { -	struct gfs2_bufdata *bda, *bdb; +	struct gfs2_inode *ipa, *ipb; -	bda = list_entry(a, struct gfs2_bufdata, bd_list); -	bdb = list_entry(b, struct gfs2_bufdata, bd_list); +	ipa = list_entry(a, struct gfs2_inode, i_ordered); +	ipb = list_entry(b, struct gfs2_inode, i_ordered); -	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) +	if (ipa->i_no_addr < ipb->i_no_addr)  		return -1; -	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) +	if (ipa->i_no_addr > ipb->i_no_addr)  		return 1;  	return 0;  }  static void gfs2_ordered_write(struct gfs2_sbd *sdp)  { -	struct gfs2_bufdata *bd; -	struct buffer_head *bh; +	struct gfs2_inode *ip;  	LIST_HEAD(written); -	gfs2_log_lock(sdp); -	list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); +	spin_lock(&sdp->sd_ordered_lock); +	list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);  	while (!list_empty(&sdp->sd_log_le_ordered)) { -		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); -		list_move(&bd->bd_list, &written); -		bh = bd->bd_bh; -		if (!buffer_dirty(bh)) +		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); +		list_move(&ip->i_ordered, &written); +		if (ip->i_inode.i_mapping->nrpages == 0)  			continue; -		get_bh(bh); -		gfs2_log_unlock(sdp); -		lock_buffer(bh); -		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { -			bh->b_end_io = end_buffer_write_sync; -			submit_bh(WRITE_SYNC, bh); -		} else { -			unlock_buffer(bh); -			brelse(bh); -		} -		gfs2_log_lock(sdp); +		spin_unlock(&sdp->sd_ordered_lock); +		filemap_fdatawrite(ip->i_inode.i_mapping); +		spin_lock(&sdp->sd_ordered_lock);  	}  	list_splice(&written, &sdp->sd_log_le_ordered); -	gfs2_log_unlock(sdp); +	spin_unlock(&sdp->sd_ordered_lock);  }  static void gfs2_ordered_wait(struct gfs2_sbd *sdp)  { -	struct gfs2_bufdata *bd; -	struct buffer_head *bh; +	struct gfs2_inode *ip; -	gfs2_log_lock(sdp); +	spin_lock(&sdp->sd_ordered_lock);  	while (!list_empty(&sdp->sd_log_le_ordered)) { -		bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); -		bh = bd->bd_bh; -		if (buffer_locked(bh)) { -			get_bh(bh); -			gfs2_log_unlock(sdp); -			wait_on_buffer(bh); -			brelse(bh); -			gfs2_log_lock(sdp); +		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); +		list_del(&ip->i_ordered); +		WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); +		if (ip->i_inode.i_mapping->nrpages == 0)  			continue; -		} -		list_del_init(&bd->bd_list); +		spin_unlock(&sdp->sd_ordered_lock); +		filemap_fdatawait(ip->i_inode.i_mapping); +		spin_lock(&sdp->sd_ordered_lock);  	} -	gfs2_log_unlock(sdp); +	spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + +	spin_lock(&sdp->sd_ordered_lock); +	if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) +		list_del(&ip->i_ordered); +	spin_unlock(&sdp->sd_ordered_lock);  }  /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3fd5215ea25..3566f35915e 100644 --- 
a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,  	sdp->sd_log_head = sdp->sd_log_tail = value;  } +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + +	if (!test_bit(GIF_ORDERED, &ip->i_flags)) { +		spin_lock(&sdp->sd_ordered_lock); +		if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) +			list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); +		spin_unlock(&sdp->sd_ordered_lock); +	} +} +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);  extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,  			    unsigned int ssize); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9ceccb1595a..a5055977a21 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -37,7 +37,7 @@   *   * The log lock must be held when calling this function   */ -static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)  {  	struct gfs2_bufdata *bd; @@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,  	return page;  } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ -	struct gfs2_meta_header *mh; -	struct gfs2_trans *tr; - -	tr = current->journal_info; -	tr->tr_touched = 1; -	if (!list_empty(&bd->bd_list)) -		return; -	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); -	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); -	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; -	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { -		printk(KERN_ERR -		       "Attempting to add uninitialised block to journal (inplace block=%lld)\n", -		       (unsigned long long)bd->bd_bh->b_blocknr); -		BUG(); -	} -	gfs2_pin(sdp, bd->bd_bh); -	mh->__pad0 = cpu_to_be64(0); -	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); -	sdp->sd_log_num_buf++; -	list_add(&bd->bd_list, &sdp->sd_log_le_buf); -	tr->tr_num_buf_new++; -} -  static void gfs2_check_magic(struct buffer_head *bh)  {  	void *kaddr; @@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)  	        jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);  } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ -	struct gfs2_glock *gl = bd->bd_gl; -	struct gfs2_trans *tr; - -	tr = current->journal_info; -	tr->tr_touched = 1; -	tr->tr_num_revoke++; -	sdp->sd_log_num_revoke++; -	atomic_inc(&gl->gl_revokes); -	set_bit(GLF_LFLUSH, &gl->gl_flags); -	list_add(&bd->bd_list, &sdp->sd_log_le_revoke); -} -  static void revoke_lo_before_commit(struct gfs2_sbd *sdp)  {  	struct gfs2_meta_header *mh; @@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)  }  /** - * databuf_lo_add - Add a databuf to the transaction. - * - * This is used in two distinct cases: - * i) In ordered write mode - *    We put the data buffer on a list so that we can ensure that its - *    synced to disk at the right time - * ii) In journaled data mode - *    We need to journal the data block in the same way as metadata in - *    the functions above. The difference is that here we have a tag - *    which is two __be64's being the block number (as per meta data) - *    and a flag which says whether the data block needs escaping or - *    not. This means we need a new log entry for each 251 or so data - *    blocks, which isn't an enormous overhead but twice as much as - *    for normal metadata blocks. 
- */ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ -	struct gfs2_trans *tr = current->journal_info; -	struct address_space *mapping = bd->bd_bh->b_page->mapping; -	struct gfs2_inode *ip = GFS2_I(mapping->host); - -	if (tr) -		tr->tr_touched = 1; -	if (!list_empty(&bd->bd_list)) -		return; -	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); -	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); -	if (gfs2_is_jdata(ip)) { -		gfs2_pin(sdp, bd->bd_bh); -		tr->tr_num_databuf_new++; -		sdp->sd_log_num_databuf++; -		list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); -	} else { -		list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); -	} -} - -/**   * databuf_lo_before_commit - Scan the data buffers, writing as we go   *   */ @@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)  const struct gfs2_log_operations gfs2_buf_lops = { -	.lo_add = buf_lo_add,  	.lo_before_commit = buf_lo_before_commit,  	.lo_after_commit = buf_lo_after_commit,  	.lo_before_scan = buf_lo_before_scan, @@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = {  };  const struct gfs2_log_operations gfs2_revoke_lops = { -	.lo_add = revoke_lo_add,  	.lo_before_commit = revoke_lo_before_commit,  	.lo_after_commit = revoke_lo_after_commit,  	.lo_before_scan = revoke_lo_before_scan, @@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {  };  const struct gfs2_log_operations gfs2_databuf_lops = { -	.lo_add = databuf_lo_add,  	.lo_before_commit = databuf_lo_before_commit,  	.lo_after_commit = databuf_lo_after_commit,  	.lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 954a330585f..ba77b7da832 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops;  extern const struct gfs2_log_operations *gfs2_log_ops[];  extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);  extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);  static inline unsigned int buf_limit(struct gfs2_sbd *sdp)  { @@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)  	return limit;  } -static inline void lops_init_le(struct gfs2_bufdata *bd, -				const struct gfs2_log_operations *lops) -{ -	INIT_LIST_HEAD(&bd->bd_list); -	bd->bd_ops = lops; -} - -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ -	if (bd->bd_ops->lo_add) -		bd->bd_ops->lo_add(sdp, bd); -} -  static inline void lops_before_commit(struct gfs2_sbd *sdp)  {  	int x; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 22255d96b27..b059bbb5059 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)  	return 0;  } -/** - * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer - * @gl: the glock the buffer belongs to - * @bh: The buffer to be attached to - * @meta: Flag to indicate whether its metadata or not - */ - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, -			 int meta) -{ -	struct gfs2_bufdata *bd; - -	if (meta) -		lock_page(bh->b_page); - -	if (bh->b_private) { -		if (meta) -			unlock_page(bh->b_page); -		return; -	} - -	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); -	bd->bd_bh = bh; -	bd->bd_gl = gl; - -	if (meta) -		lops_init_le(bd, &gfs2_buf_lops); -	else -		lops_init_le(bd, 
&gfs2_databuf_lops); -	bh->b_private = bd; - -	if (meta) -		unlock_page(bh->b_page); -} -  void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)  {  	struct address_space *mapping = bh->b_page->mapping; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c30973b07a7..0d4c843b6f8 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,  int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);  struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, -			 int meta); -  void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,  			      int meta); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0e3554edb8f..1b612be4b87 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)  	init_waitqueue_head(&sdp->sd_glock_wait);  	atomic_set(&sdp->sd_glock_disposal, 0);  	init_completion(&sdp->sd_locking_init); +	init_completion(&sdp->sd_wdack);  	spin_lock_init(&sdp->sd_statfs_spin);  	spin_lock_init(&sdp->sd_rindex_spin); @@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)  	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);  	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);  	INIT_LIST_HEAD(&sdp->sd_log_le_ordered); +	spin_lock_init(&sdp->sd_ordered_lock);  	init_waitqueue_head(&sdp->sd_log_waitq);  	init_waitqueue_head(&sdp->sd_logd_waitq); @@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)  	INIT_LIST_HEAD(&sdp->sd_revoke_list); -	mutex_init(&sdp->sd_freeze_lock); -  	return sdp;  } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae55e248c3b..c7c840e916f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -65,13 +65,10 @@  #include "inode.h"  #include "util.h" -#define QUOTA_USER 1 -#define QUOTA_GROUP 0 -  struct gfs2_quota_change_host {  	u64 qc_change;  	u32 qc_flags; /* GFS2_QCF_... */ -	u32 qc_id; +	struct kqid qc_id;  };  static LIST_HEAD(qd_lru_list); @@ -120,17 +117,24 @@ out:  	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;  } +static u64 qd2index(struct gfs2_quota_data *qd) +{ +	struct kqid qid = qd->qd_id; +	return (2 * (u64)from_kqid(&init_user_ns, qid)) + +		((qid.type == USRQUOTA) ? 
0 : 1); +} +  static u64 qd2offset(struct gfs2_quota_data *qd)  {  	u64 offset; -	offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); +	offset = qd2index(qd);  	offset *= sizeof(struct gfs2_quota);  	return offset;  } -static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,  		    struct gfs2_quota_data **qdp)  {  	struct gfs2_quota_data *qd; @@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,  		return -ENOMEM;  	atomic_set(&qd->qd_count, 1); -	qd->qd_id = id; -	if (user) -		set_bit(QDF_USER, &qd->qd_flags); +	qd->qd_id = qid;  	qd->qd_slot = -1;  	INIT_LIST_HEAD(&qd->qd_reclaim); -	error = gfs2_glock_get(sdp, 2 * (u64)id + !user, +	error = gfs2_glock_get(sdp, qd2index(qd),  			      &gfs2_quota_glops, CREATE, &qd->qd_gl);  	if (error)  		goto fail; @@ -161,7 +163,7 @@ fail:  	return error;  } -static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,  		  struct gfs2_quota_data **qdp)  {  	struct gfs2_quota_data *qd = NULL, *new_qd = NULL; @@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,  		found = 0;  		spin_lock(&qd_lru_lock);  		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { -			if (qd->qd_id == id && -			    !test_bit(QDF_USER, &qd->qd_flags) == !user) { +			if (qid_eq(qd->qd_id, qid)) {  				if (!atomic_read(&qd->qd_count) &&  				    !list_empty(&qd->qd_reclaim)) {  					/* Remove it from reclaim list */ @@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,  			return 0;  		} -		error = qd_alloc(sdp, user, id, &new_qd); +		error = qd_alloc(sdp, qid, &new_qd);  		if (error)  			return error;  	} @@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)  	qd_put(qd);  } -static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,  		    struct gfs2_quota_data **qdp)  {  	int error; -	error = qd_get(sdp, user, id, qdp); +	error = qd_get(sdp, qid, qdp);  	if (error)  		return error; @@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)  	qd_put(qd);  } -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)  {  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);  	struct gfs2_quota_data **qd;  	int error; @@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)  	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)  		return 0; -	error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); +	error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);  	if (error)  		goto out;  	ip->i_res->rs_qa_qd_num++;  	qd++; -	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); +	error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);  	if (error)  		goto out;  	ip->i_res->rs_qa_qd_num++;  	qd++; -	if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { -		error = qdsb_get(sdp, QUOTA_USER, uid, qd); +	if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && +	    !uid_eq(uid, ip->i_inode.i_uid)) { +		error = qdsb_get(sdp, make_kqid_uid(uid), qd);  		if (error)  			goto out;  		ip->i_res->rs_qa_qd_num++;  		qd++;  	} -	if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { -		error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); +	if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && +	    !gid_eq(gid, ip->i_inode.i_gid)) { +		error = qdsb_get(sdp, make_kqid_gid(gid), qd);  		if (error)  			goto out;  		ip->i_res->rs_qa_qd_num++; @@ 
-567,18 +570,10 @@ static int sort_qd(const void *a, const void *b)  	const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;  	const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; -	if (!test_bit(QDF_USER, &qd_a->qd_flags) != -	    !test_bit(QDF_USER, &qd_b->qd_flags)) { -		if (test_bit(QDF_USER, &qd_a->qd_flags)) -			return -1; -		else -			return 1; -	} -	if (qd_a->qd_id < qd_b->qd_id) +	if (qid_lt(qd_a->qd_id, qd_b->qd_id))  		return -1; -	if (qd_a->qd_id > qd_b->qd_id) +	if (qid_lt(qd_b->qd_id, qd_a->qd_id))  		return 1; -  	return 0;  } @@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)  	s64 x;  	mutex_lock(&sdp->sd_quota_mutex); -	gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); +	gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);  	if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {  		qc->qc_change = 0;  		qc->qc_flags = 0; -		if (test_bit(QDF_USER, &qd->qd_flags)) +		if (qd->qd_id.type == USRQUOTA)  			qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); -		qc->qc_id = cpu_to_be32(qd->qd_id); +		qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));  	}  	x = be64_to_cpu(qc->qc_change) + change; @@ -726,7 +721,7 @@ get_a_page:  			goto unlock_out;  	} -	gfs2_trans_add_bh(ip->i_gl, bh, 0); +	gfs2_trans_add_meta(ip->i_gl, bh);  	kaddr = kmap_atomic(page);  	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) @@ -925,7 +920,7 @@ fail:  	return error;  } -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)  {  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);  	struct gfs2_quota_data *qd; @@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type)  	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",  	       sdp->sd_fsname, type, -	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", -	       qd->qd_id); +	       (qd->qd_id.type == USRQUOTA) ? "user" : "group", +	       from_kqid(&init_user_ns, qd->qd_id));  	return 0;  } -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)  {  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);  	struct gfs2_quota_data *qd; @@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)  	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {  		qd = ip->i_res->rs_qa_qd[x]; -		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || -		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) +		if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || +		      qid_eq(qd->qd_id, make_kqid_gid(gid))))  			continue;  		value = (s64)be64_to_cpu(qd->qd_qb.qb_value); @@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)  		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {  			print_message(qd, "exceeded"); -			quota_send_warning(make_kqid(&init_user_ns, -						     test_bit(QDF_USER, &qd->qd_flags) ? -						     USRQUOTA : GRPQUOTA, -						     qd->qd_id), +			quota_send_warning(qd->qd_id,  					   sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);  			error = -EDQUOT; @@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)  			   time_after_eq(jiffies, qd->qd_last_warn +  					 gfs2_tune_get(sdp,  						gt_quota_warn_period) * HZ)) { -			quota_send_warning(make_kqid(&init_user_ns, -						     test_bit(QDF_USER, &qd->qd_flags) ? 
-						     USRQUOTA : GRPQUOTA, -						     qd->qd_id), +			quota_send_warning(qd->qd_id,  					   sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);  			error = print_message(qd, "warning");  			qd->qd_last_warn = jiffies; @@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)  }  void gfs2_quota_change(struct gfs2_inode *ip, s64 change, -		       u32 uid, u32 gid) +		       kuid_t uid, kgid_t gid)  {  	struct gfs2_quota_data *qd;  	unsigned int x; @@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,  	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {  		qd = ip->i_res->rs_qa_qd[x]; -		if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || -		    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { +		if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || +		    qid_eq(qd->qd_id, make_kqid_gid(gid))) {  			do_qc(qd, change);  		}  	} @@ -1170,13 +1159,13 @@ static int gfs2_quota_sync_timeo(struct super_block *sb, int type)  	return gfs2_quota_sync(sb, type);  } -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)  {  	struct gfs2_quota_data *qd;  	struct gfs2_holder q_gh;  	int error; -	error = qd_get(sdp, user, id, &qd); +	error = qd_get(sdp, qid, &qd);  	if (error)  		return error; @@ -1194,7 +1183,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *  	qc->qc_change = be64_to_cpu(str->qc_change);  	qc->qc_flags = be32_to_cpu(str->qc_flags); -	qc->qc_id = be32_to_cpu(str->qc_id); +	qc->qc_id = make_kqid(&init_user_ns, +			      (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, +			      be32_to_cpu(str->qc_id));  }  int gfs2_quota_init(struct gfs2_sbd *sdp) @@ -1257,8 +1248,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)  			if (!qc.qc_change)  				continue; -			error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), -					 qc.qc_id, &qd); +			error = qd_alloc(sdp, qc.qc_id, &qd);  			if (error) {  				brelse(bh);  				goto fail; @@ -1485,21 +1475,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,  	struct gfs2_quota_data *qd;  	struct gfs2_holder q_gh;  	int error; -	int type;  	memset(fdq, 0, sizeof(struct fs_disk_quota));  	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)  		return -ESRCH; /* Crazy XFS error code */ -	if (qid.type == USRQUOTA) -		type = QUOTA_USER; -	else if (qid.type == GRPQUOTA) -		type = QUOTA_GROUP; -	else +	if ((qid.type != USRQUOTA) && +	    (qid.type != GRPQUOTA))  		return -EINVAL; -	error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); +	error = qd_get(sdp, qid, &qd);  	if (error)  		return error;  	error = do_glock(qd, FORCE, &q_gh); @@ -1508,8 +1494,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,  	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;  	fdq->d_version = FS_DQUOT_VERSION; -	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; -	fdq->d_id = from_kqid(&init_user_ns, qid); +	fdq->d_flags = (qid.type == USRQUOTA) ? 
FS_USER_QUOTA : FS_GROUP_QUOTA; +	fdq->d_id = from_kqid_munged(current_user_ns(), qid);  	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;  	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;  	fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; @@ -1535,32 +1521,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,  	int alloc_required;  	loff_t offset;  	int error; -	int type;  	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)  		return -ESRCH; /* Crazy XFS error code */ -	switch(qid.type) { -	case USRQUOTA: -		type = QUOTA_USER; -		if (fdq->d_flags != FS_USER_QUOTA) -			return -EINVAL; -		break; -	case GRPQUOTA: -		type = QUOTA_GROUP; -		if (fdq->d_flags != FS_GROUP_QUOTA) -			return -EINVAL; -		break; -	default: +	if ((qid.type != USRQUOTA) && +	    (qid.type != GRPQUOTA))  		return -EINVAL; -	}  	if (fdq->d_fieldmask & ~GFS2_FIELDMASK)  		return -EINVAL; -	if (fdq->d_id != from_kqid(&init_user_ns, qid)) -		return -EINVAL; -	error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); +	error = qd_get(sdp, qid, &qd);  	if (error)  		return error; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index f25d98b8790..4f5e6e44ed8 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -14,20 +14,21 @@ struct gfs2_inode;  struct gfs2_sbd;  struct shrink_control; -#define NO_QUOTA_CHANGE ((u32)-1) +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);  extern void gfs2_quota_unhold(struct gfs2_inode *ip); -extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);  extern void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);  extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, -			      u32 uid, u32 gid); +			      kuid_t uid, kgid_t gid);  extern int gfs2_quota_sync(struct super_block *sb, int type); -extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);  extern int gfs2_quota_init(struct gfs2_sbd *sdp);  extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); @@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)  	int ret;  	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)  		return 0; -	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (ret)  		return ret;  	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 37ee061d899..d1f51fd73f8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -350,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)  		BUG_ON(len < chunk_size);  		len -= chunk_size;  		block = gfs2_rbm_to_block(&rbm); -		gfs2_rbm_from_block(&rbm, block + chunk_size); -		n_unaligned = 3; -		if (ptr) +		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { +			n_unaligned = 0;  			break; +		} +		if (ptr) { +			n_unaligned = 3; +			break; +		}  		n_unaligned = len & 3;  	} @@ -557,22 +561,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)   */  int gfs2_rs_alloc(struct gfs2_inode *ip)  { -	struct gfs2_blkreserv *res; +	int error = 0; +	
down_write(&ip->i_rw_mutex);  	if (ip->i_res) -		return 0; - -	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); -	if (!res) -		return -ENOMEM; +		goto out; -	RB_CLEAR_NODE(&res->rs_node); +	ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); +	if (!ip->i_res) { +		error = -ENOMEM; +		goto out; +	} -	down_write(&ip->i_rw_mutex); -	if (ip->i_res) -		kmem_cache_free(gfs2_rsrv_cachep, res); -	else -		ip->i_res = res; +	RB_CLEAR_NODE(&ip->i_res->rs_node); +out:  	up_write(&ip->i_rw_mutex);  	return 0;  } @@ -1255,7 +1257,7 @@ fail:  int gfs2_fitrim(struct file *filp, void __user *argp)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);  	struct buffer_head *bh; @@ -1321,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)  			if (ret == 0) {  				bh = rgd->rd_bits[0].bi_bh;  				rgd->rd_flags |= GFS2_RGF_TRIMMED; -				gfs2_trans_add_bh(rgd->rd_gl, bh, 1); +				gfs2_trans_add_meta(rgd->rd_gl, bh);  				gfs2_rgrp_out(rgd, bh->b_data);  				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);  				gfs2_trans_end(sdp); @@ -1424,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,  		rs->rs_free = extlen;  		rs->rs_inum = ip->i_no_addr;  		rs_insert(ip); +	} else { +		if (goal == rgd->rd_last_alloc + rgd->rd_data0) +			rgd->rd_last_alloc = 0;  	}  } @@ -1963,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,  	*n = 1;  	block = gfs2_rbm_to_block(rbm); -	gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); +	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);  	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);  	block++;  	while (*n < elen) {  		ret = gfs2_rbm_from_block(&pos, block);  		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)  			break; -		gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); +		gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);  		gfs2_setbit(&pos, true, GFS2_BLKST_USED);  		(*n)++;  		block++; @@ -2009,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,  			       rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,  			       rbm.bi->bi_len);  		} -		gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); +		gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);  		gfs2_setbit(&rbm, false, new_state);  	} @@ -2152,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,  		if (error == 0) {  			struct gfs2_dinode *di =  				(struct gfs2_dinode *)dibh->b_data; -			gfs2_trans_add_bh(ip->i_gl, dibh, 1); +			gfs2_trans_add_meta(ip->i_gl, dibh);  			di->di_goal_meta = di->di_goal_data =  				cpu_to_be64(ip->i_goal);  			brelse(dibh); @@ -2171,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,  			*generation = rbm.rgd->rd_igeneration++;  	} -	gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); +	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);  	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);  	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); @@ -2218,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)  	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);  	rgd->rd_free += blen;  	rgd->rd_flags &= ~GFS2_RGF_TRIMMED; -	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); +	gfs2_trans_add_meta(rgd->rd_gl, 
rgd->rd_bits[0].bi_bh);  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);  	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -2255,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode)  	if (!rgd)  		return;  	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); -	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); +	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);  	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);  	update_rgrp_lvb_unlinked(rgd, 1); @@ -2276,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)  	rgd->rd_dinodes--;  	rgd->rd_free++; -	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); +	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);  	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);  	update_rgrp_lvb_unlinked(rgd, -1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d6488674d91..cab77b8ba84 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,  	if (error)  		return; -	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); +	gfs2_trans_add_meta(l_ip->i_gl, l_bh);  	spin_lock(&sdp->sd_statfs_spin);  	l_sc->sc_total += total; @@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,  	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;  	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; -	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); +	gfs2_trans_add_meta(l_ip->i_gl, l_bh);  	spin_lock(&sdp->sd_statfs_spin);  	m_sc->sc_total += l_sc->sc_total; @@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,  	       0, sizeof(struct gfs2_statfs_change));  	spin_unlock(&sdp->sd_statfs_spin); -	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); +	gfs2_trans_add_meta(m_ip->i_gl, m_bh);  	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));  } @@ -663,54 +663,6 @@ out:  	return error;  } -/** - * gfs2_freeze_fs - freezes the file system - * @sdp: the file system - * - * This function flushes data and meta data for all machines by - * acquiring the transaction log exclusively.  All journals are - * ensured to be in a clean state as well. - * - * Returns: errno - */ - -int gfs2_freeze_fs(struct gfs2_sbd *sdp) -{ -	int error = 0; - -	mutex_lock(&sdp->sd_freeze_lock); - -	if (!sdp->sd_freeze_count++) { -		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); -		if (error) -			sdp->sd_freeze_count--; -	} - -	mutex_unlock(&sdp->sd_freeze_lock); - -	return error; -} - -/** - * gfs2_unfreeze_fs - unfreezes the file system - * @sdp: the file system - * - * This function allows the file system to proceed by unlocking - * the exclusively held transaction lock.  Other GFS2 nodes are - * now free to acquire the lock shared and go on with their lives. 
- * - */ - -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) -{ -	mutex_lock(&sdp->sd_freeze_lock); - -	if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) -		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - -	mutex_unlock(&sdp->sd_freeze_lock); -} -  void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)  {  	struct gfs2_dinode *str = buf; @@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)  	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);  	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);  	str->di_mode = cpu_to_be32(ip->i_inode.i_mode); -	str->di_uid = cpu_to_be32(ip->i_inode.i_uid); -	str->di_gid = cpu_to_be32(ip->i_inode.i_gid); +	str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); +	str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));  	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);  	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));  	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)  	ret = gfs2_meta_inode_buffer(ip, &bh);  	if (ret == 0) { -		gfs2_trans_add_bh(ip->i_gl, bh, 1); +		gfs2_trans_add_meta(ip->i_gl, bh);  		gfs2_dinode_out(ip, bh->b_data);  		brelse(bh);  	} @@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb)  	int error;  	struct gfs2_jdesc *jd; -	/*  Unfreeze the filesystem, if we need to  */ - -	mutex_lock(&sdp->sd_freeze_lock); -	if (sdp->sd_freeze_count) -		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); -	mutex_unlock(&sdp->sd_freeze_lock); -  	/* No more recovery requests */  	set_bit(SDF_NORECOVERY, &sdp->sd_flags);  	smp_mb(); @@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb)  		return -EINVAL;  	for (;;) { -		error = gfs2_freeze_fs(sdp); +		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);  		if (!error)  			break; @@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb)  static int gfs2_unfreeze(struct super_block *sb)  { -	gfs2_unfreeze_fs(sb->s_fs_info); +	struct gfs2_sbd *sdp = sb->s_fs_info; + +	gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);  	return 0;  } @@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)  	if (error)  		return error; -	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		return error; @@ -1577,6 +1524,7 @@ out:  	/* Case 3 starts here */  	truncate_inode_pages(&inode->i_data, 0);  	gfs2_rs_delete(ip); +	gfs2_ordered_del_inode(ip);  	clear_inode(inode);  	gfs2_dir_hash_inval(ip);  	ip->i_gl->gl_object = NULL; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index a0464680af0..90e3322ffa1 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,  			  struct buffer_head *l_bh);  extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); -extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); -  extern struct file_system_type gfs2_fs_type;  extern struct file_system_type gfs2meta_fs_type;  extern const struct export_operations gfs2_export_ops; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 8056b7b7238..aa5c4804496 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -91,39 +91,37 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)  static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)  { -	unsigned int count; - -	mutex_lock(&sdp->sd_freeze_lock); -	count = sdp->sd_freeze_count; -	
mutex_unlock(&sdp->sd_freeze_lock); +	struct super_block *sb = sdp->sd_vfs; +	int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; -	return snprintf(buf, PAGE_SIZE, "%u\n", count); +	return snprintf(buf, PAGE_SIZE, "%u\n", frozen);  }  static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)  { -	ssize_t ret = len; -	int error = 0; +	int error;  	int n = simple_strtol(buf, NULL, 0);  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	switch (n) {  	case 0: -		gfs2_unfreeze_fs(sdp); +		error = thaw_super(sdp->sd_vfs);  		break;  	case 1: -		error = gfs2_freeze_fs(sdp); +		error = freeze_super(sdp->sd_vfs);  		break;  	default: -		ret = -EINVAL; +		return -EINVAL;  	} -	if (error) +	if (error) {  		fs_warn(sdp, "freeze %d error %d", n, error); +		return error; +	} -	return ret; +	return len;  }  static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) @@ -135,7 +133,7 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)  static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)  {  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	if (simple_strtol(buf, NULL, 0) != 1)  		return -EINVAL; @@ -150,7 +148,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,  				 size_t len)  {  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	if (simple_strtol(buf, NULL, 0) != 1)  		return -EINVAL; @@ -163,7 +161,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,  				size_t len)  {  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	if (simple_strtol(buf, NULL, 0) != 1)  		return -EINVAL; @@ -175,30 +173,40 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,  static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,  					size_t len)  { +	struct kqid qid;  	int error;  	u32 id;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	id = simple_strtoul(buf, NULL, 0); -	error = gfs2_quota_refresh(sdp, 1, id); +	qid = make_kqid(current_user_ns(), USRQUOTA, id); +	if (!qid_valid(qid)) +		return -EINVAL; + +	error = gfs2_quota_refresh(sdp, qid);  	return error ? error : len;  }  static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,  					 size_t len)  { +	struct kqid qid;  	int error;  	u32 id;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	id = simple_strtoul(buf, NULL, 0); -	error = gfs2_quota_refresh(sdp, 0, id); +	qid = make_kqid(current_user_ns(), GRPQUOTA, id); +	if (!qid_valid(qid)) +		return -EINVAL; + +	error = gfs2_quota_refresh(sdp, qid);  	return error ? error : len;  } @@ -213,7 +221,7 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len  	int rv;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,  		    mode); @@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)  	return ret;  } +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ +	int val = completion_done(&sdp->sd_wdack) ? 
1 : 0; + +	return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ +	ssize_t ret = len; +	int val; + +	val = simple_strtol(buf, NULL, 0); + +	if ((val == 1) && +	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) +		complete(&sdp->sd_wdack); +	else +		ret = -EINVAL; +	return ret; +} +  static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)  {  	struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)  GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);  GDLM_ATTR(block,		0644, block_show,		block_store); -GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store); +GDLM_ATTR(withdraw,		0644, wdack_show,		wdack_store);  GDLM_ATTR(jid,			0644, jid_show,			jid_store);  GDLM_ATTR(first,		0644, lkfirst_show,		lkfirst_store);  GDLM_ATTR(first_done,		0444, first_done_show,		NULL); @@ -502,7 +532,7 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,  	unsigned int x, y;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)  		return -EINVAL; @@ -521,7 +551,7 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,  	unsigned int x;  	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; +		return -EPERM;  	x = simple_strtoul(buf, NULL, 0); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 413627072f3..88162fae27a 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -18,6 +18,7 @@  #include "gfs2.h"  #include "incore.h"  #include "glock.h" +#include "inode.h"  #include "log.h"  #include "lops.h"  #include "meta_io.h" @@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)  	sb_end_intwrite(sdp->sd_vfs);  } +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, +					       struct buffer_head *bh, +					       const struct gfs2_log_operations *lops) +{ +	struct gfs2_bufdata *bd; + +	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); +	bd->bd_bh = bh; +	bd->bd_gl = gl; +	bd->bd_ops = lops; +	INIT_LIST_HEAD(&bd->bd_list); +	bh->b_private = bd; +	return bd; +} +  /** - * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction - * @gl: the glock the buffer belongs to + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer   * @bh: The buffer to add - * @meta: True in the case of adding metadata   * + * This is used in two distinct cases: + * i) In ordered write mode + *    We put the data buffer on a list so that we can ensure that its + *    synced to disk at the right time + * ii) In journaled data mode + *    We need to journal the data block in the same way as metadata in + *    the functions above. The difference is that here we have a tag + *    which is two __be64's being the block number (as per meta data) + *    and a flag which says whether the data block needs escaping or + *    not. This means we need a new log entry for each 251 or so data + *    blocks, which isn't an enormous overhead but twice as much as + *    for normal metadata blocks.   
*/ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ +	struct gfs2_trans *tr = current->journal_info; +	struct gfs2_sbd *sdp = gl->gl_sbd; +	struct address_space *mapping = bh->b_page->mapping; +	struct gfs2_inode *ip = GFS2_I(mapping->host); +	struct gfs2_bufdata *bd; -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +	if (!gfs2_is_jdata(ip)) { +		gfs2_ordered_add_inode(ip); +		return; +	} + +	lock_buffer(bh); +	gfs2_log_lock(sdp); +	bd = bh->b_private; +	if (bd == NULL) { +		gfs2_log_unlock(sdp); +		unlock_buffer(bh); +		if (bh->b_private == NULL) +			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); +		lock_buffer(bh); +		gfs2_log_lock(sdp); +	} +	gfs2_assert(sdp, bd->bd_gl == gl); +	tr->tr_touched = 1; +	if (list_empty(&bd->bd_list)) { +		set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); +		set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); +		gfs2_pin(sdp, bd->bd_bh); +		tr->tr_num_databuf_new++; +		sdp->sd_log_num_databuf++; +		list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); +	} +	gfs2_log_unlock(sdp); +	unlock_buffer(bh); +} + +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)  { +	struct gfs2_meta_header *mh; +	struct gfs2_trans *tr; + +	tr = current->journal_info; +	tr->tr_touched = 1; +	if (!list_empty(&bd->bd_list)) +		return; +	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); +	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); +	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; +	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { +		printk(KERN_ERR +		       "Attempting to add uninitialised block to journal (inplace block=%lld)\n", +		       (unsigned long long)bd->bd_bh->b_blocknr); +		BUG(); +	} +	gfs2_pin(sdp, bd->bd_bh); +	mh->__pad0 = cpu_to_be64(0); +	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); +	sdp->sd_log_num_buf++; +	list_add(&bd->bd_list, &sdp->sd_log_le_buf); +	tr->tr_num_buf_new++; +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ +  	struct gfs2_sbd *sdp = gl->gl_sbd;  	struct gfs2_bufdata *bd;  	lock_buffer(bh);  	gfs2_log_lock(sdp);  	bd = bh->b_private; -	if (bd) -		gfs2_assert(sdp, bd->bd_gl == gl); -	else { +	if (bd == NULL) {  		gfs2_log_unlock(sdp);  		unlock_buffer(bh); -		gfs2_attach_bufdata(gl, bh, meta); -		bd = bh->b_private; +		lock_page(bh->b_page); +		if (bh->b_private == NULL) +			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); +		unlock_page(bh->b_page);  		lock_buffer(bh);  		gfs2_log_lock(sdp);  	} -	lops_add(sdp, bd); +	gfs2_assert(sdp, bd->bd_gl == gl); +	meta_lo_add(sdp, bd);  	gfs2_log_unlock(sdp);  	unlock_buffer(bh);  }  void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)  { +	struct gfs2_glock *gl = bd->bd_gl; +	struct gfs2_trans *tr = current->journal_info; +  	BUG_ON(!list_empty(&bd->bd_list));  	BUG_ON(!list_empty(&bd->bd_ail_st_list));  	BUG_ON(!list_empty(&bd->bd_ail_gl_list)); -	lops_init_le(bd, &gfs2_revoke_lops); -	lops_add(sdp, bd); +	bd->bd_ops = &gfs2_revoke_lops; +	tr->tr_touched = 1; +	tr->tr_num_revoke++; +	sdp->sd_log_num_revoke++; +	atomic_inc(&gl->gl_revokes); +	set_bit(GLF_LFLUSH, &gl->gl_flags); +	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);  }  void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index bf2ae9aeee7..1e6e7da25a1 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,  			    unsigned int revokes);  extern void 
gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);  extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);  extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f00d7c5744f..6402fb69d71 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)  		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); +		if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) +			wait_for_completion(&sdp->sd_wdack); +  		if (lm->lm_unmount) {  			fs_err(sdp, "telling LM to unmount\n");  			lm->lm_unmount(sdp); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 76c144b3c9b..ecd37f30ab9 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,  	if (error)  		goto out_gunlock; -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	dataptrs = GFS2_EA2DATAPTRS(ea);  	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,  	error = gfs2_meta_inode_buffer(ip, &dibh);  	if (!error) {  		ip->i_inode.i_ctime = CURRENT_TIME; -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,  	if (error)  		return error; -	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		goto out_alloc; @@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,  		}  		if (din) { -			gfs2_trans_add_bh(ip->i_gl, bh[x], 1); +			gfs2_trans_add_meta(ip->i_gl, bh[x]);  			memcpy(pos, din, cp_size);  			din += sdp->sd_jbsize;  		} @@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)  		return error;  	gfs2_trans_add_unrevoke(sdp, block, 1);  	*bhp = gfs2_meta_new(ip->i_gl, block); -	gfs2_trans_add_bh(ip->i_gl, *bhp, 1); +	gfs2_trans_add_meta(ip->i_gl, *bhp);  	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);  	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); @@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,  				return error;  			gfs2_trans_add_unrevoke(sdp, block, 1);  			bh = gfs2_meta_new(ip->i_gl, block); -			gfs2_trans_add_bh(ip->i_gl, bh, 1); +			gfs2_trans_add_meta(ip->i_gl, bh);  			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);  			gfs2_add_inode_blocks(&ip->i_inode, 1); @@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,  	error = gfs2_meta_inode_buffer(ip, &dibh);  	if (!error) {  		ip->i_inode.i_ctime = CURRENT_TIME; -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip,  	struct gfs2_ea_header *prev = el->el_prev;  	u32 len; -	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); +	
gfs2_trans_add_meta(ip->i_gl, el->el_bh);  	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {  		ea->ea_type = GFS2_EATYPE_UNUSED; @@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,  	if (error)  		return error; -	gfs2_trans_add_bh(ip->i_gl, bh, 1); +	gfs2_trans_add_meta(ip->i_gl, bh);  	if (es->ea_split)  		ea = ea_split_ea(ea); @@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,  	if (error)  		goto out;  	ip->i_inode.i_ctime = CURRENT_TIME; -	gfs2_trans_add_bh(ip->i_gl, dibh, 1); +	gfs2_trans_add_meta(ip->i_gl, dibh);  	gfs2_dinode_out(ip, dibh->b_data);  	brelse(dibh);  out: @@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip,  	struct gfs2_ea_header *ea = es->es_ea;  	int error; -	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); +	gfs2_trans_add_meta(ip->i_gl, es->es_bh);  	if (es->ea_split)  		ea = ea_split_ea(ea); @@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,  			goto out;  		} -		gfs2_trans_add_bh(ip->i_gl, indbh, 1); +		gfs2_trans_add_meta(ip->i_gl, indbh);  	} else {  		u64 blk;  		unsigned int n = 1; @@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,  			return error;  		gfs2_trans_add_unrevoke(sdp, blk, 1);  		indbh = gfs2_meta_new(ip->i_gl, blk); -		gfs2_trans_add_bh(ip->i_gl, indbh, 1); +		gfs2_trans_add_meta(ip->i_gl, indbh);  		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);  		gfs2_buffer_clear_tail(indbh, mh_size); @@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)  	if (error)  		return error; -	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); +	gfs2_trans_add_meta(ip->i_gl, el->el_bh);  	if (prev) {  		u32 len; @@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)  	error = gfs2_meta_inode_buffer(ip, &dibh);  	if (!error) {  		ip->i_inode.i_ctime = CURRENT_TIME; -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)  	if (GFS2_EA_IS_STUFFED(el.el_ea)) {  		error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);  		if (error == 0) { -			gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); +			gfs2_trans_add_meta(ip->i_gl, el.el_bh);  			memcpy(GFS2_EA2DATA(el.el_ea), data,  			       GFS2_EA_DATA_LEN(el.el_ea));  		} @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)  	if (error)  		goto out_gunlock; -	gfs2_trans_add_bh(ip->i_gl, indbh, 1); +	gfs2_trans_add_meta(ip->i_gl, indbh);  	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));  	bstart = 0; @@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)  	error = gfs2_meta_inode_buffer(ip, &dibh);  	if (!error) { -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)  	error = gfs2_meta_inode_buffer(ip, &dibh);  	if (!error) { -		gfs2_trans_add_bh(ip->i_gl, dibh, 1); +		gfs2_trans_add_meta(ip->i_gl, dibh);  		gfs2_dinode_out(ip, dibh->b_data);  		brelse(dibh);  	} @@ -1461,7 +1461,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)  	if (error)  		return error; -	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); +	error = 
gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);  	if (error)  		return error; diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index b77c5bc20f8..998e3a6decf 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -1,6 +1,6 @@  config HFS_FS -	tristate "Apple Macintosh file system support (EXPERIMENTAL)" -	depends on BLOCK && EXPERIMENTAL +	tristate "Apple Macintosh file system support" +	depends on BLOCK  	select NLS  	help  	  If you say Y here, you will be able to mount Macintosh-formatted diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 422dde2ec0a..5f7f1abd5f6 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -51,7 +51,7 @@ done:   */  static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	int len, err;  	char strbuf[HFS_MAX_NAMELEN]; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 0b35903219b..3031dfdd235 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)  	return block_read_full_page(page, hfs_get_block);  } +static void hfs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		hfs_file_truncate(inode); +	} +} +  static int hfs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,  	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,  				hfs_get_block,  				&HFS_I(mapping->host)->phys_size); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		hfs_write_failed(mapping, pos + len);  	return ret;  } @@ -120,7 +127,8 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,  		const struct iovec *iov, loff_t offset, unsigned long nr_segs)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; +	struct address_space *mapping = file->f_mapping; +	struct inode *inode = file_inode(file)->i_mapping->host;  	ssize_t ret;  	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, @@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,  		loff_t end = offset + iov_length(iov, nr_segs);  		if (end > isize) -			vmtruncate(inode, isize); +			hfs_write_failed(mapping, end);  	}  	return ret; @@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)  	    attr->ia_size != i_size_read(inode)) {  		inode_dio_wait(inode); -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			return error; + +		truncate_setsize(inode, attr->ia_size); +		hfs_file_truncate(inode);  	}  	setattr_copy(inode, attr); @@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {  static const struct inode_operations hfs_file_inode_operations = {  	.lookup		= hfs_file_lookup, -	.truncate	= hfs_file_truncate,  	.setattr	= hfs_inode_setattr,  	.setxattr	= hfs_setxattr,  	.getxattr	= hfs_getxattr, diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 3cc0df73015..09d278bb7b9 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -5,5 +5,5 @@  
obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o  hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ -		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o - +		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ +		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c new file mode 100644 index 00000000000..8d691f12471 --- /dev/null +++ b/fs/hfsplus/attributes.c @@ -0,0 +1,399 @@ +/* + * linux/fs/hfsplus/attributes.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handling of records in attributes tree + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct kmem_cache *hfsplus_attr_tree_cachep; + +int hfsplus_create_attr_tree_cache(void) +{ +	if (hfsplus_attr_tree_cachep) +		return -EEXIST; + +	hfsplus_attr_tree_cachep = +		kmem_cache_create("hfsplus_attr_cache", +			sizeof(hfsplus_attr_entry), 0, +			SLAB_HWCACHE_ALIGN, NULL); +	if (!hfsplus_attr_tree_cachep) +		return -ENOMEM; + +	return 0; +} + +void hfsplus_destroy_attr_tree_cache(void) +{ +	kmem_cache_destroy(hfsplus_attr_tree_cachep); +} + +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, +				const hfsplus_btree_key *k2) +{ +	__be32 k1_cnid, k2_cnid; + +	k1_cnid = k1->attr.cnid; +	k2_cnid = k2->attr.cnid; +	if (k1_cnid != k2_cnid) +		return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1; + +	return hfsplus_strcmp( +			(const struct hfsplus_unistr *)&k1->attr.key_name, +			(const struct hfsplus_unistr *)&k2->attr.key_name); +} + +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, +			u32 cnid, const char *name) +{ +	int len; + +	memset(key, 0, sizeof(struct hfsplus_attr_key)); +	key->attr.cnid = cpu_to_be32(cnid); +	if (name) { +		len = strlen(name); +		if (len > HFSPLUS_ATTR_MAX_STRLEN) { +			printk(KERN_ERR "hfs: invalid xattr name's length\n"); +			return -EINVAL; +		} +		hfsplus_asc2uni(sb, +				(struct hfsplus_unistr *)&key->attr.key_name, +				HFSPLUS_ATTR_MAX_STRLEN, name, len); +		len = be16_to_cpu(key->attr.key_name.length); +	} else { +		key->attr.key_name.length = 0; +		len = 0; +	} + +	/* The length of the key, as stored in key_len field, does not include +	 * the size of the key_len field itself. +	 * So, offsetof(hfsplus_attr_key, key_name) is a trick because +	 * it takes into consideration key_len field (__be16) of +	 * hfsplus_attr_key structure instead of length field (__be16) of +	 * hfsplus_attr_unistr structure. +	 */ +	key->key_len = +		cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + +				2 * len); + +	return 0; +} + +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, +					u32 cnid, +					struct hfsplus_attr_unistr *name) +{ +	int ustrlen; + +	memset(key, 0, sizeof(struct hfsplus_attr_key)); +	ustrlen = be16_to_cpu(name->length); +	key->attr.cnid = cpu_to_be32(cnid); +	key->attr.key_name.length = cpu_to_be16(ustrlen); +	ustrlen *= 2; +	memcpy(key->attr.key_name.unicode, name->unicode, ustrlen); + +	/* The length of the key, as stored in key_len field, does not include +	 * the size of the key_len field itself. +	 * So, offsetof(hfsplus_attr_key, key_name) is a trick because +	 * it takes into consideration key_len field (__be16) of +	 * hfsplus_attr_key structure instead of length field (__be16) of +	 * hfsplus_attr_unistr structure. 
+	 */ +	key->key_len = +		cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + +				ustrlen); +} + +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) +{ +	return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); +} + +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry) +{ +	if (entry) +		kmem_cache_free(hfsplus_attr_tree_cachep, entry); +} + +#define HFSPLUS_INVALID_ATTR_RECORD -1 + +static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type, +				u32 cnid, const void *value, size_t size) +{ +	if (record_type == HFSPLUS_ATTR_FORK_DATA) { +		/* +		 * Mac OS X supports only inline data attributes. +		 * Do nothing +		 */ +		memset(entry, 0, sizeof(*entry)); +		return sizeof(struct hfsplus_attr_fork_data); +	} else if (record_type == HFSPLUS_ATTR_EXTENTS) { +		/* +		 * Mac OS X supports only inline data attributes. +		 * Do nothing. +		 */ +		memset(entry, 0, sizeof(*entry)); +		return sizeof(struct hfsplus_attr_extents); +	} else if (record_type == HFSPLUS_ATTR_INLINE_DATA) { +		u16 len; + +		memset(entry, 0, sizeof(struct hfsplus_attr_inline_data)); +		entry->inline_data.record_type = cpu_to_be32(record_type); +		if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE) +			len = size; +		else +			return HFSPLUS_INVALID_ATTR_RECORD; +		entry->inline_data.length = cpu_to_be16(len); +		memcpy(entry->inline_data.raw_bytes, value, len); +		/* +		 * Align len on two-byte boundary. +		 * It needs to add pad byte if we have odd len. +		 */ +		len = round_up(len, 2); +		return offsetof(struct hfsplus_attr_inline_data, raw_bytes) + +					len; +	} else /* invalid input */ +		memset(entry, 0, sizeof(*entry)); + +	return HFSPLUS_INVALID_ATTR_RECORD; +} + +int hfsplus_find_attr(struct super_block *sb, u32 cnid, +			const char *name, struct hfs_find_data *fd) +{ +	int err = 0; + +	dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + +	if (!HFSPLUS_SB(sb)->attr_tree) { +		printk(KERN_ERR "hfs: attributes file doesn't exist\n"); +		return -EINVAL; +	} + +	if (name) { +		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name); +		if (err) +			goto failed_find_attr; +		err = hfs_brec_find(fd, hfs_find_rec_by_key); +		if (err) +			goto failed_find_attr; +	} else { +		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL); +		if (err) +			goto failed_find_attr; +		err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid); +		if (err) +			goto failed_find_attr; +	} + +failed_find_attr: +	return err; +} + +int hfsplus_attr_exists(struct inode *inode, const char *name) +{ +	int err = 0; +	struct super_block *sb = inode->i_sb; +	struct hfs_find_data fd; + +	if (!HFSPLUS_SB(sb)->attr_tree) +		return 0; + +	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); +	if (err) +		return 0; + +	err = hfsplus_find_attr(sb, inode->i_ino, name, &fd); +	if (err) +		goto attr_not_found; + +	hfs_find_exit(&fd); +	return 1; + +attr_not_found: +	hfs_find_exit(&fd); +	return 0; +} + +int hfsplus_create_attr(struct inode *inode, +				const char *name, +				const void *value, size_t size) +{ +	struct super_block *sb = inode->i_sb; +	struct hfs_find_data fd; +	hfsplus_attr_entry *entry_ptr; +	int entry_size; +	int err; + +	dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", +		name ? 
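[Editor's note: for reference, the size hfsplus_attr_build_record() returns for an inline-data record is the fixed header plus the value rounded up to two bytes. A hypothetical user-space helper with the constants written out (16 bytes of header per the struct layout, 3802 from HFSPLUS_MAX_INLINE_DATA_SIZE):

/* record_type(4) + reserved1(4) + reserved2(6) + length(2) == 16 bytes,
 * i.e. offsetof(struct hfsplus_attr_inline_data, raw_bytes) */
static long inline_record_size(unsigned long value_len)
{
	if (value_len > 3802)			/* HFSPLUS_MAX_INLINE_DATA_SIZE */
		return -1;			/* HFSPLUS_INVALID_ATTR_RECORD */
	return 16 + ((value_len + 1) & ~1UL);	/* round_up(len, 2) */
}
]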
name : NULL, inode->i_ino); + +	if (!HFSPLUS_SB(sb)->attr_tree) { +		printk(KERN_ERR "hfs: attributes file doesn't exist\n"); +		return -EINVAL; +	} + +	entry_ptr = hfsplus_alloc_attr_entry(); +	if (!entry_ptr) +		return -ENOMEM; + +	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); +	if (err) +		goto failed_init_create_attr; + +	if (name) { +		err = hfsplus_attr_build_key(sb, fd.search_key, +						inode->i_ino, name); +		if (err) +			goto failed_create_attr; +	} else { +		err = -EINVAL; +		goto failed_create_attr; +	} + +	/* Mac OS X supports only inline data attributes. */ +	entry_size = hfsplus_attr_build_record(entry_ptr, +					HFSPLUS_ATTR_INLINE_DATA, +					inode->i_ino, +					value, size); +	if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) { +		err = -EINVAL; +		goto failed_create_attr; +	} + +	err = hfs_brec_find(&fd, hfs_find_rec_by_key); +	if (err != -ENOENT) { +		if (!err) +			err = -EEXIST; +		goto failed_create_attr; +	} + +	err = hfs_brec_insert(&fd, entry_ptr, entry_size); +	if (err) +		goto failed_create_attr; + +	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + +failed_create_attr: +	hfs_find_exit(&fd); + +failed_init_create_attr: +	hfsplus_destroy_attr_entry(entry_ptr); +	return err; +} + +static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, +					struct hfs_find_data *fd) +{ +	int err = 0; +	__be32 found_cnid, record_type; + +	hfs_bnode_read(fd->bnode, &found_cnid, +			fd->keyoffset + +			offsetof(struct hfsplus_attr_key, cnid), +			sizeof(__be32)); +	if (cnid != be32_to_cpu(found_cnid)) +		return -ENOENT; + +	hfs_bnode_read(fd->bnode, &record_type, +			fd->entryoffset, sizeof(record_type)); + +	switch (be32_to_cpu(record_type)) { +	case HFSPLUS_ATTR_INLINE_DATA: +		/* All is OK. Do nothing. */ +		break; +	case HFSPLUS_ATTR_FORK_DATA: +	case HFSPLUS_ATTR_EXTENTS: +		printk(KERN_ERR "hfs: only inline data xattr are supported\n"); +		return -EOPNOTSUPP; +	default: +		printk(KERN_ERR "hfs: invalid extended attribute record\n"); +		return -ENOENT; +	} + +	err = hfs_brec_remove(fd); +	if (err) +		return err; + +	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); +	return err; +} + +int hfsplus_delete_attr(struct inode *inode, const char *name) +{ +	int err = 0; +	struct super_block *sb = inode->i_sb; +	struct hfs_find_data fd; + +	dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", +		name ? 
name : NULL, inode->i_ino); + +	if (!HFSPLUS_SB(sb)->attr_tree) { +		printk(KERN_ERR "hfs: attributes file doesn't exist\n"); +		return -EINVAL; +	} + +	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); +	if (err) +		return err; + +	if (name) { +		err = hfsplus_attr_build_key(sb, fd.search_key, +						inode->i_ino, name); +		if (err) +			goto out; +	} else { +		printk(KERN_ERR "hfs: invalid extended attribute name\n"); +		err = -EINVAL; +		goto out; +	} + +	err = hfs_brec_find(&fd, hfs_find_rec_by_key); +	if (err) +		goto out; + +	err = __hfsplus_delete_attr(inode, inode->i_ino, &fd); +	if (err) +		goto out; + +out: +	hfs_find_exit(&fd); +	return err; +} + +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) +{ +	int err = 0; +	struct hfs_find_data fd; + +	dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + +	if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { +		printk(KERN_ERR "hfs: attributes file doesn't exist\n"); +		return -EINVAL; +	} + +	err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd); +	if (err) +		return err; + +	for (;;) { +		err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); +		if (err) { +			if (err != -ENOENT) +				printk(KERN_ERR "hfs: xattr search failed.\n"); +			goto end_delete_all; +		} + +		err = __hfsplus_delete_attr(dir, cnid, &fd); +		if (err) +			goto end_delete_all; +	} + +end_delete_all: +	hfs_find_exit(&fd); +	return err; +} diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5d799c13205..d73c98d1ee9 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -24,7 +24,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)  	fd->key = ptr + tree->max_key_len + 2;  	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",  		tree->cnid, __builtin_return_address(0)); -	mutex_lock(&tree->tree_lock); +	switch (tree->cnid) { +	case HFSPLUS_CAT_CNID: +		mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); +		break; +	case HFSPLUS_EXT_CNID: +		mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); +		break; +	case HFSPLUS_ATTR_CNID: +		mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); +		break; +	default: +		BUG(); +	}  	return 0;  } @@ -38,15 +50,73 @@ void hfs_find_exit(struct hfs_find_data *fd)  	fd->tree = NULL;  } -/* Find the record in bnode that best matches key (not greater than...)*/ -int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, +				struct hfs_find_data *fd, +				int *begin, +				int *end, +				int *cur_rec) +{ +	__be32 cur_cnid, search_cnid; + +	if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { +		cur_cnid = fd->key->ext.cnid; +		search_cnid = fd->search_key->ext.cnid; +	} else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { +		cur_cnid = fd->key->cat.parent; +		search_cnid = fd->search_key->cat.parent; +	} else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { +		cur_cnid = fd->key->attr.cnid; +		search_cnid = fd->search_key->attr.cnid; +	} else +		BUG(); + +	if (cur_cnid == search_cnid) { +		(*end) = (*cur_rec); +		if ((*begin) == (*end)) +			return 1; +	} else { +		if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) +			(*begin) = (*cur_rec) + 1; +		else +			(*end) = (*cur_rec) - 1; +	} + +	return 0; +} + +int hfs_find_rec_by_key(struct hfs_bnode *bnode, +				struct hfs_find_data *fd, +				int *begin, +				int *end, +				int *cur_rec)  {  	int cmpval; + +	cmpval = bnode->tree->keycmp(fd->key, fd->search_key); +	if (!cmpval) { +		(*end) = (*cur_rec); +		return 1; +	} +	if (cmpval < 0) +		(*begin) = (*cur_rec) + 1; +	else +		*(end) = (*cur_rec) 
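[Editor's note: the mutex_lock_nested() switch added to hfs_find_init() earlier in this hunk exists for lockdep. All three tree_lock mutexes are initialized at the same site, so they share one lock class, and taking the extents or attributes tree lock while holding the catalog tree lock would otherwise look like recursive locking; the subclasses encode the legal order. Sketch of a nested lookup under that scheme (kernel context, illustrative only):

struct hfs_find_data cat_fd, attr_fd;

hfs_find_init(sbi->cat_tree, &cat_fd);		/* CATALOG_BTREE_MUTEX */
/* ... catalog lookup ... */
hfs_find_init(sbi->attr_tree, &attr_fd);	/* ATTR_BTREE_MUTEX: taking
						 * this under the catalog
						 * lock now passes lockdep */
/* ... attribute lookup ... */
hfs_find_exit(&attr_fd);
hfs_find_exit(&cat_fd);
]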
- 1; + +	return 0; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, +					search_strategy_t rec_found) +{  	u16 off, len, keylen;  	int rec;  	int b, e;  	int res; +	if (!rec_found) +		BUG(); +  	b = 0;  	e = bnode->num_recs - 1;  	res = -ENOENT; @@ -59,17 +129,12 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)  			goto fail;  		}  		hfs_bnode_read(bnode, fd->key, off, keylen); -		cmpval = bnode->tree->keycmp(fd->key, fd->search_key); -		if (!cmpval) { -			e = rec; +		if (rec_found(bnode, fd, &b, &e, &rec)) {  			res = 0;  			goto done;  		} -		if (cmpval < 0) -			b = rec + 1; -		else -			e = rec - 1;  	} while (b <= e); +  	if (rec != e && e >= 0) {  		len = hfs_brec_lenoff(bnode, e, &off);  		keylen = hfs_brec_keylen(bnode, e); @@ -79,19 +144,21 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)  		}  		hfs_bnode_read(bnode, fd->key, off, keylen);  	} +  done:  	fd->record = e;  	fd->keyoffset = off;  	fd->keylength = keylen;  	fd->entryoffset = off + keylen;  	fd->entrylength = len - keylen; +  fail:  	return res;  }  /* Traverse a B*Tree from the root to a leaf finding best fit to key */  /* Return allocated copy of node found, set recnum to best record */ -int hfs_brec_find(struct hfs_find_data *fd) +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)  {  	struct hfs_btree *tree;  	struct hfs_bnode *bnode; @@ -122,7 +189,7 @@ int hfs_brec_find(struct hfs_find_data *fd)  			goto invalid;  		bnode->parent = parent; -		res = __hfs_brec_find(bnode, fd); +		res = __hfs_brec_find(bnode, fd, do_key_compare);  		if (!height)  			break;  		if (fd->record < 0) @@ -149,7 +216,7 @@ int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)  {  	int res; -	res = hfs_brec_find(fd); +	res = hfs_brec_find(fd, hfs_find_rec_by_key);  	if (res)  		return res;  	if (fd->entrylength > rec_len) diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 4cfbe2edd29..6feefc0cb48 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -176,12 +176,14 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)  	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);  	/* are all of the bits in range? 
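[Editor's note: the refactoring just above keeps the bisection loop in __hfs_brec_find() and moves the "stop or narrow" decision into a callback. A self-contained user-space model of the same shape, with an exact-match strategy corresponding to hfs_find_rec_by_key() (names are illustrative):

#include <stdio.h>

typedef int (*strategy_t)(int key, int target, int *b, int *e, int *cur);

/* Like hfs_find_rec_by_key(): stop on equality, otherwise narrow. */
static int exact_match(int key, int target, int *b, int *e, int *cur)
{
	if (key == target) {
		*e = *cur;		/* fd->record ends up here */
		return 1;
	}
	if (key < target)
		*b = *cur + 1;
	else
		*e = *cur - 1;
	return 0;
}

static int brec_find(const int *recs, int n, int target, strategy_t found)
{
	int b = 0, e = n - 1;

	while (b <= e) {
		int cur = (e + b) / 2;

		if (found(recs[cur], target, &b, &e, &cur))
			return cur;
	}
	return -1;			/* -ENOENT */
}

int main(void)
{
	int recs[] = { 3, 7, 9, 12, 20 };

	printf("%d\n", brec_find(recs, 5, 12, exact_match));	/* prints 3 */
	return 0;
}

hfs_find_1st_rec_by_cnid() is a second strategy: on a cnid match it keeps shrinking the window until begin == end, so the search lands on the first record for that cnid rather than an arbitrary one.]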
*/  	if ((offset + count) > sbi->total_blocks) -		return -2; +		return -ENOENT;  	mutex_lock(&sbi->alloc_mutex);  	mapping = sbi->alloc_file->i_mapping;  	pnr = offset / PAGE_CACHE_BITS;  	page = read_mapping_page(mapping, pnr, NULL); +	if (IS_ERR(page)) +		goto kaboom;  	pptr = kmap(page);  	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;  	end = pptr + PAGE_CACHE_BITS / 32; @@ -214,6 +216,8 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)  		set_page_dirty(page);  		kunmap(page);  		page = read_mapping_page(mapping, ++pnr, NULL); +		if (IS_ERR(page)) +			goto kaboom;  		pptr = kmap(page);  		curr = pptr;  		end = pptr + PAGE_CACHE_BITS / 32; @@ -232,4 +236,11 @@ out:  	mutex_unlock(&sbi->alloc_mutex);  	return 0; + +kaboom: +	printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n", +			PTR_ERR(page)); +	mutex_unlock(&sbi->alloc_mutex); + +	return -EIO;  } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 1c42cc5b899..f31ac6f404f 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -62,7 +62,8 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)  	tree = node->tree;  	if (node->type == HFS_NODE_LEAF || -	    tree->attributes & HFS_TREE_VARIDXKEYS) +	    tree->attributes & HFS_TREE_VARIDXKEYS || +	    node->tree->cnid == HFSPLUS_ATTR_CNID)  		key_len = hfs_bnode_read_u16(node, off) + 2;  	else  		key_len = tree->max_key_len + 2; @@ -314,7 +315,8 @@ void hfs_bnode_dump(struct hfs_bnode *node)  		if (i && node->type == HFS_NODE_INDEX) {  			int tmp; -			if (node->tree->attributes & HFS_TREE_VARIDXKEYS) +			if (node->tree->attributes & HFS_TREE_VARIDXKEYS || +					node->tree->cnid == HFSPLUS_ATTR_CNID)  				tmp = hfs_bnode_read_u16(node, key_off) + 2;  			else  				tmp = node->tree->max_key_len + 2; @@ -646,6 +648,8 @@ void hfs_bnode_put(struct hfs_bnode *node)  		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {  			hfs_bnode_unhash(node);  			spin_unlock(&tree->hash_lock); +			hfs_bnode_clear(node, 0, +				PAGE_CACHE_SIZE * tree->pages_per_bnode);  			hfs_bmap_free(node);  			hfs_bnode_free(node);  			return; diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2a734cfccc9..298d4e45604 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -36,7 +36,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)  		return 0;  	if ((node->type == HFS_NODE_INDEX) && -	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { +	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && +	   (node->tree->cnid != HFSPLUS_ATTR_CNID)) {  		retval = node->tree->max_key_len + 2;  	} else {  		recoff = hfs_bnode_read_u16(node, @@ -151,12 +152,13 @@ skip:  		/* get index key */  		hfs_bnode_read_key(new_node, fd->search_key, 14); -		__hfs_brec_find(fd->bnode, fd); +		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);  		hfs_bnode_put(new_node);  		new_node = NULL; -		if (tree->attributes & HFS_TREE_VARIDXKEYS) +		if ((tree->attributes & HFS_TREE_VARIDXKEYS) || +				(tree->cnid == HFSPLUS_ATTR_CNID))  			key_len = be16_to_cpu(fd->search_key->key_len) + 2;  		else {  			fd->search_key->key_len = @@ -201,7 +203,7 @@ again:  		hfs_bnode_put(node);  		node = fd->bnode = parent; -		__hfs_brec_find(node, fd); +		__hfs_brec_find(node, fd, hfs_find_rec_by_key);  		goto again;  	}  	hfs_bnode_write_u16(node, @@ -367,12 +369,13 @@ again:  	parent = hfs_bnode_find(tree, node->parent);  	if (IS_ERR(parent))  		return PTR_ERR(parent); -	__hfs_brec_find(parent, fd); +	__hfs_brec_find(parent, fd, hfs_find_rec_by_key);  	hfs_bnode_dump(parent);  	rec 
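[Editor's note: the "kaboom" path added to hfsplus_block_free() above follows the standard page-cache idiom: read_mapping_page() returns either a valid page or an ERR_PTR()-encoded errno, never NULL, so each call must be checked before kmap(). Condensed shape (kernel context):

page = read_mapping_page(mapping, pnr, NULL);
if (IS_ERR(page)) {
	printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n",
			PTR_ERR(page));
	mutex_unlock(&sbi->alloc_mutex);
	return -EIO;
}
pptr = kmap(page);
/* ... clear bitmap words, set_page_dirty(page), kunmap(page) ... */
]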
= fd->record;  	/* size difference between old and new key */ -	if (tree->attributes & HFS_TREE_VARIDXKEYS) +	if ((tree->attributes & HFS_TREE_VARIDXKEYS) || +				(tree->cnid == HFSPLUS_ATTR_CNID))  		newkeylen = hfs_bnode_read_u16(node, 14) + 2;  	else  		fd->keylength = newkeylen = tree->max_key_len + 2; @@ -427,7 +430,7 @@ skip:  		hfs_bnode_read_key(new_node, fd->search_key, 14);  		cnid = cpu_to_be32(new_node->this); -		__hfs_brec_find(fd->bnode, fd); +		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);  		hfs_brec_insert(fd, &cnid, sizeof(cnid));  		hfs_bnode_put(fd->bnode);  		hfs_bnode_put(new_node); @@ -495,13 +498,15 @@ static int hfs_btree_inc_height(struct hfs_btree *tree)  		/* insert old root idx into new root */  		node->parent = tree->root;  		if (node->type == HFS_NODE_LEAF || -		    tree->attributes & HFS_TREE_VARIDXKEYS) +				tree->attributes & HFS_TREE_VARIDXKEYS || +				tree->cnid == HFSPLUS_ATTR_CNID)  			key_size = hfs_bnode_read_u16(node, 14) + 2;  		else  			key_size = tree->max_key_len + 2;  		hfs_bnode_copy(new_node, 14, node, 14, key_size); -		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { +		if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && +				(tree->cnid != HFSPLUS_ATTR_CNID)) {  			key_size = tree->max_key_len + 2;  			hfs_bnode_write_u16(new_node, 14, tree->max_key_len);  		} diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 21023d9f8ff..efb689c21a9 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -98,6 +98,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)  			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);  		}  		break; +	case HFSPLUS_ATTR_CNID: +		if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { +			printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", +				tree->max_key_len); +			goto fail_page; +		} +		tree->keycmp = hfsplus_attr_bin_cmp_key; +		break;  	default:  		printk(KERN_ERR "hfs: unknown B*Tree requested\n");  		goto fail_page; @@ -159,7 +167,7 @@ void hfs_btree_close(struct hfs_btree *tree)  	kfree(tree);  } -void hfs_btree_write(struct hfs_btree *tree) +int hfs_btree_write(struct hfs_btree *tree)  {  	struct hfs_btree_header_rec *head;  	struct hfs_bnode *node; @@ -168,7 +176,7 @@ void hfs_btree_write(struct hfs_btree *tree)  	node = hfs_bnode_find(tree, 0);  	if (IS_ERR(node))  		/* panic? 
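[Editor's note: the condition "(attributes & HFS_TREE_VARIDXKEYS) || cnid == HFSPLUS_ATTR_CNID" now appears in bnode.c, brec.c and btree.c: the attributes tree uses variable-length index keys regardless of its header flags. A hypothetical helper that the patch does not add, but that captures the rule in one place:

static inline int hfs_tree_varidx_keys(const struct hfs_btree *tree)
{
	return (tree->attributes & HFS_TREE_VARIDXKEYS) ||
	       tree->cnid == HFSPLUS_ATTR_CNID;
}

/* An index-node key length then reads as:
 *	hfs_tree_varidx_keys(tree) ? hfs_bnode_read_u16(node, off) + 2
 *				   : tree->max_key_len + 2
 */
]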
*/ -		return; +		return -EIO;  	/* Load the header */  	page = node->page[0];  	head = (struct hfs_btree_header_rec *)(kmap(page) + @@ -186,6 +194,7 @@ void hfs_btree_write(struct hfs_btree *tree)  	kunmap(page);  	set_page_dirty(page);  	hfs_bnode_put(node); +	return 0;  }  static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 798d9c4c5e7..840d71edd19 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -45,7 +45,8 @@ void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,  	key->cat.parent = cpu_to_be32(parent);  	if (str) { -		hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len); +		hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, +					str->name, str->len);  		len = be16_to_cpu(key->cat.name.length);  	} else {  		key->cat.name.length = 0; @@ -167,7 +168,8 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,  	entry->type = cpu_to_be16(type);  	entry->thread.reserved = 0;  	entry->thread.parentID = cpu_to_be32(parentid); -	hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len); +	hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, +				str->name, str->len);  	return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;  } @@ -198,7 +200,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,  	hfsplus_cat_build_key_uni(fd->search_key,  		be32_to_cpu(tmp.thread.parentID),  		&tmp.thread.nodeName); -	return hfs_brec_find(fd); +	return hfs_brec_find(fd, hfs_find_rec_by_key);  }  int hfsplus_create_cat(u32 cnid, struct inode *dir, @@ -221,7 +223,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,  		S_ISDIR(inode->i_mode) ?  			HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,  		dir->i_ino, str); -	err = hfs_brec_find(&fd); +	err = hfs_brec_find(&fd, hfs_find_rec_by_key);  	if (err != -ENOENT) {  		if (!err)  			err = -EEXIST; @@ -233,7 +235,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,  	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);  	entry_size = hfsplus_cat_build_record(&entry, cnid, inode); -	err = hfs_brec_find(&fd); +	err = hfs_brec_find(&fd, hfs_find_rec_by_key);  	if (err != -ENOENT) {  		/* panic? 
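[Editor's note: hfsplus_asc2uni() gains an explicit destination capacity because the two unistr variants differ: catalog names hold up to HFSPLUS_MAX_STRLEN (255) UTF-16 units, attribute names only HFSPLUS_ATTR_MAX_STRLEN (127). The two call shapes after this change, taken from the hunks in this series:

/* catalog key: 255-unit name buffer */
hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
		str->name, str->len);

/* attributes key: 127-unit name buffer */
hfsplus_asc2uni(sb, (struct hfsplus_unistr *)&key->attr.key_name,
		HFSPLUS_ATTR_MAX_STRLEN, name, len);
]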
*/  		if (!err) @@ -253,7 +255,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,  err1:  	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); -	if (!hfs_brec_find(&fd)) +	if (!hfs_brec_find(&fd, hfs_find_rec_by_key))  		hfs_brec_remove(&fd);  err2:  	hfs_find_exit(&fd); @@ -279,7 +281,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)  		int len;  		hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); -		err = hfs_brec_find(&fd); +		err = hfs_brec_find(&fd, hfs_find_rec_by_key);  		if (err)  			goto out; @@ -296,7 +298,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)  	} else  		hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); -	err = hfs_brec_find(&fd); +	err = hfs_brec_find(&fd, hfs_find_rec_by_key);  	if (err)  		goto out; @@ -326,7 +328,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)  		goto out;  	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); -	err = hfs_brec_find(&fd); +	err = hfs_brec_find(&fd, hfs_find_rec_by_key);  	if (err)  		goto out; @@ -337,6 +339,12 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)  	dir->i_size--;  	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;  	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + +	if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { +		if (HFSPLUS_SB(sb)->attr_tree) +			hfsplus_delete_all_attrs(dir, cnid); +	} +  out:  	hfs_find_exit(&fd); @@ -363,7 +371,7 @@ int hfsplus_rename_cat(u32 cnid,  	/* find the old dir entry and read the data */  	hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); -	err = hfs_brec_find(&src_fd); +	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);  	if (err)  		goto out;  	if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { @@ -376,7 +384,7 @@ int hfsplus_rename_cat(u32 cnid,  	/* create new dir entry with the data from the old entry */  	hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); -	err = hfs_brec_find(&dst_fd); +	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);  	if (err != -ENOENT) {  		if (!err)  			err = -EEXIST; @@ -391,7 +399,7 @@ int hfsplus_rename_cat(u32 cnid,  	/* finally remove the old entry */  	hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); -	err = hfs_brec_find(&src_fd); +	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);  	if (err)  		goto out;  	err = hfs_brec_remove(&src_fd); @@ -402,7 +410,7 @@ int hfsplus_rename_cat(u32 cnid,  	/* remove old thread entry */  	hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); -	err = hfs_brec_find(&src_fd); +	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);  	if (err)  		goto out;  	type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); @@ -414,7 +422,7 @@ int hfsplus_rename_cat(u32 cnid,  	hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);  	entry_size = hfsplus_fill_cat_thread(sb, &entry, type,  		dst_dir->i_ino, dst_name); -	err = hfs_brec_find(&dst_fd); +	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);  	if (err != -ENOENT) {  		if (!err)  			err = -EEXIST; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 6b9f921ef2f..031c24e5052 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -15,6 +15,7 @@  #include "hfsplus_fs.h"  #include "hfsplus_raw.h" +#include "xattr.h"  static inline void hfsplus_instantiate(struct dentry *dentry,  				       struct inode *inode, u32 cnid) @@ -122,7 +123,7 @@ fail:  static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode 
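[Editor's note: with the hunk above, removing a file or folder from the catalog also purges its extended attributes. hfsplus_delete_all_attrs() is a simple fixpoint loop: look up the first attribute record for the cnid (a NULL name selects hfs_find_1st_rec_by_cnid), remove it, repeat until -ENOENT. Condensed (kernel context):

for (;;) {
	err = hfsplus_find_attr(sb, cnid, NULL, &fd);	/* first rec of cnid */
	if (err) {
		if (err != -ENOENT)
			printk(KERN_ERR "hfs: xattr search failed.\n");
		break;					/* done, or real error */
	}
	err = __hfsplus_delete_attr(inode, cnid, &fd);
	if (err)
		break;
}
]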
= filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	int len, err;  	char strbuf[HFSPLUS_MAX_STRLEN + 1]; @@ -138,7 +139,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)  	if (err)  		return err;  	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); -	err = hfs_brec_find(&fd); +	err = hfs_brec_find(&fd, hfs_find_rec_by_key);  	if (err)  		goto out; @@ -421,6 +422,15 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,  	if (res)  		goto out_err; +	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); +	if (res == -EOPNOTSUPP) +		res = 0; /* Operation is not supported. */ +	else if (res) { +		/* Try to delete anyway without error analysis. */ +		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); +		goto out_err; +	} +  	hfsplus_instantiate(dentry, inode, inode->i_ino);  	mark_inode_dirty(inode);  	goto out; @@ -450,15 +460,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,  		init_special_inode(inode, mode, rdev);  	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); -	if (res) { -		clear_nlink(inode); -		hfsplus_delete_inode(inode); -		iput(inode); -		goto out; +	if (res) +		goto failed_mknod; + +	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); +	if (res == -EOPNOTSUPP) +		res = 0; /* Operation is not supported. */ +	else if (res) { +		/* Try to delete anyway without error analysis. */ +		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); +		goto failed_mknod;  	}  	hfsplus_instantiate(dentry, inode, inode->i_ino);  	mark_inode_dirty(inode); +	goto out; + +failed_mknod: +	clear_nlink(inode); +	hfsplus_delete_inode(inode); +	iput(inode);  out:  	mutex_unlock(&sbi->vh_mutex);  	return res; @@ -499,15 +520,19 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,  }  const struct inode_operations hfsplus_dir_inode_operations = { -	.lookup		= hfsplus_lookup, -	.create		= hfsplus_create, -	.link		= hfsplus_link, -	.unlink		= hfsplus_unlink, -	.mkdir		= hfsplus_mkdir, -	.rmdir		= hfsplus_rmdir, -	.symlink	= hfsplus_symlink, -	.mknod		= hfsplus_mknod, -	.rename		= hfsplus_rename, +	.lookup			= hfsplus_lookup, +	.create			= hfsplus_create, +	.link			= hfsplus_link, +	.unlink			= hfsplus_unlink, +	.mkdir			= hfsplus_mkdir, +	.rmdir			= hfsplus_rmdir, +	.symlink		= hfsplus_symlink, +	.mknod			= hfsplus_mknod, +	.rename			= hfsplus_rename, +	.setxattr		= generic_setxattr, +	.getxattr		= generic_getxattr, +	.listxattr		= hfsplus_listxattr, +	.removexattr		= hfsplus_removexattr,  };  const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 5849e3ef35c..a94f0f779d5 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -95,7 +95,7 @@ static void __hfsplus_ext_write_extent(struct inode *inode,  			      HFSPLUS_IS_RSRC(inode) ?  				
HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); -	res = hfs_brec_find(fd); +	res = hfs_brec_find(fd, hfs_find_rec_by_key);  	if (hip->extent_state & HFSPLUS_EXT_NEW) {  		if (res != -ENOENT)  			return; @@ -154,7 +154,7 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,  	hfsplus_ext_build_key(fd->search_key, cnid, block, type);  	fd->key->ext.cnid = 0; -	res = hfs_brec_find(fd); +	res = hfs_brec_find(fd, hfs_find_rec_by_key);  	if (res && res != -ENOENT)  		return res;  	if (fd->key->ext.cnid != fd->search_key->ext.cnid || @@ -329,6 +329,7 @@ static int hfsplus_free_extents(struct super_block *sb,  {  	u32 count, start;  	int i; +	int err = 0;  	hfsplus_dump_extent(extent);  	for (i = 0; i < 8; extent++, i++) { @@ -345,18 +346,33 @@ found:  	for (;;) {  		start = be32_to_cpu(extent->start_block);  		if (count <= block_nr) { -			hfsplus_block_free(sb, start, count); +			err = hfsplus_block_free(sb, start, count); +			if (err) { +				printk(KERN_ERR "hfs: can't free extent\n"); +				dprint(DBG_EXTENT, " start: %u count: %u\n", +					start, count); +			}  			extent->block_count = 0;  			extent->start_block = 0;  			block_nr -= count;  		} else {  			count -= block_nr; -			hfsplus_block_free(sb, start + count, block_nr); +			err = hfsplus_block_free(sb, start + count, block_nr); +			if (err) { +				printk(KERN_ERR "hfs: can't free extent\n"); +				dprint(DBG_EXTENT, " start: %u count: %u\n", +					start, count); +			}  			extent->block_count = cpu_to_be32(count);  			block_nr = 0;  		} -		if (!block_nr || !i) -			return 0; +		if (!block_nr || !i) { +			/* +			 * Try to free all extents and +			 * return only last error +			 */ +			return err; +		}  		i--;  		extent--;  		count = be32_to_cpu(extent->block_count); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index c571de224b1..05b11f36024 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -23,6 +23,7 @@  #define DBG_SUPER	0x00000010  #define DBG_EXTENT	0x00000020  #define DBG_BITMAP	0x00000040 +#define DBG_ATTR_MOD	0x00000080  #if 0  #define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) @@ -46,6 +47,13 @@ typedef int (*btree_keycmp)(const hfsplus_btree_key *,  #define NODE_HASH_SIZE	256 +/* B-tree mutex nested subclasses */ +enum hfsplus_btree_mutex_classes { +	CATALOG_BTREE_MUTEX, +	EXTENTS_BTREE_MUTEX, +	ATTR_BTREE_MUTEX, +}; +  /* An HFS+ BTree held in memory */  struct hfs_btree {  	struct super_block *sb; @@ -223,6 +231,7 @@ struct hfsplus_inode_info {  #define HFSPLUS_I_CAT_DIRTY	1	/* has changes in the catalog tree */  #define HFSPLUS_I_EXT_DIRTY	2	/* has changes in the extent tree */  #define HFSPLUS_I_ALLOC_DIRTY	3	/* has changes in the allocation file */ +#define HFSPLUS_I_ATTR_DIRTY	4	/* has changes in the attributes tree */  #define HFSPLUS_IS_RSRC(inode) \  	test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) @@ -302,7 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)  #define hfs_brec_remove hfsplus_brec_remove  #define hfs_find_init hfsplus_find_init  #define hfs_find_exit hfsplus_find_exit -#define __hfs_brec_find __hplusfs_brec_find +#define __hfs_brec_find __hfsplus_brec_find  #define hfs_brec_find hfsplus_brec_find  #define hfs_brec_read hfsplus_brec_read  #define hfs_brec_goto hfsplus_brec_goto @@ -324,10 +333,33 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)   */  #define HFSPLUS_IOC_BLESS _IO('h', 0x80) +typedef int (*search_strategy_t)(struct hfs_bnode *, +				struct hfs_find_data *, +				int *, int *, int *); + 
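[Editor's note: hfsplus_free_extents() now records a hfsplus_block_free() failure but keeps walking the remaining extents and returns only the last error, so one bad bitmap page does not leak every extent that follows. The accumulation pattern in isolation (illustrative names):

int err = 0, e;

for (i = 0; i < nr_extents; i++) {
	e = free_one_extent(&extent[i]);
	if (e)
		err = e;	/* remember, but keep freeing the rest */
}
return err;			/* 0, or the last failure */
]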
 /*   * Functions in any *.c used in other files   */ +/* attributes.c */ +int hfsplus_create_attr_tree_cache(void); +void hfsplus_destroy_attr_tree_cache(void); +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *, +		const hfsplus_btree_key *); +int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *, +			u32, const char *); +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, +					u32 cnid, +					struct hfsplus_attr_unistr *name); +int hfsplus_find_attr(struct super_block *, u32, +			const char *, struct hfs_find_data *); +int hfsplus_attr_exists(struct inode *inode, const char *name); +int hfsplus_create_attr(struct inode *, const char *, const void *, size_t); +int hfsplus_delete_attr(struct inode *, const char *); +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); +  /* bitmap.c */  int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);  int hfsplus_block_free(struct super_block *, u32, u32); @@ -335,7 +367,7 @@ int hfsplus_block_free(struct super_block *, u32, u32);  /* btree.c */  struct hfs_btree *hfs_btree_open(struct super_block *, u32);  void hfs_btree_close(struct hfs_btree *); -void hfs_btree_write(struct hfs_btree *); +int hfs_btree_write(struct hfs_btree *);  struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *);  void hfs_bmap_free(struct hfs_bnode *); @@ -369,8 +401,15 @@ int hfs_brec_remove(struct hfs_find_data *);  /* bfind.c */  int hfs_find_init(struct hfs_btree *, struct hfs_find_data *);  void hfs_find_exit(struct hfs_find_data *); -int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); -int hfs_brec_find(struct hfs_find_data *); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *, +				struct hfs_find_data *, +				int *, int *, int *); +int hfs_find_rec_by_key(struct hfs_bnode *, +				struct hfs_find_data *, +				int *, int *, int *); +int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *, +				search_strategy_t); +int hfs_brec_find(struct hfs_find_data *, search_strategy_t);  int hfs_brec_read(struct hfs_find_data *, void *, int);  int hfs_brec_goto(struct hfs_find_data *, int); @@ -417,11 +456,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,  /* ioctl.c */  long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -int hfsplus_setxattr(struct dentry *dentry, const char *name, -		     const void *value, size_t size, int flags); -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, -			 void *value, size_t size); -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);  /* options.c */  int hfsplus_parse_options(char *, struct hfsplus_sb_info *); @@ -446,7 +480,7 @@ int hfsplus_strcmp(const struct hfsplus_unistr *,  int hfsplus_uni2asc(struct super_block *,  		const struct hfsplus_unistr *, char *, int *);  int hfsplus_asc2uni(struct super_block *, -		struct hfsplus_unistr *, const char *, int); +		struct hfsplus_unistr *, int, const char *, int);  int hfsplus_hash_dentry(const struct dentry *dentry,  		const struct inode *inode, struct qstr *str);  int hfsplus_compare_dentry(const struct dentry *parent, diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 921967e5abb..452ede01b03 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -52,13 +52,23 @@  typedef __be32 hfsplus_cnid;  typedef __be16 hfsplus_unichr; +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 +  /* 
A "string" as used in filenames, etc. */  struct hfsplus_unistr {  	__be16 length; -	hfsplus_unichr unicode[255]; +	hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];  } __packed; -#define HFSPLUS_MAX_STRLEN 255 +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { +	__be16 length; +	hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed;  /* POSIX permissions */  struct hfsplus_perm { @@ -291,6 +301,8 @@ struct hfsplus_cat_file {  /* File attribute bits */  #define HFSPLUS_FILE_LOCKED		0x0001  #define HFSPLUS_FILE_THREAD_EXISTS	0x0002 +#define HFSPLUS_XATTR_EXISTS		0x0004 +#define HFSPLUS_ACL_EXISTS		0x0008  /* HFS+ catalog thread (part of a cat_entry) */  struct hfsplus_cat_thread { @@ -327,11 +339,63 @@ struct hfsplus_ext_key {  #define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key) +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA   0x20 +#define HFSPLUS_ATTR_EXTENTS     0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { +	__be16 key_len; +	__be16 pad; +	hfsplus_cnid cnid; +	__be32 start_block; +	struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN	sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { +	__be32 record_type; +	__be32 reserved; +	struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { +	__be32 record_type; +	__be32 reserved; +	struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { +	__be32 record_type; +	__be32 reserved1; +	u8 reserved2[6]; +	__be16 length; +	u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { +	__be32 record_type; +	struct hfsplus_attr_fork_data fork_data; +	struct hfsplus_attr_extents extents; +	struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; +  /* HFS+ generic BTree key */  typedef union {  	__be16 key_len;  	struct hfsplus_cat_key cat;  	struct hfsplus_ext_key ext; +	struct hfsplus_attr_key attr;  } __packed hfsplus_btree_key;  #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 2172aa5976f..160ccc9cdb4 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -17,6 +17,7 @@  #include "hfsplus_fs.h"  #include "hfsplus_raw.h" +#include "xattr.h"  static int hfsplus_readpage(struct file *file, struct page *page)  { @@ -28,6 +29,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)  	return block_write_full_page(page, hfsplus_get_block, wbc);  } +static void hfsplus_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		hfsplus_file_truncate(inode); +	} +} +  static int hfsplus_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -38,11 +49,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,  	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,  				hfsplus_get_block,  				&HFSPLUS_I(mapping->host)->phys_size); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > 
isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		hfsplus_write_failed(mapping, pos + len);  	return ret;  } @@ -116,7 +124,8 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,  		const struct iovec *iov, loff_t offset, unsigned long nr_segs)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; +	struct address_space *mapping = file->f_mapping; +	struct inode *inode = file_inode(file)->i_mapping->host;  	ssize_t ret;  	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, @@ -131,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,  		loff_t end = offset + iov_length(iov, nr_segs);  		if (end > isize) -			vmtruncate(inode, isize); +			hfsplus_write_failed(mapping, end);  	}  	return ret; @@ -300,10 +309,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) {  		inode_dio_wait(inode); - -		error = vmtruncate(inode, attr->ia_size); -		if (error) -			return error; +		truncate_setsize(inode, attr->ia_size); +		hfsplus_file_truncate(inode);  	}  	setattr_copy(inode, attr); @@ -342,6 +349,18 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,  			error = error2;  	} +	if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) { +		if (sbi->attr_tree) { +			error2 = +				filemap_write_and_wait( +					    sbi->attr_tree->inode->i_mapping); +			if (!error) +				error = error2; +		} else { +			printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); +		} +	} +  	if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {  		error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);  		if (!error) @@ -358,11 +377,11 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,  static const struct inode_operations hfsplus_file_inode_operations = {  	.lookup		= hfsplus_file_lookup, -	.truncate	= hfsplus_file_truncate,  	.setattr	= hfsplus_setattr, -	.setxattr	= hfsplus_setxattr, -	.getxattr	= hfsplus_getxattr, +	.setxattr	= generic_setxattr, +	.getxattr	= generic_getxattr,  	.listxattr	= hfsplus_listxattr, +	.removexattr	= hfsplus_removexattr,  };  static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 09addc8615f..d3ff5cc317d 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -16,7 +16,6 @@  #include <linux/fs.h>  #include <linux/mount.h>  #include <linux/sched.h> -#include <linux/xattr.h>  #include <asm/uaccess.h>  #include "hfsplus_fs.h" @@ -59,7 +58,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)  static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);  	unsigned int flags = 0; @@ -75,7 +74,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)  static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);  	unsigned int flags;  	int err = 0; @@ -151,110 +150,3 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  		return -ENOTTY;  	}  } - -int hfsplus_setxattr(struct dentry *dentry, const char *name, -		     const void *value, size_t size, int 
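[Editor's note: hfsplus_file_fsync() gains a third metadata flush mirroring the catalog and extents cases: each HFSPLUS_I_*_DIRTY bit names one special file whose pages must reach disk before fsync() may return. One case, condensed from the hunk above:

if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) {
	if (sbi->attr_tree) {
		error2 = filemap_write_and_wait(
				sbi->attr_tree->inode->i_mapping);
		if (!error)
			error = error2;	/* keep the first error seen */
	}
}
]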
flags) -{ -	struct inode *inode = dentry->d_inode; -	struct hfs_find_data fd; -	hfsplus_cat_entry entry; -	struct hfsplus_cat_file *file; -	int res; - -	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) -		return -EOPNOTSUPP; - -	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); -	if (res) -		return res; -	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); -	if (res) -		goto out; -	hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, -			sizeof(struct hfsplus_cat_file)); -	file = &entry.file; - -	if (!strcmp(name, "hfs.type")) { -		if (size == 4) -			memcpy(&file->user_info.fdType, value, 4); -		else -			res = -ERANGE; -	} else if (!strcmp(name, "hfs.creator")) { -		if (size == 4) -			memcpy(&file->user_info.fdCreator, value, 4); -		else -			res = -ERANGE; -	} else -		res = -EOPNOTSUPP; -	if (!res) { -		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, -				sizeof(struct hfsplus_cat_file)); -		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); -	} -out: -	hfs_find_exit(&fd); -	return res; -} - -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, -			 void *value, size_t size) -{ -	struct inode *inode = dentry->d_inode; -	struct hfs_find_data fd; -	hfsplus_cat_entry entry; -	struct hfsplus_cat_file *file; -	ssize_t res = 0; - -	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) -		return -EOPNOTSUPP; - -	if (size) { -		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); -		if (res) -			return res; -		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); -		if (res) -			goto out; -		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, -				sizeof(struct hfsplus_cat_file)); -	} -	file = &entry.file; - -	if (!strcmp(name, "hfs.type")) { -		if (size >= 4) { -			memcpy(value, &file->user_info.fdType, 4); -			res = 4; -		} else -			res = size ? -ERANGE : 4; -	} else if (!strcmp(name, "hfs.creator")) { -		if (size >= 4) { -			memcpy(value, &file->user_info.fdCreator, 4); -			res = 4; -		} else -			res = size ? 
-ERANGE : 4; -	} else -		res = -EOPNOTSUPP; -out: -	if (size) -		hfs_find_exit(&fd); -	return res; -} - -#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) - -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ -	struct inode *inode = dentry->d_inode; - -	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) -		return -EOPNOTSUPP; - -	if (!buffer || !size) -		return HFSPLUS_ATTRLIST_SIZE; -	if (size < HFSPLUS_ATTRLIST_SIZE) -		return -ERANGE; -	strcpy(buffer, "hfs.type"); -	strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); - -	return HFSPLUS_ATTRLIST_SIZE; -} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 811a84d2d96..974c26f96fa 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -20,6 +20,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb);  static void hfsplus_destroy_inode(struct inode *inode);  #include "hfsplus_fs.h" +#include "xattr.h"  static int hfsplus_system_read_inode(struct inode *inode)  { @@ -118,6 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode)  	case HFSPLUS_ATTR_CNID:  		fork = &vhdr->attr_file;  		tree = sbi->attr_tree; +		break;  	default:  		return -EIO;  	} @@ -127,8 +129,14 @@ static int hfsplus_system_write_inode(struct inode *inode)  		hfsplus_mark_mdb_dirty(inode->i_sb);  	}  	hfsplus_inode_write_fork(inode, fork); -	if (tree) -		hfs_btree_write(tree); +	if (tree) { +		int err = hfs_btree_write(tree); +		if (err) { +			printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n", +					err, inode->i_ino); +			return err; +		} +	}  	return 0;  } @@ -185,6 +193,12 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)  	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);  	if (!error)  		error = error2; +	if (sbi->attr_tree) { +		error2 = +		    filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); +		if (!error) +			error = error2; +	}  	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);  	if (!error)  		error = error2; @@ -226,6 +240,7 @@ out:  static void delayed_sync_fs(struct work_struct *work)  { +	int err;  	struct hfsplus_sb_info *sbi;  	sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); @@ -234,7 +249,9 @@ static void delayed_sync_fs(struct work_struct *work)  	sbi->work_queued = 0;  	spin_unlock(&sbi->work_lock); -	hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); +	err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); +	if (err) +		printk(KERN_ERR "hfs: delayed sync fs err %d\n", err);  }  void hfsplus_mark_mdb_dirty(struct super_block *sb) @@ -272,6 +289,7 @@ static void hfsplus_put_super(struct super_block *sb)  		hfsplus_sync_fs(sb, 1);  	} +	hfs_btree_close(sbi->attr_tree);  	hfs_btree_close(sbi->cat_tree);  	hfs_btree_close(sbi->ext_tree);  	iput(sbi->alloc_file); @@ -468,12 +486,20 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)  		printk(KERN_ERR "hfs: failed to load catalog file\n");  		goto out_close_ext_tree;  	} +	if (vhdr->attr_file.total_blocks != 0) { +		sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); +		if (!sbi->attr_tree) { +			printk(KERN_ERR "hfs: failed to load attributes file\n"); +			goto out_close_cat_tree; +		} +	} +	sb->s_xattr = hfsplus_xattr_handlers;  	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);  	if (IS_ERR(inode)) {  		printk(KERN_ERR "hfs: failed to load allocation file\n");  		err = PTR_ERR(inode); -		goto out_close_cat_tree; +		goto out_close_attr_tree;  	}  	sbi->alloc_file = inode; @@ -533,10 +559,27 @@ static int 
hfsplus_fill_super(struct super_block *sb, void *data, int silent)  			}  			err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,  						 &str, sbi->hidden_dir); -			mutex_unlock(&sbi->vh_mutex); -			if (err) +			if (err) { +				mutex_unlock(&sbi->vh_mutex); +				goto out_put_hidden_dir; +			} + +			err = hfsplus_init_inode_security(sbi->hidden_dir, +								root, &str); +			if (err == -EOPNOTSUPP) +				err = 0; /* Operation is not supported. */ +			else if (err) { +				/* +				 * Try to delete anyway without +				 * error analysis. +				 */ +				hfsplus_delete_cat(sbi->hidden_dir->i_ino, +							root, &str); +				mutex_unlock(&sbi->vh_mutex);  				goto out_put_hidden_dir; +			} +			mutex_unlock(&sbi->vh_mutex);  			hfsplus_mark_inode_dirty(sbi->hidden_dir,  						 HFSPLUS_I_CAT_DIRTY);  		} @@ -553,6 +596,8 @@ out_put_root:  	sb->s_root = NULL;  out_put_alloc_file:  	iput(sbi->alloc_file); +out_close_attr_tree: +	hfs_btree_close(sbi->attr_tree);  out_close_cat_tree:  	hfs_btree_close(sbi->cat_tree);  out_close_ext_tree: @@ -626,9 +671,20 @@ static int __init init_hfsplus_fs(void)  		hfsplus_init_once);  	if (!hfsplus_inode_cachep)  		return -ENOMEM; +	err = hfsplus_create_attr_tree_cache(); +	if (err) +		goto destroy_inode_cache;  	err = register_filesystem(&hfsplus_fs_type);  	if (err) -		kmem_cache_destroy(hfsplus_inode_cachep); +		goto destroy_attr_tree_cache; +	return 0; + +destroy_attr_tree_cache: +	hfsplus_destroy_attr_tree_cache(); + +destroy_inode_cache: +	kmem_cache_destroy(hfsplus_inode_cachep); +  	return err;  } @@ -641,6 +697,7 @@ static void __exit exit_hfsplus_fs(void)  	 * destroy cache.  	 */  	rcu_barrier(); +	hfsplus_destroy_attr_tree_cache();  	kmem_cache_destroy(hfsplus_inode_cachep);  } diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index a32998f29f0..2c2e47dcfdd 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -295,7 +295,8 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)  	return hfsplus_decompose_table + (off / 4);  } -int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, +int hfsplus_asc2uni(struct super_block *sb, +		    struct hfsplus_unistr *ustr, int max_unistr_len,  		    const char *astr, int len)  {  	int size, dsize, decompose; @@ -303,7 +304,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,  	wchar_t c;  	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); -	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { +	while (outlen < max_unistr_len && len > 0) {  		size = asc2unichar(sb, astr, len, &c);  		if (decompose) @@ -311,7 +312,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,  		else  			dstr = NULL;  		if (dstr) { -			if (outlen + dsize > HFSPLUS_MAX_STRLEN) +			if (outlen + dsize > max_unistr_len)  				break;  			do {  				ustr->unicode[outlen++] = cpu_to_be16(*dstr++); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c new file mode 100644 index 00000000000..e8a4b0815c6 --- /dev/null +++ b/fs/hfsplus/xattr.c @@ -0,0 +1,709 @@ +/* + * linux/fs/hfsplus/xattr.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +const struct xattr_handler *hfsplus_xattr_handlers[] = { +	&hfsplus_xattr_osx_handler, +	&hfsplus_xattr_user_handler, +	&hfsplus_xattr_trusted_handler, +	&hfsplus_xattr_security_handler, +	NULL +}; + +static int strcmp_xattr_finder_info(const char *name) +{ +	if (name) { +		return strncmp(name, 
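[Editor's note: init_hfsplus_fs() above is reworked into the standard module-init shape: construct resources in order, and on any failure unwind the ones already built, in reverse, through goto labels. The generic skeleton (all names are placeholders):

static int __init my_init_fs(void)
{
	int err;

	err = create_cache_a();			/* e.g. the inode cache */
	if (err)
		return err;
	err = create_cache_b();			/* e.g. the attr tree cache */
	if (err)
		goto destroy_a;
	err = register_filesystem(&my_fs_type);
	if (err)
		goto destroy_b;
	return 0;

destroy_b:
	destroy_cache_b();
destroy_a:
	destroy_cache_a();
	return err;
}
]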
HFSPLUS_XATTR_FINDER_INFO_NAME, +				sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME)); +	} +	return -1; +} + +static int strcmp_xattr_acl(const char *name) +{ +	if (name) { +		return strncmp(name, HFSPLUS_XATTR_ACL_NAME, +				sizeof(HFSPLUS_XATTR_ACL_NAME)); +	} +	return -1; +} + +static inline int is_known_namespace(const char *name) +{ +	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && +	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && +	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && +	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) +		return false; + +	return true; +} + +static int can_set_xattr(struct inode *inode, const char *name, +				const void *value, size_t value_len) +{ +	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) +		return -EOPNOTSUPP; /* TODO: implement ACL support */ + +	if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { +		/* +		 * This makes sure that we aren't trying to set an +		 * attribute in a different namespace by prefixing it +		 * with "osx." +		 */ +		if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) +			return -EOPNOTSUPP; + +		return 0; +	} + +	/* +	 * Don't allow setting an attribute in an unknown namespace. +	 */ +	if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && +	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && +	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) +		return -EOPNOTSUPP; + +	return 0; +} + +int __hfsplus_setxattr(struct inode *inode, const char *name, +			const void *value, size_t size, int flags) +{ +	int err = 0; +	struct hfs_find_data cat_fd; +	hfsplus_cat_entry entry; +	u16 cat_entry_flags, cat_entry_type; +	u16 folder_finderinfo_len = sizeof(struct DInfo) + +					sizeof(struct DXInfo); +	u16 file_finderinfo_len = sizeof(struct FInfo) + +					sizeof(struct FXInfo); + +	if ((!S_ISREG(inode->i_mode) && +			!S_ISDIR(inode->i_mode)) || +				HFSPLUS_IS_RSRC(inode)) +		return -EOPNOTSUPP; + +	err = can_set_xattr(inode, name, value, size); +	if (err) +		return err; + +	if (strncmp(name, XATTR_MAC_OSX_PREFIX, +				XATTR_MAC_OSX_PREFIX_LEN) == 0) +		name += XATTR_MAC_OSX_PREFIX_LEN; + +	if (value == NULL) { +		value = ""; +		size = 0; +	} + +	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); +	if (err) { +		printk(KERN_ERR "hfs: can't init xattr find struct\n"); +		return err; +	} + +	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); +	if (err) { +		printk(KERN_ERR "hfs: catalog searching failed\n"); +		goto end_setxattr; +	} + +	if (!strcmp_xattr_finder_info(name)) { +		if (flags & XATTR_CREATE) { +			printk(KERN_ERR "hfs: xattr exists yet\n"); +			err = -EOPNOTSUPP; +			goto end_setxattr; +		} +		hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset, +					sizeof(hfsplus_cat_entry)); +		if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { +			if (size == folder_finderinfo_len) { +				memcpy(&entry.folder.user_info, value, +						folder_finderinfo_len); +				hfs_bnode_write(cat_fd.bnode, &entry, +					cat_fd.entryoffset, +					sizeof(struct hfsplus_cat_folder)); +				hfsplus_mark_inode_dirty(inode, +						HFSPLUS_I_CAT_DIRTY); +			} else { +				err = -ERANGE; +				goto end_setxattr; +			} +		} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { +			if (size == file_finderinfo_len) { +				memcpy(&entry.file.user_info, value, +						file_finderinfo_len); +				hfs_bnode_write(cat_fd.bnode, &entry, +					cat_fd.entryoffset, +					sizeof(struct hfsplus_cat_file)); +	
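[Editor's note: seen from user space, can_set_xattr() plus the handler table at the top of xattr.c give roughly these outcomes (assuming the usual <sys/xattr.h> syscall wrappers; the errno values are my reading of the code, not documented behavior):

#include <sys/xattr.h>

setxattr(path, "user.comment", "hi", 2, 0);	/* known namespace: stored */
setxattr(path, "osx.com.example.tag", "x", 1, 0); /* "osx." stripped, name
						   * stored unprefixed */
setxattr(path, "osx.user.comment", "x", 1, 0);	/* -EOPNOTSUPP: "osx." must
						 * not hide a known namespace */
setxattr(path, "system.posix_acl_access", v, n, 0); /* -EOPNOTSUPP: ACL
						     * support is still TODO */
]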
			hfsplus_mark_inode_dirty(inode, +						HFSPLUS_I_CAT_DIRTY); +			} else { +				err = -ERANGE; +				goto end_setxattr; +			} +		} else { +			err = -EOPNOTSUPP; +			goto end_setxattr; +		} +		goto end_setxattr; +	} + +	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { +		err = -EOPNOTSUPP; +		goto end_setxattr; +	} + +	if (hfsplus_attr_exists(inode, name)) { +		if (flags & XATTR_CREATE) { +			printk(KERN_ERR "hfs: xattr exists yet\n"); +			err = -EOPNOTSUPP; +			goto end_setxattr; +		} +		err = hfsplus_delete_attr(inode, name); +		if (err) +			goto end_setxattr; +		err = hfsplus_create_attr(inode, name, value, size); +		if (err) +			goto end_setxattr; +	} else { +		if (flags & XATTR_REPLACE) { +			printk(KERN_ERR "hfs: cannot replace xattr\n"); +			err = -EOPNOTSUPP; +			goto end_setxattr; +		} +		err = hfsplus_create_attr(inode, name, value, size); +		if (err) +			goto end_setxattr; +	} + +	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); +	if (cat_entry_type == HFSPLUS_FOLDER) { +		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, +				    cat_fd.entryoffset + +				    offsetof(struct hfsplus_cat_folder, flags)); +		cat_entry_flags |= HFSPLUS_XATTR_EXISTS; +		if (!strcmp_xattr_acl(name)) +			cat_entry_flags |= HFSPLUS_ACL_EXISTS; +		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + +				offsetof(struct hfsplus_cat_folder, flags), +				cat_entry_flags); +		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); +	} else if (cat_entry_type == HFSPLUS_FILE) { +		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, +				    cat_fd.entryoffset + +				    offsetof(struct hfsplus_cat_file, flags)); +		cat_entry_flags |= HFSPLUS_XATTR_EXISTS; +		if (!strcmp_xattr_acl(name)) +			cat_entry_flags |= HFSPLUS_ACL_EXISTS; +		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + +				    offsetof(struct hfsplus_cat_file, flags), +				    cat_entry_flags); +		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); +	} else { +		printk(KERN_ERR "hfs: invalid catalog entry type\n"); +		err = -EIO; +		goto end_setxattr; +	} + +end_setxattr: +	hfs_find_exit(&cat_fd); +	return err; +} + +static inline int is_osx_xattr(const char *xattr_name) +{ +	return !is_known_namespace(xattr_name); +} + +static int name_len(const char *xattr_name, int xattr_name_len) +{ +	int len = xattr_name_len + 1; + +	if (is_osx_xattr(xattr_name)) +		len += XATTR_MAC_OSX_PREFIX_LEN; + +	return len; +} + +static int copy_name(char *buffer, const char *xattr_name, int name_len) +{ +	int len = name_len; +	int offset = 0; + +	if (is_osx_xattr(xattr_name)) { +		strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); +		offset += XATTR_MAC_OSX_PREFIX_LEN; +		len += XATTR_MAC_OSX_PREFIX_LEN; +	} + +	strncpy(buffer + offset, xattr_name, name_len); +	memset(buffer + offset + name_len, 0, 1); +	len += 1; + +	return len; +} + +static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, +						void *value, size_t size) +{ +	ssize_t res = 0; +	struct inode *inode = dentry->d_inode; +	struct hfs_find_data fd; +	u16 entry_type; +	u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); +	u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); +	u16 record_len = max(folder_rec_len, file_rec_len); +	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; +	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + +	if (size >= record_len) { +		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); +		if (res) { +			printk(KERN_ERR "hfs: can't init xattr find struct\n"); 
+			return res; +		} +		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); +		if (res) +			goto end_getxattr_finder_info; +		entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + +		if (entry_type == HFSPLUS_FOLDER) { +			hfs_bnode_read(fd.bnode, folder_finder_info, +				fd.entryoffset + +				offsetof(struct hfsplus_cat_folder, user_info), +				folder_rec_len); +			memcpy(value, folder_finder_info, folder_rec_len); +			res = folder_rec_len; +		} else if (entry_type == HFSPLUS_FILE) { +			hfs_bnode_read(fd.bnode, file_finder_info, +				fd.entryoffset + +				offsetof(struct hfsplus_cat_file, user_info), +				file_rec_len); +			memcpy(value, file_finder_info, file_rec_len); +			res = file_rec_len; +		} else { +			res = -EOPNOTSUPP; +			goto end_getxattr_finder_info; +		} +	} else +		res = size ? -ERANGE : record_len; + +end_getxattr_finder_info: +	if (size >= record_len) +		hfs_find_exit(&fd); +	return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +			 void *value, size_t size) +{ +	struct inode *inode = dentry->d_inode; +	struct hfs_find_data fd; +	hfsplus_attr_entry *entry; +	__be32 xattr_record_type; +	u32 record_type; +	u16 record_length = 0; +	ssize_t res = 0; + +	if ((!S_ISREG(inode->i_mode) && +			!S_ISDIR(inode->i_mode)) || +				HFSPLUS_IS_RSRC(inode)) +		return -EOPNOTSUPP; + +	if (strncmp(name, XATTR_MAC_OSX_PREFIX, +				XATTR_MAC_OSX_PREFIX_LEN) == 0) { +		/* skip "osx." prefix */ +		name += XATTR_MAC_OSX_PREFIX_LEN; +		/* +		 * Don't allow retrieving properly prefixed attributes +		 * by prepending them with "osx." +		 */ +		if (is_known_namespace(name)) +			return -EOPNOTSUPP; +	} + +	if (!strcmp_xattr_finder_info(name)) +		return hfsplus_getxattr_finder_info(dentry, value, size); + +	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) +		return -EOPNOTSUPP; + +	entry = hfsplus_alloc_attr_entry(); +	if (!entry) { +		printk(KERN_ERR "hfs: can't allocate xattr entry\n"); +		return -ENOMEM; +	} + +	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); +	if (res) { +		printk(KERN_ERR "hfs: can't init xattr find struct\n"); +		goto failed_getxattr_init; +	} + +	res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd); +	if (res) { +		if (res == -ENOENT) +			res = -ENODATA; +		else +			printk(KERN_ERR "hfs: xattr searching failed\n"); +		goto out; +	} + +	hfs_bnode_read(fd.bnode, &xattr_record_type, +			fd.entryoffset, sizeof(xattr_record_type)); +	record_type = be32_to_cpu(xattr_record_type); +	if (record_type == HFSPLUS_ATTR_INLINE_DATA) { +		record_length = hfs_bnode_read_u16(fd.bnode, +				fd.entryoffset + +				offsetof(struct hfsplus_attr_inline_data, +				length)); +		if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { +			printk(KERN_ERR "hfs: invalid xattr record size\n"); +			res = -EIO; +			goto out; +		} +	} else if (record_type == HFSPLUS_ATTR_FORK_DATA || +			record_type == HFSPLUS_ATTR_EXTENTS) { +		printk(KERN_ERR "hfs: only inline data xattr are supported\n"); +		res = -EOPNOTSUPP; +		goto out; +	} else { +		printk(KERN_ERR "hfs: invalid xattr record\n"); +		res = -EIO; +		goto out; +	} + +	if (size) { +		hfs_bnode_read(fd.bnode, entry, fd.entryoffset, +				offsetof(struct hfsplus_attr_inline_data, +					raw_bytes) + record_length); +	} + +	if (size >= record_length) { +		memcpy(value, entry->inline_data.raw_bytes, record_length); +		res = record_length; +	} else +		res = size ? 
-ERANGE : record_length; + +out: +	hfs_find_exit(&fd); + +failed_getxattr_init: +	hfsplus_destroy_attr_entry(entry); +	return res; +} + +static inline int can_list(const char *xattr_name) +{ +	if (!xattr_name) +		return 0; + +	return strncmp(xattr_name, XATTR_TRUSTED_PREFIX, +			XATTR_TRUSTED_PREFIX_LEN) || +				capable(CAP_SYS_ADMIN); +} + +static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, +						char *buffer, size_t size) +{ +	ssize_t res = 0; +	struct inode *inode = dentry->d_inode; +	struct hfs_find_data fd; +	u16 entry_type; +	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; +	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; +	unsigned long len, found_bit; +	int xattr_name_len, symbols_count; + +	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); +	if (res) { +		printk(KERN_ERR "hfs: can't init xattr find struct\n"); +		return res; +	} + +	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); +	if (res) +		goto end_listxattr_finder_info; + +	entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); +	if (entry_type == HFSPLUS_FOLDER) { +		len = sizeof(struct DInfo) + sizeof(struct DXInfo); +		hfs_bnode_read(fd.bnode, folder_finder_info, +				fd.entryoffset + +				offsetof(struct hfsplus_cat_folder, user_info), +				len); +		found_bit = find_first_bit((void *)folder_finder_info, len*8); +	} else if (entry_type == HFSPLUS_FILE) { +		len = sizeof(struct FInfo) + sizeof(struct FXInfo); +		hfs_bnode_read(fd.bnode, file_finder_info, +				fd.entryoffset + +				offsetof(struct hfsplus_cat_file, user_info), +				len); +		found_bit = find_first_bit((void *)file_finder_info, len*8); +	} else { +		res = -EOPNOTSUPP; +		goto end_listxattr_finder_info; +	} + +	if (found_bit >= (len*8)) +		res = 0; +	else { +		symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1; +		xattr_name_len = +			name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); +		if (!buffer || !size) { +			if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) +				res = xattr_name_len; +		} else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) { +			if (size < xattr_name_len) +				res = -ERANGE; +			else { +				res = copy_name(buffer, +						HFSPLUS_XATTR_FINDER_INFO_NAME, +						symbols_count); +			} +		} +	} + +end_listxattr_finder_info: +	hfs_find_exit(&fd); + +	return res; +} + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ +	ssize_t err; +	ssize_t res = 0; +	struct inode *inode = dentry->d_inode; +	struct hfs_find_data fd; +	u16 key_len = 0; +	struct hfsplus_attr_key attr_key; +	char strbuf[HFSPLUS_ATTR_MAX_STRLEN + +			XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; +	int xattr_name_len; + +	if ((!S_ISREG(inode->i_mode) && +			!S_ISDIR(inode->i_mode)) || +				HFSPLUS_IS_RSRC(inode)) +		return -EOPNOTSUPP; + +	res = hfsplus_listxattr_finder_info(dentry, buffer, size); +	if (res < 0) +		return res; +	else if (!HFSPLUS_SB(inode->i_sb)->attr_tree) +		return (res == 0) ? 
-EOPNOTSUPP : res; + +	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); +	if (err) { +		printk(KERN_ERR "hfs: can't init xattr find struct\n"); +		return err; +	} + +	err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); +	if (err) { +		if (err == -ENOENT) { +			if (res == 0) +				res = -ENODATA; +			goto end_listxattr; +		} else { +			res = err; +			goto end_listxattr; +		} +	} + +	for (;;) { +		key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); +		if (key_len == 0 || key_len > fd.tree->max_key_len) { +			printk(KERN_ERR "hfs: invalid xattr key length: %d\n", +							key_len); +			res = -EIO; +			goto end_listxattr; +		} + +		hfs_bnode_read(fd.bnode, &attr_key, +				fd.keyoffset, key_len + sizeof(key_len)); + +		if (be32_to_cpu(attr_key.cnid) != inode->i_ino) +			goto end_listxattr; + +		xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN; +		if (hfsplus_uni2asc(inode->i_sb, +			(const struct hfsplus_unistr *)&fd.key->attr.key_name, +					strbuf, &xattr_name_len)) { +			printk(KERN_ERR "hfs: unicode conversion failed\n"); +			res = -EIO; +			goto end_listxattr; +		} + +		if (!buffer || !size) { +			if (can_list(strbuf)) +				res += name_len(strbuf, xattr_name_len); +		} else if (can_list(strbuf)) { +			if (size < (res + name_len(strbuf, xattr_name_len))) { +				res = -ERANGE; +				goto end_listxattr; +			} else +				res += copy_name(buffer + res, +						strbuf, xattr_name_len); +		} + +		if (hfs_brec_goto(&fd, 1)) +			goto end_listxattr; +	} + +end_listxattr: +	hfs_find_exit(&fd); +	return res; +} + +int hfsplus_removexattr(struct dentry *dentry, const char *name) +{ +	int err = 0; +	struct inode *inode = dentry->d_inode; +	struct hfs_find_data cat_fd; +	u16 flags; +	u16 cat_entry_type; +	int is_xattr_acl_deleted = 0; +	int is_all_xattrs_deleted = 0; + +	if ((!S_ISREG(inode->i_mode) && +			!S_ISDIR(inode->i_mode)) || +				HFSPLUS_IS_RSRC(inode)) +		return -EOPNOTSUPP; + +	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) +		return -EOPNOTSUPP; + +	err = can_set_xattr(inode, name, NULL, 0); +	if (err) +		return err; + +	if (strncmp(name, XATTR_MAC_OSX_PREFIX, +				XATTR_MAC_OSX_PREFIX_LEN) == 0) +		name += XATTR_MAC_OSX_PREFIX_LEN; + +	if (!strcmp_xattr_finder_info(name)) +		return -EOPNOTSUPP; + +	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); +	if (err) { +		printk(KERN_ERR "hfs: can't init xattr find struct\n"); +		return err; +	} + +	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); +	if (err) { +		printk(KERN_ERR "hfs: catalog searching failed\n"); +		goto end_removexattr; +	} + +	err = hfsplus_delete_attr(inode, name); +	if (err) +		goto end_removexattr; + +	is_xattr_acl_deleted = !strcmp_xattr_acl(name); +	is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); + +	if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) +		goto end_removexattr; + +	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + +	if (cat_entry_type == HFSPLUS_FOLDER) { +		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + +				offsetof(struct hfsplus_cat_folder, flags)); +		if (is_xattr_acl_deleted) +			flags &= ~HFSPLUS_ACL_EXISTS; +		if (is_all_xattrs_deleted) +			flags &= ~HFSPLUS_XATTR_EXISTS; +		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + +				offsetof(struct hfsplus_cat_folder, flags), +				flags); +		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); +	} else if (cat_entry_type == HFSPLUS_FILE) { +		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + +				offsetof(struct hfsplus_cat_file, flags)); +		if 
(is_xattr_acl_deleted) +			flags &= ~HFSPLUS_ACL_EXISTS; +		if (is_all_xattrs_deleted) +			flags &= ~HFSPLUS_XATTR_EXISTS; +		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + +				offsetof(struct hfsplus_cat_file, flags), +				flags); +		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); +	} else { +		printk(KERN_ERR "hfs: invalid catalog entry type\n"); +		err = -EIO; +		goto end_removexattr; +	} + +end_removexattr: +	hfs_find_exit(&cat_fd); +	return err; +} + +static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, +					void *buffer, size_t size, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + +				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); +	strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + +	return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, +		const void *buffer, size_t size, int flags, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + +				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); +	strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + +	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	/* +	 * This method is not used. +	 * hfsplus_listxattr() is used instead of generic_listxattr(). +	 */ +	return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_osx_handler = { +	.prefix	= XATTR_MAC_OSX_PREFIX, +	.list	= hfsplus_osx_listxattr, +	.get	= hfsplus_osx_getxattr, +	.set	= hfsplus_osx_setxattr, +}; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h new file mode 100644 index 00000000000..847b695b984 --- /dev/null +++ b/fs/hfsplus/xattr.h @@ -0,0 +1,60 @@ +/* + * linux/fs/hfsplus/xattr.h + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#ifndef _LINUX_HFSPLUS_XATTR_H +#define _LINUX_HFSPLUS_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler hfsplus_xattr_osx_handler; +extern const struct xattr_handler hfsplus_xattr_user_handler; +extern const struct xattr_handler hfsplus_xattr_trusted_handler; +/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ +/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler hfsplus_xattr_security_handler; + +extern const struct xattr_handler *hfsplus_xattr_handlers[]; + +int __hfsplus_setxattr(struct inode *inode, const char *name, +			const void *value, size_t size, int flags); + +static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, +			const void *value, size_t size, int flags) +{ +	return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +			void *value, size_t size); + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +int hfsplus_removexattr(struct dentry *dentry, const char *name); + +int hfsplus_init_security(struct inode *inode, struct inode *dir, +				const struct qstr *qstr); + +static inline int 
hfsplus_init_acl(struct inode *inode, struct inode *dir) +{ +	/*TODO: implement*/ +	return 0; +} + +static inline int hfsplus_init_inode_security(struct inode *inode, +						struct inode *dir, +						const struct qstr *qstr) +{ +	int err; + +	err = hfsplus_init_acl(inode, dir); +	if (!err) +		err = hfsplus_init_security(inode, dir, qstr); +	return err; +} + +#endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c new file mode 100644 index 00000000000..83b842f113c --- /dev/null +++ b/fs/hfsplus/xattr_security.c @@ -0,0 +1,104 @@ +/* + * linux/fs/hfsplus/xattr_security.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for storing security labels as extended attributes. + */ + +#include <linux/security.h> +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, +					void *buffer, size_t size, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_SECURITY_PREFIX); +	strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + +	return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, +		const void *buffer, size_t size, int flags, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_SECURITY_PREFIX); +	strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + +	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	/* +	 * This method is not used. +	 * hfsplus_listxattr() is used instead of generic_listxattr(). 
+	 */ +	return -EOPNOTSUPP; +} + +static int hfsplus_initxattrs(struct inode *inode, +				const struct xattr *xattr_array, +				void *fs_info) +{ +	const struct xattr *xattr; +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t xattr_name_len; +	int err = 0; + +	for (xattr = xattr_array; xattr->name != NULL; xattr++) { +		xattr_name_len = strlen(xattr->name); + +		if (xattr_name_len == 0) +			continue; + +		if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN > +				HFSPLUS_ATTR_MAX_STRLEN) +			return -EOPNOTSUPP; + +		strcpy(xattr_name, XATTR_SECURITY_PREFIX); +		strcpy(xattr_name + +			XATTR_SECURITY_PREFIX_LEN, xattr->name); +		memset(xattr_name + +			XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1); + +		err = __hfsplus_setxattr(inode, xattr_name, +					xattr->value, xattr->value_len, 0); +		if (err) +			break; +	} +	return err; +} + +int hfsplus_init_security(struct inode *inode, struct inode *dir, +				const struct qstr *qstr) +{ +	return security_inode_init_security(inode, dir, qstr, +					&hfsplus_initxattrs, NULL); +} + +const struct xattr_handler hfsplus_xattr_security_handler = { +	.prefix	= XATTR_SECURITY_PREFIX, +	.list	= hfsplus_security_listxattr, +	.get	= hfsplus_security_getxattr, +	.set	= hfsplus_security_setxattr, +}; diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c new file mode 100644 index 00000000000..426cee27754 --- /dev/null +++ b/fs/hfsplus/xattr_trusted.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for trusted extended attributes. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, +					void *buffer, size_t size, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_TRUSTED_PREFIX); +	strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + +	return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, +		const void *buffer, size_t size, int flags, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_TRUSTED_PREFIX); +	strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + +	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	/* +	 * This method is not used. +	 * hfsplus_listxattr() is used instead of generic_listxattr(). +	 */ +	return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_trusted_handler = { +	.prefix	= XATTR_TRUSTED_PREFIX, +	.list	= hfsplus_trusted_listxattr, +	.get	= hfsplus_trusted_getxattr, +	.set	= hfsplus_trusted_setxattr, +}; diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c new file mode 100644 index 00000000000..e34016561ae --- /dev/null +++ b/fs/hfsplus/xattr_user.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_user.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for user extended attributes. 
+ */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, +					void *buffer, size_t size, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_USER_PREFIX); +	strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + +	return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, +		const void *buffer, size_t size, int flags, int type) +{ +	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; +	size_t len = strlen(name); + +	if (!strcmp(name, "")) +		return -EINVAL; + +	if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) +		return -EOPNOTSUPP; + +	strcpy(xattr_name, XATTR_USER_PREFIX); +	strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + +	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, +		size_t list_size, const char *name, size_t name_len, int type) +{ +	/* +	 * This method is not used. +	 * hfsplus_listxattr() is used instead of generic_listxattr(). +	 */ +	return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_user_handler = { +	.prefix	= XATTR_USER_PREFIX, +	.list	= hfsplus_user_listxattr, +	.get	= hfsplus_user_getxattr, +	.set	= hfsplus_user_setxattr, +}; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 457addc5c91..fbabb906066 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -30,7 +30,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)  	return list_entry(inode, struct hostfs_inode_info, vfs_inode);  } -#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) +#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))  static int hostfs_d_delete(const struct dentry *dentry)  { @@ -861,14 +861,6 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)  }  static const struct inode_operations hostfs_iops = { -	.create		= hostfs_create, -	.link		= hostfs_link, -	.unlink		= hostfs_unlink, -	.symlink	= hostfs_symlink, -	.mkdir		= hostfs_mkdir, -	.rmdir		= hostfs_rmdir, -	.mknod		= hostfs_mknod, -	.rename		= hostfs_rename,  	.permission	= hostfs_permission,  	.setattr	= hostfs_setattr,  }; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 78e12b2e0ea..546f6d39713 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -25,7 +25,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)  	loff_t new_off = off + (whence == 1 ? 
filp->f_pos : 0);  	loff_t pos;  	struct quad_buffer_head qbh; -	struct inode *i = filp->f_path.dentry->d_inode; +	struct inode *i = file_inode(filp);  	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);  	struct super_block *s = i->i_sb; @@ -57,7 +57,7 @@ fail:  static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);  	struct quad_buffer_head qbh;  	struct hpfs_dirent *de; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 89d2a5803ae..9f9dbeceeee 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)  	return disk_secno;  } -static void hpfs_truncate(struct inode *i) +void hpfs_truncate(struct inode *i)  {  	if (IS_IMMUTABLE(i)) return /*-EPERM*/;  	hpfs_lock_assert(i->i_sb); @@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)  	return block_read_full_page(page,hpfs_get_block);  } +static void hpfs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		hpfs_truncate(inode); +	} +} +  static int hpfs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,  	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,  				hpfs_get_block,  				&hpfs_i(mapping->host)->mmu_private); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		hpfs_write_failed(mapping, pos + len);  	return ret;  } @@ -145,7 +152,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,  	retval = do_sync_write(file, buf, count, ppos);  	if (retval > 0) {  		hpfs_lock(file->f_path.dentry->d_sb); -		hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; +		hpfs_i(file_inode(file))->i_dirty = 1;  		hpfs_unlock(file->f_path.dentry->d_sb);  	}  	return retval; @@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =  const struct inode_operations hpfs_file_iops =  { -	.truncate	= hpfs_truncate,  	.setattr	= hpfs_setattr,  }; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 7102aaecc24..b7ae286646b 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,  /* file.c */  int hpfs_file_fsync(struct file *, loff_t, loff_t, int); +void hpfs_truncate(struct inode *);  extern const struct file_operations hpfs_file_ops;  extern const struct inode_operations hpfs_file_iops;  extern const struct address_space_operations hpfs_aops; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 804a9a842cb..9edeeb0ea97 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)  	/*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) {  		   Some unknown structures like ACL may be in fnode,  		   we'd better not overwrite them -		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); +		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino);  	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {  		
__le32 ea;  		if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { @@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			goto out_unlock; + +		truncate_setsize(inode, attr->ia_size); +		hpfs_truncate(inode);  	}  	setattr_copy(inode, attr); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 43b315f2002..74f55703be4 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -180,7 +180,7 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,  	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);  	ssize_t n; -	read = file->f_path.dentry->d_inode->i_fop->read; +	read = file_inode(file)->i_fop->read;  	if (!is_user)  		set_fs(KERNEL_DS); @@ -288,7 +288,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf,  	struct file *proc_file = data->proc_file;  	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); -	write = proc_file->f_path.dentry->d_inode->i_fop->write; +	write = file_inode(proc_file)->i_fop->write;  	return (*write)(proc_file, buf, len, ppos);  } @@ -513,7 +513,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)  	loff_t (*llseek)(struct file *, loff_t, int);  	loff_t ret; -	llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; +	llseek = file_inode(proc_file)->i_fop->llseek;  	if (llseek != NULL) {  		ret = (*llseek)(proc_file, off, where);  		if (ret < 0) @@ -561,7 +561,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)  				      });  	int err; -	readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; +	readdir = file_inode(proc_file)->i_fop->readdir;  	proc_file->f_pos = file->f_pos;  	err = (*readdir)(proc_file, &dirent, hppfs_filldir); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 78bde32ea95..7f94e0cbc69 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -97,7 +97,7 @@ static void huge_pagevec_release(struct pagevec *pvec)  static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	loff_t len, vma_len;  	int ret;  	struct hstate *h = hstate_file(file); @@ -918,16 +918,25 @@ static int get_hstate_idx(int page_size_log)  	return h - hstates;  } +static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) +{ +	return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", +				dentry->d_name.name); +} + +static struct dentry_operations anon_ops = { +	.d_dname = hugetlb_dname +}; +  struct file *hugetlb_file_setup(const char *name, unsigned long addr,  				size_t size, vm_flags_t acctflag,  				struct user_struct **user,  				int creat_flags, int page_size_log)  { -	int error = -ENOMEM; -	struct file *file; +	struct file *file = ERR_PTR(-ENOMEM);  	struct inode *inode;  	struct path path; -	struct dentry *root; +	struct super_block *sb;  	struct qstr quick_string;  	struct hstate *hstate;  	unsigned long num_pages; @@ -955,17 +964,18 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,  		}  	} -	root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; +	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;  	quick_string.name = name;  	quick_string.len = strlen(quick_string.name);  	quick_string.hash = 0; -	path.dentry = d_alloc(root, 
&quick_string); +	path.dentry = d_alloc_pseudo(sb, &quick_string);  	if (!path.dentry)  		goto out_shm_unlock; +	d_set_d_op(path.dentry, &anon_ops);  	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); -	error = -ENOSPC; -	inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); +	file = ERR_PTR(-ENOSPC); +	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);  	if (!inode)  		goto out_dentry; @@ -973,7 +983,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,  	size += addr & ~huge_page_mask(hstate);  	num_pages = ALIGN(size, huge_page_size(hstate)) >>  			huge_page_shift(hstate); -	error = -ENOMEM; +	file = ERR_PTR(-ENOMEM);  	if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))  		goto out_inode; @@ -981,10 +991,9 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,  	inode->i_size = size;  	clear_nlink(inode); -	error = -ENFILE;  	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,  			&hugetlbfs_file_operations); -	if (!file) +	if (IS_ERR(file))  		goto out_dentry; /* inode is already attached */  	return file; @@ -998,7 +1007,7 @@ out_shm_unlock:  		user_shm_unlock(size, *user);  		*user = NULL;  	} -	return ERR_PTR(error); +	return file;  }  static int __init init_hugetlbfs_fs(void) diff --git a/fs/inode.c b/fs/inode.c index 14084b72b25..f5f7c06c36f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -798,11 +798,10 @@ static struct inode *find_inode(struct super_block *sb,  				int (*test)(struct inode *, void *),  				void *data)  { -	struct hlist_node *node;  	struct inode *inode = NULL;  repeat: -	hlist_for_each_entry(inode, node, head, i_hash) { +	hlist_for_each_entry(inode, head, i_hash) {  		spin_lock(&inode->i_lock);  		if (inode->i_sb != sb) {  			spin_unlock(&inode->i_lock); @@ -830,11 +829,10 @@ repeat:  static struct inode *find_inode_fast(struct super_block *sb,  				struct hlist_head *head, unsigned long ino)  { -	struct hlist_node *node;  	struct inode *inode = NULL;  repeat: -	hlist_for_each_entry(inode, node, head, i_hash) { +	hlist_for_each_entry(inode, head, i_hash) {  		spin_lock(&inode->i_lock);  		if (inode->i_ino != ino) {  			spin_unlock(&inode->i_lock); @@ -1132,11 +1130,10 @@ EXPORT_SYMBOL(iget_locked);  static int test_inode_iunique(struct super_block *sb, unsigned long ino)  {  	struct hlist_head *b = inode_hashtable + hash(sb, ino); -	struct hlist_node *node;  	struct inode *inode;  	spin_lock(&inode_hash_lock); -	hlist_for_each_entry(inode, node, b, i_hash) { +	hlist_for_each_entry(inode, b, i_hash) {  		if (inode->i_ino == ino && inode->i_sb == sb) {  			spin_unlock(&inode_hash_lock);  			return 0; @@ -1291,10 +1288,9 @@ int insert_inode_locked(struct inode *inode)  	struct hlist_head *head = inode_hashtable + hash(sb, ino);  	while (1) { -		struct hlist_node *node;  		struct inode *old = NULL;  		spin_lock(&inode_hash_lock); -		hlist_for_each_entry(old, node, head, i_hash) { +		hlist_for_each_entry(old, head, i_hash) {  			if (old->i_ino != ino)  				continue;  			if (old->i_sb != sb) @@ -1306,7 +1302,7 @@ int insert_inode_locked(struct inode *inode)  			}  			break;  		} -		if (likely(!node)) { +		if (likely(!old)) {  			spin_lock(&inode->i_lock);  			inode->i_state |= I_NEW;  			hlist_add_head(&inode->i_hash, head); @@ -1334,11 +1330,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,  	struct hlist_head *head = inode_hashtable + hash(sb, hashval);  	while (1) { -		struct hlist_node *node;  		struct inode *old = NULL;  		spin_lock(&inode_hash_lock); -		
hlist_for_each_entry(old, node, head, i_hash) { +		hlist_for_each_entry(old, head, i_hash) {  			if (old->i_sb != sb)  				continue;  			if (!test(old, data)) @@ -1350,7 +1345,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,  			}  			break;  		} -		if (likely(!node)) { +		if (likely(!old)) {  			spin_lock(&inode->i_lock);  			inode->i_state |= I_NEW;  			hlist_add_head(&inode->i_hash, head); @@ -1655,7 +1650,7 @@ EXPORT_SYMBOL(file_remove_suid);  int file_update_time(struct file *file)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct timespec now;  	int sync_it = 0;  	int ret; diff --git a/fs/internal.h b/fs/internal.h index 2f6af7f645e..507141fceb9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -69,7 +69,7 @@ extern void __mnt_drop_write_file(struct file *);  /*   * fs_struct.c   */ -extern void chroot_fs_refs(struct path *, struct path *); +extern void chroot_fs_refs(const struct path *, const struct path *);  /*   * file_table.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 3bdad6d1f26..fd507fb460f 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -175,7 +175,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)  	struct fiemap fiemap;  	struct fiemap __user *ufiemap = (struct fiemap __user *) arg;  	struct fiemap_extent_info fieinfo = { 0, }; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	u64 len;  	int error; @@ -424,7 +424,7 @@ EXPORT_SYMBOL(generic_block_fiemap);   */  int ioctl_preallocate(struct file *filp, void __user *argp)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct space_resv sr;  	if (copy_from_user(&sr, argp, sizeof(sr))) @@ -449,7 +449,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp)  static int file_ioctl(struct file *filp, unsigned int cmd,  		unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int __user *p = (int __user *)arg;  	switch (cmd) { @@ -512,7 +512,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,  static int ioctl_fsfreeze(struct file *filp)  { -	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; +	struct super_block *sb = file_inode(filp)->i_sb;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -527,7 +527,7 @@ static int ioctl_fsfreeze(struct file *filp)  static int ioctl_fsthaw(struct file *filp)  { -	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; +	struct super_block *sb = file_inode(filp)->i_sb;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -548,7 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,  {  	int error = 0;  	int __user *argp = (int __user *)arg; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	switch (cmd) {  	case FIOCLEX: diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 0b3fa7974fa..592e5115a56 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -296,7 +296,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,   */  static int zisofs_readpage(struct file *file, struct page *page)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct address_space *mapping = inode->i_mapping;  	int err;  	int i, pcount, full_page; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index f20437c068a..a7d5c3c3d4e 
100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -253,7 +253,7 @@ static int isofs_readdir(struct file *filp,  	int result;  	char *tmpname;  	struct iso_directory_record *tmpde; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	tmpname = (char *)__get_free_page(GFP_KERNEL);  	if (tmpname == NULL) diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 2b4f2358ead..12088d8de3f 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -125,10 +125,10 @@ isofs_export_encode_fh(struct inode *inode,  	 */  	if (parent && (len < 5)) {  		*max_len = 5; -		return 255; +		return FILEID_INVALID;  	} else if (len < 3) {  		*max_len = 3; -		return 255; +		return FILEID_INVALID;  	}  	len = 3; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a2862339323..81cc7eaff86 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target)  	 * currently running transaction (if it exists).  Otherwise,  	 * the target tid must be an old one.  	 */ -	if (journal->j_running_transaction && +	if (journal->j_commit_request != target && +	    journal->j_running_transaction &&  	    journal->j_running_transaction->t_tid == target) {  		/*  		 * We want a new commit: OK, mark the request and wakeup the diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3091d42992f..750c70148ef 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -435,7 +435,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)  	trace_jbd2_commit_locking(journal, commit_transaction);  	stats.run.rs_wait = commit_transaction->t_max_wait; +	stats.run.rs_request_delay = 0;  	stats.run.rs_locked = jiffies; +	if (commit_transaction->t_requested) +		stats.run.rs_request_delay = +			jbd2_time_diff(commit_transaction->t_requested, +				       stats.run.rs_locked);  	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,  					      stats.run.rs_locked); @@ -1116,7 +1121,10 @@ restart_loop:  	 */  	spin_lock(&journal->j_history_lock);  	journal->j_stats.ts_tid++; +	if (commit_transaction->t_requested) +		journal->j_stats.ts_requested++;  	journal->j_stats.run.rs_wait += stats.run.rs_wait; +	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;  	journal->j_stats.run.rs_running += stats.run.rs_running;  	journal->j_stats.run.rs_locked += stats.run.rs_locked;  	journal->j_stats.run.rs_flushing += stats.run.rs_flushing; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dbf41f9452d..ed10991ab00 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -35,7 +35,6 @@  #include <linux/kthread.h>  #include <linux/poison.h>  #include <linux/proc_fs.h> -#include <linux/debugfs.h>  #include <linux/seq_file.h>  #include <linux/math64.h>  #include <linux/hash.h> @@ -51,6 +50,14 @@  #include <asm/uaccess.h>  #include <asm/page.h> +#ifdef CONFIG_JBD2_DEBUG +ushort jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); +MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); +#endif +  EXPORT_SYMBOL(jbd2_journal_extend);  EXPORT_SYMBOL(jbd2_journal_stop);  EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -513,6 +520,10 @@ int __jbd2_log_space_left(journal_t *journal)   */  int __jbd2_log_start_commit(journal_t *journal, tid_t target)  { +	/* Return if the txn has already requested to be committed */ +	if (journal->j_commit_request == target) +		return 0; +  	/*  	 * The only transaction we can possibly 
wait upon is the  	 * currently running transaction (if it exists).  Otherwise, @@ -529,6 +540,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)  		jbd_debug(1, "JBD2: requesting commit %d/%d\n",  			  journal->j_commit_request,  			  journal->j_commit_sequence); +		journal->j_running_transaction->t_requested = jiffies;  		wake_up(&journal->j_wait_commit);  		return 1;  	} else if (!tid_geq(journal->j_commit_request, target)) @@ -894,13 +906,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)  	if (v != SEQ_START_TOKEN)  		return 0; -	seq_printf(seq, "%lu transaction, each up to %u blocks\n", -			s->stats->ts_tid, -			s->journal->j_max_transaction_buffers); +	seq_printf(seq, "%lu transactions (%lu requested), " +		   "each up to %u blocks\n", +		   s->stats->ts_tid, s->stats->ts_requested, +		   s->journal->j_max_transaction_buffers);  	if (s->stats->ts_tid == 0)  		return 0;  	seq_printf(seq, "average: \n  %ums waiting for transaction\n",  	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); +	seq_printf(seq, "  %ums request delay\n", +	    (s->stats->ts_requested == 0) ? 0 : +	    jiffies_to_msecs(s->stats->run.rs_request_delay / +			     s->stats->ts_requested));  	seq_printf(seq, "  %ums running transaction\n",  	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));  	seq_printf(seq, "  %ums transaction was being locked\n", @@ -2485,45 +2502,6 @@ restart:  	spin_unlock(&journal->j_list_lock);  } -/* - * debugfs tunables - */ -#ifdef CONFIG_JBD2_DEBUG -u8 jbd2_journal_enable_debug __read_mostly; -EXPORT_SYMBOL(jbd2_journal_enable_debug); - -#define JBD2_DEBUG_NAME "jbd2-debug" - -static struct dentry *jbd2_debugfs_dir; -static struct dentry *jbd2_debug; - -static void __init jbd2_create_debugfs_entry(void) -{ -	jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); -	if (jbd2_debugfs_dir) -		jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, -					       S_IRUGO | S_IWUSR, -					       jbd2_debugfs_dir, -					       &jbd2_journal_enable_debug); -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ -	debugfs_remove(jbd2_debug); -	debugfs_remove(jbd2_debugfs_dir); -} - -#else - -static void __init jbd2_create_debugfs_entry(void) -{ -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ -} - -#endif  #ifdef CONFIG_PROC_FS @@ -2609,7 +2587,6 @@ static int __init journal_init(void)  	ret = journal_init_caches();  	if (ret == 0) { -		jbd2_create_debugfs_entry();  		jbd2_create_jbd_stats_proc_entry();  	} else {  		jbd2_journal_destroy_caches(); @@ -2624,7 +2601,6 @@ static void __exit journal_exit(void)  	if (n)  		printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);  #endif -	jbd2_remove_debugfs_entry();  	jbd2_remove_jbd_stats_proc_entry();  	jbd2_journal_destroy_caches();  } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 42f6615af0a..d6ee5aed56b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,8 @@  #include <linux/bug.h>  #include <linux/module.h> +#include <trace/events/jbd2.h> +  static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);  static void __jbd2_journal_unfile_buffer(struct journal_head *jh); @@ -100,6 +102,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)  	journal->j_running_transaction = transaction;  	transaction->t_max_wait = 0;  	transaction->t_start = jiffies; +	transaction->t_requested = 0;  	return transaction;  } @@ -209,7 +212,8 @@ repeat:  		if (!new_transaction)  			goto alloc_transaction;  		
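+		/*
+		 * Do not start a new transaction on a barriered journal
+		 * (j_barrier_count != 0); the new handle must wait until
+		 * the barrier is released.
+		 */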
write_lock(&journal->j_state_lock); -		if (!journal->j_running_transaction) { +		if (!journal->j_running_transaction && +		    !journal->j_barrier_count) {  			jbd2_get_transaction(journal, new_transaction);  			new_transaction = NULL;  		} @@ -305,6 +309,8 @@ repeat:  	 */  	update_t_max_wait(transaction, ts);  	handle->h_transaction = transaction; +	handle->h_requested_credits = nblocks; +	handle->h_start_jiffies = jiffies;  	atomic_inc(&transaction->t_updates);  	atomic_inc(&transaction->t_handle_count);  	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", @@ -351,7 +357,8 @@ static handle_t *new_handle(int nblocks)   * Return a pointer to a newly allocated handle, or an ERR_PTR() value   * on failure.   */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, +			      unsigned int type, unsigned int line_no)  {  	handle_t *handle = journal_current_handle();  	int err; @@ -375,8 +382,13 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)  	if (err < 0) {  		jbd2_free_handle(handle);  		current->journal_info = NULL; -		handle = ERR_PTR(err); +		return ERR_PTR(err);  	} +	handle->h_type = type; +	handle->h_line_no = line_no; +	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, +				handle->h_transaction->t_tid, type, +				line_no, nblocks);  	return handle;  }  EXPORT_SYMBOL(jbd2__journal_start); @@ -384,7 +396,7 @@ EXPORT_SYMBOL(jbd2__journal_start);  handle_t *jbd2_journal_start(journal_t *journal, int nblocks)  { -	return jbd2__journal_start(journal, nblocks, GFP_NOFS); +	return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);  }  EXPORT_SYMBOL(jbd2_journal_start); @@ -446,7 +458,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)  		goto unlock;  	} +	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, +				 handle->h_transaction->t_tid, +				 handle->h_type, handle->h_line_no, +				 handle->h_buffer_credits, +				 nblocks); +  	handle->h_buffer_credits += nblocks; +	handle->h_requested_credits += nblocks;  	atomic_add(nblocks, &transaction->t_outstanding_credits);  	result = 0; @@ -1375,6 +1394,13 @@ int jbd2_journal_stop(handle_t *handle)  	}  	jbd_debug(4, "Handle %p going down\n", handle); +	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, +				handle->h_transaction->t_tid, +				handle->h_type, handle->h_line_no, +				jiffies - handle->h_start_jiffies, +				handle->h_sync, handle->h_requested_credits, +				(handle->h_requested_credits - +				 handle->h_buffer_credits));  	/*  	 * Implement synchronous transaction batching.  If the handle @@ -1839,7 +1865,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,  	BUFFER_TRACE(bh, "entry"); -retry:  	/*  	 * It is safe to proceed here without the j_list_lock because the  	 * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1934,14 +1959,11 @@ retry:  		 * for commit and try again.  		 */  		if (partial_page) { -			tid_t tid = journal->j_committing_transaction->t_tid; -  			jbd2_journal_put_journal_head(jh);  			spin_unlock(&journal->j_list_lock);  			jbd_unlock_bh_state(bh);  			write_unlock(&journal->j_state_lock); -			jbd2_log_wait_commit(journal, tid); -			goto retry; +			return -EBUSY;  		}  		/*  		 * OK, buffer won't be reachable after truncate. We just set @@ -2002,21 +2024,23 @@ zap_buffer_unlocked:   * @page:    page to flush   * @offset:  length of page to invalidate.   
* - * Reap page buffers containing data after offset in page. - * + * Reap page buffers containing data after offset in page. Can return -EBUSY + * if buffers are part of the committing transaction and the page is straddling + * i_size. Caller then has to wait for current commit and try again.   */ -void jbd2_journal_invalidatepage(journal_t *journal, -		      struct page *page, -		      unsigned long offset) +int jbd2_journal_invalidatepage(journal_t *journal, +				struct page *page, +				unsigned long offset)  {  	struct buffer_head *head, *bh, *next;  	unsigned int curr_off = 0;  	int may_free = 1; +	int ret = 0;  	if (!PageLocked(page))  		BUG();  	if (!page_has_buffers(page)) -		return; +		return 0;  	/* We will potentially be playing with lists other than just the  	 * data lists (especially for journaled data mode), so be @@ -2030,9 +2054,11 @@ void jbd2_journal_invalidatepage(journal_t *journal,  		if (offset <= curr_off) {  			/* This block is wholly outside the truncation point */  			lock_buffer(bh); -			may_free &= journal_unmap_buffer(journal, bh, -							 offset > 0); +			ret = journal_unmap_buffer(journal, bh, offset > 0);  			unlock_buffer(bh); +			if (ret < 0) +				return ret; +			may_free &= ret;  		}  		curr_off = next_off;  		bh = next; @@ -2043,6 +2069,7 @@ void jbd2_journal_invalidatepage(journal_t *journal,  		if (may_free && try_to_free_buffers(page))  			J_ASSERT(!page_has_buffers(page));  	} +	return 0;  }  /* diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig index 6ae169cd8fa..d8bb6c411e9 100644 --- a/fs/jffs2/Kconfig +++ b/fs/jffs2/Kconfig @@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY  	  write-buffer, and check for errors.  config JFFS2_SUMMARY -	bool "JFFS2 summary support (EXPERIMENTAL)" -	depends on JFFS2_FS && EXPERIMENTAL +	bool "JFFS2 summary support" +	depends on JFFS2_FS  	default n  	help  	  This feature makes it possible to use summary information @@ -63,8 +63,8 @@ config JFFS2_SUMMARY  	  If unsure, say 'N'.  config JFFS2_FS_XATTR -	bool "JFFS2 XATTR support (EXPERIMENTAL)" -	depends on JFFS2_FS && EXPERIMENTAL +	bool "JFFS2 XATTR support" +	depends on JFFS2_FS  	default n  	help  	  Extended attributes are name:value pairs associated with inodes by @@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY  	  successful one.  config JFFS2_CMODE_SIZE -	bool "size (EXPERIMENTAL)" +	bool "size"  	help  	  Tries all compressors and chooses the one which has the smallest  	  result. 
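The jbd2_journal_invalidatepage() rework in the hunks above moves the wait-for-commit retry out of jbd2 and into the caller. A minimal caller-side sketch of the new -EBUSY contract, assuming kernel context (linux/jbd2.h); the function name fs_invalidatepage() is hypothetical, while jbd2_log_wait_commit() and the j_state_lock usage mirror the retry loop removed from journal_unmap_buffer() above:

	static void fs_invalidatepage(journal_t *journal, struct page *page,
				      unsigned long offset)
	{
		while (jbd2_journal_invalidatepage(journal, page, offset) == -EBUSY) {
			tid_t tid = 0;

			/*
			 * Buffers on a page straddling i_size still belong
			 * to the committing transaction: wait for that
			 * commit to finish, then retry the invalidation.
			 */
			read_lock(&journal->j_state_lock);
			if (journal->j_committing_transaction)
				tid = journal->j_committing_transaction->t_tid;
			read_unlock(&journal->j_state_lock);
			jbd2_log_wait_commit(journal, tid);
		}
	}

Real callers additionally have to drop any locks of their own before sleeping in jbd2_log_wait_commit(); the sketch omits that.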
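Separately, jbd2__journal_start() in the transaction.c hunk above now takes a handle type and a call-site line number, which feed the new handle tracepoints and the per-journal request-delay statistics. A sketch of how a client filesystem might wrap the extended API; the macro and the MYFS_HT_WRITE constant are hypothetical, only the jbd2__journal_start() signature comes from this diff:

	/* hypothetical handle-type tag, reported by the jbd2 handle tracepoints */
	#define MYFS_HT_WRITE	1

	/* keep the old one-line calling convention at every call site */
	#define myfs_journal_start(journal, nblocks) \
		jbd2__journal_start((journal), (nblocks), GFP_NOFS, \
				    MYFS_HT_WRITE, __LINE__)

Passing __LINE__ at the wrapper's expansion site is what lets the trace output distinguish individual handle call sites without any extra bookkeeping.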
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index ad7774d3209..acd46a4160c 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -117,12 +117,12 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,  static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	struct jffs2_inode_info *f; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct jffs2_full_dirent *fd;  	unsigned long offset, curofs;  	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", -		  filp->f_path.dentry->d_inode->i_ino); +		  file_inode(filp)->i_ino);  	f = JFFS2_INODE_INFO(inode); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 9d3afd157f9..dd7442c5835 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)  	    iattr->ia_size != i_size_read(inode)) {  		inode_dio_wait(inode); -		rc = vmtruncate(inode, iattr->ia_size); +		rc = inode_newsize_ok(inode, iattr->ia_size);  		if (rc)  			return rc; + +		truncate_setsize(inode, iattr->ia_size); +		jfs_truncate(inode);  	}  	setattr_copy(inode, iattr); @@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)  }  const struct inode_operations jfs_file_inode_operations = { -	.truncate	= jfs_truncate,  	.setxattr	= jfs_setxattr,  	.getxattr	= jfs_getxattr,  	.listxattr	= jfs_listxattr, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 4692bf3ca8c..b7dc47ba675 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,  	return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);  } +static void jfs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		jfs_truncate(inode); +	} +} +  static int jfs_write_begin(struct file *file, struct address_space *mapping,  				loff_t pos, unsigned len, unsigned flags,  				struct page **pagep, void **fsdata) @@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,  	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,  				jfs_get_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		jfs_write_failed(mapping, pos + len);  	return ret;  } @@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,  	const struct iovec *iov, loff_t offset, unsigned long nr_segs)  {  	struct file *file = iocb->ki_filp; +	struct address_space *mapping = file->f_mapping;  	struct inode *inode = file->f_mapping->host;  	ssize_t ret; @@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,  		loff_t end = offset + iov_length(iov, nr_segs);  		if (end > isize) -			vmtruncate(inode, isize); +			jfs_write_failed(mapping, end);  	}  	return ret; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index bc555ff417e..93a1232894f 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -58,7 +58,7 @@ static long jfs_map_ext2(unsigned long flags, int from)  long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct jfs_inode_info *jfs_inode = JFS_IP(inode);  	unsigned int flags; diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 9197a1b0d02..0ddbeceafc6 100644 --- 
a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3004,7 +3004,7 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)   */  int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *ip = filp->f_path.dentry->d_inode; +	struct inode *ip = file_inode(filp);  	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;  	int rc = 0;  	loff_t dtpos;	/* legacy OS/2 style position */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1a543be09c7..060ba638bec 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)  	/*  	 * If we really return the number of allocated & free inodes, some  	 * applications will fail because they won't see enough free inodes. -	 * We'll try to calculate some guess as to how may inodes we can +	 * We'll try to calculate some guess as to how many inodes we can  	 * really allocate  	 *  	 * buf->f_files = atomic_read(&imap->im_numinos); diff --git a/fs/libfs.c b/fs/libfs.c index 35fc6e74cd8..916da8c4158 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)  	struct inode *inode = dentry->d_inode;  	int error; -	WARN_ON_ONCE(inode->i_op->truncate); -  	error = inode_change_ok(inode, iattr);  	if (error)  		return error; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index ca0a0800144..0796c45d0d4 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -11,7 +11,7 @@  #include <linux/slab.h>  #include <linux/time.h>  #include <linux/nfs_fs.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/svc.h>  #include <linux/lockd/lockd.h>  #include <linux/kthread.h> @@ -178,7 +178,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)  			continue;  		if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))  			continue; -		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) +		if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0)  			continue;  		/* Alright, we found a lock. Set the return status  		 * and wake up the caller @@ -220,10 +220,19 @@ reclaimer(void *ptr)  {  	struct nlm_host	  *host = (struct nlm_host *) ptr;  	struct nlm_wait	  *block; +	struct nlm_rqst   *req;  	struct file_lock *fl, *next;  	u32 nsmstate;  	struct net *net = host->net; +	req = kmalloc(sizeof(*req), GFP_KERNEL); +	if (!req) { +		printk(KERN_ERR "lockd: reclaimer unable to alloc memory." 
+				" Locks for %s won't be reclaimed!\n", +				host->h_name); +		return 0; +	} +  	allow_signal(SIGKILL);  	down_write(&host->h_rwsem); @@ -253,7 +262,7 @@ restart:  		 */  		if (signalled())  			continue; -		if (nlmclnt_reclaim(host, fl) != 0) +		if (nlmclnt_reclaim(host, fl, req) != 0)  			continue;  		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);  		if (host->h_nsmstate != nsmstate) { @@ -279,5 +288,6 @@ restart:  	/* Release host handle after use */  	nlmclnt_release_host(host);  	lockd_down(net); +	kfree(req);  	return 0;  } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 54f9e6ce043..7e529c3c45c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -127,7 +127,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)  	struct nlm_lock	*lock = &argp->lock;  	nlmclnt_next_cookie(&argp->cookie); -	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); +	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));  	lock->caller  = utsname()->nodename;  	lock->oh.data = req->a_owner;  	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", @@ -550,6 +550,9 @@ again:  		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);  		if (status < 0)  			break; +		/* Resend the blocking lock request after a server reboot */ +		if (resp->status ==  nlm_lck_denied_grace_period) +			continue;  		if (resp->status != nlm_lck_blocked)  			break;  	} @@ -615,17 +618,15 @@ out_unlock:   * RECLAIM: Try to reclaim a lock   */  int -nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, +		struct nlm_rqst *req)  { -	struct nlm_rqst reqst, *req;  	int		status; -	req = &reqst;  	memset(req, 0, sizeof(*req));  	locks_init_lock(&req->a_args.lock.fl);  	locks_init_lock(&req->a_res.lock.fl);  	req->a_host  = host; -	req->a_flags = 0;  	/* Set up the argument struct */  	nlmclnt_setlockargs(req, fl); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0e17090c310..969d589c848 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -13,6 +13,7 @@  #include <linux/in.h>  #include <linux/in6.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/svc.h>  #include <linux/lockd/lockd.h>  #include <linux/mutex.h> @@ -32,15 +33,15 @@  static struct hlist_head	nlm_server_hosts[NLM_HOST_NRHASH];  static struct hlist_head	nlm_client_hosts[NLM_HOST_NRHASH]; -#define for_each_host(host, pos, chain, table) \ +#define for_each_host(host, chain, table) \  	for ((chain) = (table); \  	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ -		hlist_for_each_entry((host), (pos), (chain), h_hash) +		hlist_for_each_entry((host), (chain), h_hash) -#define for_each_host_safe(host, pos, next, chain, table) \ +#define for_each_host_safe(host, next, chain, table) \  	for ((chain) = (table); \  	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ -		hlist_for_each_entry_safe((host), (pos), (next), \ +		hlist_for_each_entry_safe((host), (next), \  						(chain), h_hash)  static unsigned long		nrhosts; @@ -225,7 +226,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,  		.net		= net,  	};  	struct hlist_head *chain; -	struct hlist_node *pos;  	struct nlm_host	*host;  	struct nsm_handle *nsm = NULL;  	struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -237,7 +237,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,  	mutex_lock(&nlm_host_mutex);  	chain = 
&nlm_client_hosts[nlm_hash_address(sap)]; -	hlist_for_each_entry(host, pos, chain, h_hash) { +	hlist_for_each_entry(host, chain, h_hash) {  		if (host->net != net)  			continue;  		if (!rpc_cmp_addr(nlm_addr(host), sap)) @@ -322,7 +322,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,  				    const size_t hostname_len)  {  	struct hlist_head *chain; -	struct hlist_node *pos;  	struct nlm_host	*host = NULL;  	struct nsm_handle *nsm = NULL;  	struct sockaddr *src_sap = svc_daddr(rqstp); @@ -350,7 +349,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,  		nlm_gc_hosts(net);  	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; -	hlist_for_each_entry(host, pos, chain, h_hash) { +	hlist_for_each_entry(host, chain, h_hash) {  		if (host->net != net)  			continue;  		if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) @@ -515,10 +514,9 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,  {  	struct nlm_host *host;  	struct hlist_head *chain; -	struct hlist_node *pos;  	mutex_lock(&nlm_host_mutex); -	for_each_host(host, pos, chain, cache) { +	for_each_host(host, chain, cache) {  		if (host->h_nsmhandle == nsm  		    && host->h_nsmstate != info->state) {  			host->h_nsmstate = info->state; @@ -570,7 +568,6 @@ void nlm_host_rebooted(const struct nlm_reboot *info)  static void nlm_complain_hosts(struct net *net)  {  	struct hlist_head *chain; -	struct hlist_node *pos;  	struct nlm_host	*host;  	if (net) { @@ -587,7 +584,7 @@ static void nlm_complain_hosts(struct net *net)  		dprintk("lockd: %lu hosts left:\n", nrhosts);  	} -	for_each_host(host, pos, chain, nlm_server_hosts) { +	for_each_host(host, chain, nlm_server_hosts) {  		if (net && host->net != net)  			continue;  		dprintk("       %s (cnt %d use %d exp %ld net %p)\n", @@ -600,14 +597,13 @@ void  nlm_shutdown_hosts_net(struct net *net)  {  	struct hlist_head *chain; -	struct hlist_node *pos;  	struct nlm_host	*host;  	mutex_lock(&nlm_host_mutex);  	/* First, make all hosts eligible for gc */  	dprintk("lockd: nuking all hosts in net %p...\n", net); -	for_each_host(host, pos, chain, nlm_server_hosts) { +	for_each_host(host, chain, nlm_server_hosts) {  		if (net && host->net != net)  			continue;  		host->h_expires = jiffies - 1; @@ -644,11 +640,11 @@ static void  nlm_gc_hosts(struct net *net)  {  	struct hlist_head *chain; -	struct hlist_node *pos, *next; +	struct hlist_node *next;  	struct nlm_host	*host;  	dprintk("lockd: host garbage collection for net %p\n", net); -	for_each_host(host, pos, chain, nlm_server_hosts) { +	for_each_host(host, chain, nlm_server_hosts) {  		if (net && host->net != net)  			continue;  		host->h_inuse = 0; @@ -657,7 +653,7 @@ nlm_gc_hosts(struct net *net)  	/* Mark all hosts that hold locks, blocks or shares */  	nlmsvc_mark_resources(net); -	for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { +	for_each_host_safe(host, next, chain, nlm_server_hosts) {  		if (net && host->net != net)  			continue;  		if (atomic_read(&host->h_count) || host->h_inuse diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3c2cfc68363..1812f026960 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -12,6 +12,7 @@  #include <linux/slab.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/xprtsock.h>  #include <linux/sunrpc/svc.h>  #include <linux/lockd/lockd.h> diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 8d80c990dff..e703318c41d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -406,8 +406,8 @@ nlmsvc_lock(struct 
svc_rqst *rqstp, struct nlm_file *file,  	__be32			ret;  	dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", -				file->f_file->f_path.dentry->d_inode->i_sb->s_id, -				file->f_file->f_path.dentry->d_inode->i_ino, +				file_inode(file->f_file)->i_sb->s_id, +				file_inode(file->f_file)->i_ino,  				lock->fl.fl_type, lock->fl.fl_pid,  				(long long)lock->fl.fl_start,  				(long long)lock->fl.fl_end, @@ -513,8 +513,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,  	__be32			ret;  	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", -				file->f_file->f_path.dentry->d_inode->i_sb->s_id, -				file->f_file->f_path.dentry->d_inode->i_ino, +				file_inode(file->f_file)->i_sb->s_id, +				file_inode(file->f_file)->i_ino,  				lock->fl.fl_type,  				(long long)lock->fl.fl_start,  				(long long)lock->fl.fl_end); @@ -606,8 +606,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)  	int	error;  	dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", -				file->f_file->f_path.dentry->d_inode->i_sb->s_id, -				file->f_file->f_path.dentry->d_inode->i_ino, +				file_inode(file->f_file)->i_sb->s_id, +				file_inode(file->f_file)->i_ino,  				lock->fl.fl_pid,  				(long long)lock->fl.fl_start,  				(long long)lock->fl.fl_end); @@ -635,8 +635,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l  	int status = 0;  	dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", -				file->f_file->f_path.dentry->d_inode->i_sb->s_id, -				file->f_file->f_path.dentry->d_inode->i_ino, +				file_inode(file->f_file)->i_sb->s_id, +				file_inode(file->f_file)->i_ino,  				lock->fl.fl_pid,  				(long long)lock->fl.fl_start,  				(long long)lock->fl.fl_end); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0deb5f6c9dd..97e87415b14 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -13,7 +13,7 @@  #include <linux/slab.h>  #include <linux/mutex.h>  #include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/nfsd/nfsfh.h>  #include <linux/nfsd/export.h>  #include <linux/lockd/lockd.h> @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)  static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)  { -	struct inode *inode = file->f_file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file->f_file);  	dprintk("lockd: %s %s/%ld\n",  		msg, inode->i_sb->s_id, inode->i_ino); @@ -84,7 +84,6 @@ __be32  nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,  					struct nfs_fh *f)  { -	struct hlist_node *pos;  	struct nlm_file	*file;  	unsigned int	hash;  	__be32		nfserr; @@ -96,7 +95,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,  	/* Lock file table */  	mutex_lock(&nlm_file_mutex); -	hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) +	hlist_for_each_entry(file, &nlm_files[hash], f_list)  		if (!nfs_compare_fh(&file->f_handle, f))  			goto found; @@ -248,13 +247,13 @@ static int  nlm_traverse_files(void *data, nlm_host_match_fn_t match,  		int (*is_failover_file)(void *data, struct nlm_file *file))  { -	struct hlist_node *pos, *next; +	struct hlist_node *next;  	struct nlm_file	*file;  	int i, ret = 0;  	mutex_lock(&nlm_file_mutex);  	for (i = 0; i < FILE_NRHASH; i++) { -		hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { +		hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) {  			if (is_failover_file && !is_failover_file(data, 
file))  				continue;  			file->f_count++; diff --git a/fs/locks.c b/fs/locks.c index a94e331a52a..cb424a4fed7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -334,7 +334,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,  		start = filp->f_pos;  		break;  	case SEEK_END: -		start = i_size_read(filp->f_path.dentry->d_inode); +		start = i_size_read(file_inode(filp));  		break;  	default:  		return -EINVAL; @@ -384,7 +384,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,  		start = filp->f_pos;  		break;  	case SEEK_END: -		start = i_size_read(filp->f_path.dentry->d_inode); +		start = i_size_read(file_inode(filp));  		break;  	default:  		return -EINVAL; @@ -627,7 +627,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)  	struct file_lock *cfl;  	lock_flocks(); -	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { +	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {  		if (!IS_POSIX(cfl))  			continue;  		if (posix_locks_conflict(fl, cfl)) @@ -708,7 +708,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)  {  	struct file_lock *new_fl = NULL;  	struct file_lock **before; -	struct inode * inode = filp->f_path.dentry->d_inode; +	struct inode * inode = file_inode(filp);  	int error = 0;  	int found = 0; @@ -1002,7 +1002,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str  int posix_lock_file(struct file *filp, struct file_lock *fl,  			struct file_lock *conflock)  { -	return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); +	return __posix_lock_file(file_inode(filp), fl, conflock);  }  EXPORT_SYMBOL(posix_lock_file); @@ -1326,8 +1326,8 @@ int fcntl_getlease(struct file *filp)  	int type = F_UNLCK;  	lock_flocks(); -	time_out_leases(filp->f_path.dentry->d_inode); -	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); +	time_out_leases(file_inode(filp)); +	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);  			fl = fl->fl_next) {  		if (fl->fl_file == filp) {  			type = target_leasetype(fl); @@ -1843,7 +1843,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,  	if (copy_from_user(&flock, l, sizeof(flock)))  		goto out; -	inode = filp->f_path.dentry->d_inode; +	inode = file_inode(filp);  	/* Don't allow mandatory locks on files that may be memory mapped  	 * and shared. @@ -1961,7 +1961,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,  	if (copy_from_user(&flock, l, sizeof(flock)))  		goto out; -	inode = filp->f_path.dentry->d_inode; +	inode = file_inode(filp);  	/* Don't allow mandatory locks on files that may be memory mapped  	 * and shared. @@ -2030,7 +2030,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)  	 * posix_lock_file().  Another process could be setting a lock on this  	 * file at the same time, but we wouldn't remove that lock anyway.  	 
*/ -	if (!filp->f_path.dentry->d_inode->i_flock) +	if (!file_inode(filp)->i_flock)  		return;  	lock.fl_type = F_UNLCK; @@ -2056,7 +2056,7 @@ EXPORT_SYMBOL(locks_remove_posix);   */  void locks_remove_flock(struct file *filp)  { -	struct inode * inode = filp->f_path.dentry->d_inode; +	struct inode * inode = file_inode(filp);  	struct file_lock *fl;  	struct file_lock **before; @@ -2152,7 +2152,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,  		fl_pid = fl->fl_pid;  	if (fl->fl_file != NULL) -		inode = fl->fl_file->f_path.dentry->d_inode; +		inode = file_inode(fl->fl_file);  	seq_printf(f, "%lld:%s ", id, pfx);  	if (IS_POSIX(fl)) { diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index daf9a9b32dd..09ed066c022 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@  config LOGFS -	tristate "LogFS file system (EXPERIMENTAL)" -	depends on (MTD || BLOCK) && EXPERIMENTAL +	tristate "LogFS file system" +	depends on (MTD || BLOCK)  	select ZLIB_INFLATE  	select ZLIB_DEFLATE  	select CRC32 diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 26e4a941532..b8275108211 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -284,7 +284,7 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)  #define IMPLICIT_NODES 2  static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)  { -	struct inode *dir = file->f_dentry->d_inode; +	struct inode *dir = file_inode(file);  	loff_t pos = file->f_pos - IMPLICIT_NODES;  	struct page *page;  	struct logfs_disk_dentry *dd; @@ -320,7 +320,7 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)  static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	ino_t pino = parent_ino(file->f_dentry);  	int err; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 3886cded283..c2219a6dd3c 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -183,7 +183,7 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)  long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct logfs_inode *li = logfs_inode(inode);  	unsigned int oldflags, flags;  	int err; diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index e1a3b6bf632..9a59cbade2f 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)  		logfs_put_wblocks(sb, NULL, 1);  	} -	if (!err) -		err = vmtruncate(inode, target); +	if (!err) { +		err = inode_newsize_ok(inode, target); +		if (err) +			goto out; + +		truncate_setsize(inode, target); +	} + out:  	/* I don't trust error recovery yet. 
*/  	WARN_ON(err);  	return err; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 685b2d981b8..a9ed6f36e6e 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -85,7 +85,7 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)  static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)  {  	unsigned long pos = filp->f_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	unsigned offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/minix/file.c b/fs/minix/file.c index 4493ce695ab..adc6f549423 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			return error; + +		truncate_setsize(inode, attr->ia_size); +		minix_truncate(inode);  	}  	setattr_copy(inode, attr); @@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)  }  const struct inode_operations minix_file_inode_operations = { -	.truncate	= minix_truncate,  	.setattr	= minix_setattr,  	.getattr	= minix_getattr,  }; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 4fc5f8ab1c4..99541cceb58 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)  	return __block_write_begin(page, pos, len, minix_get_block);  } +static void minix_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		minix_truncate(inode); +	} +} +  static int minix_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,  	ret = block_write_begin(mapping, pos, len, flags, pagep,  				minix_get_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		minix_write_failed(mapping, pos + len);  	return ret;  } diff --git a/fs/namei.c b/fs/namei.c index 5f4cdf3ad91..961bc126836 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -451,7 +451,7 @@ int inode_permission(struct inode *inode, int mask)   *   * Given a path increment the reference count to the dentry and the vfsmount.   */ -void path_get(struct path *path) +void path_get(const struct path *path)  {  	mntget(path->mnt);  	dget(path->dentry); @@ -464,7 +464,7 @@ EXPORT_SYMBOL(path_get);   *   * Given a path decrement the reference count to the dentry and the vfsmount.   
*/ -void path_put(struct path *path) +void path_put(const struct path *path)  {  	dput(path->dentry);  	mntput(path->mnt); @@ -600,14 +600,10 @@ static int complete_walk(struct nameidata *nd)  	if (likely(!(nd->flags & LOOKUP_JUMPED)))  		return 0; -	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) +	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))  		return 0; -	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) -		return 0; - -	/* Note: we do not d_invalidate() */ -	status = d_revalidate(dentry, nd->flags); +	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);  	if (status > 0)  		return 0; @@ -1275,9 +1271,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,  	*need_lookup = false;  	dentry = d_lookup(dir, name);  	if (dentry) { -		if (d_need_lookup(dentry)) { -			*need_lookup = true; -		} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { +		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {  			error = d_revalidate(dentry, flags);  			if (unlikely(error <= 0)) {  				if (error < 0) { @@ -1344,7 +1338,7 @@ static struct dentry *__lookup_hash(struct qstr *name,   *  small and for now I'd prefer to have fast path as straight as possible.   *  It _is_ time-critical.   */ -static int lookup_fast(struct nameidata *nd, struct qstr *name, +static int lookup_fast(struct nameidata *nd,  		       struct path *path, struct inode **inode)  {  	struct vfsmount *mnt = nd->path.mnt; @@ -1360,7 +1354,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,  	 */  	if (nd->flags & LOOKUP_RCU) {  		unsigned seq; -		dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); +		dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode);  		if (!dentry)  			goto unlazy; @@ -1383,8 +1377,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,  			return -ECHILD;  		nd->seq = seq; -		if (unlikely(d_need_lookup(dentry))) -			goto unlazy;  		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {  			status = d_revalidate(dentry, nd->flags);  			if (unlikely(status <= 0)) { @@ -1404,17 +1396,12 @@ unlazy:  		if (unlazy_walk(nd, dentry))  			return -ECHILD;  	} else { -		dentry = __d_lookup(parent, name); +		dentry = __d_lookup(parent, &nd->last);  	}  	if (unlikely(!dentry))  		goto need_lookup; -	if (unlikely(d_need_lookup(dentry))) { -		dput(dentry); -		goto need_lookup; -	} -  	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)  		status = d_revalidate(dentry, nd->flags);  	if (unlikely(status <= 0)) { @@ -1445,8 +1432,7 @@ need_lookup:  }  /* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct qstr *name, -		       struct path *path) +static int lookup_slow(struct nameidata *nd, struct path *path)  {  	struct dentry *dentry, *parent;  	int err; @@ -1455,7 +1441,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name,  	BUG_ON(nd->inode != parent->d_inode);  	mutex_lock(&parent->d_inode->i_mutex); -	dentry = __lookup_hash(name, parent, nd->flags); +	dentry = __lookup_hash(&nd->last, parent, nd->flags);  	mutex_unlock(&parent->d_inode->i_mutex);  	if (IS_ERR(dentry))  		return PTR_ERR(dentry); @@ -1528,7 +1514,7 @@ static inline int should_follow_link(struct inode *inode, int follow)  }  static inline int walk_component(struct nameidata *nd, struct path *path, -		struct qstr *name, int type, int follow) +		int follow)  {  	struct inode *inode;  	int err; @@ -1537,14 +1523,14 @@ static inline int walk_component(struct nameidata *nd, struct path *path,  	 * 
to be able to know about the current root directory and  	 * parent relationships.  	 */ -	if (unlikely(type != LAST_NORM)) -		return handle_dots(nd, type); -	err = lookup_fast(nd, name, path, &inode); +	if (unlikely(nd->last_type != LAST_NORM)) +		return handle_dots(nd, nd->last_type); +	err = lookup_fast(nd, path, &inode);  	if (unlikely(err)) {  		if (err < 0)  			goto out_err; -		err = lookup_slow(nd, name, path); +		err = lookup_slow(nd, path);  		if (err < 0)  			goto out_err; @@ -1603,8 +1589,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)  		res = follow_link(&link, nd, &cookie);  		if (res)  			break; -		res = walk_component(nd, path, &nd->last, -				     nd->last_type, LOOKUP_FOLLOW); +		res = walk_component(nd, path, LOOKUP_FOLLOW);  		put_link(nd, &link, cookie);  	} while (res > 0); @@ -1811,8 +1796,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)  			}  		} +		nd->last = this; +		nd->last_type = type; +  		if (!name[len]) -			goto last_component; +			return 0;  		/*  		 * If it wasn't NUL, we know it was '/'. Skip that  		 * slash, and continue until no more slashes. @@ -1821,10 +1809,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)  			len++;  		} while (unlikely(name[len] == '/'));  		if (!name[len]) -			goto last_component; +			return 0; +  		name += len; -		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); +		err = walk_component(nd, &next, LOOKUP_FOLLOW);  		if (err < 0)  			return err; @@ -1833,16 +1822,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)  			if (err)  				return err;  		} -		if (can_lookup(nd->inode)) -			continue; -		err = -ENOTDIR;  -		break; -		/* here ends the main loop */ - -last_component: -		nd->last = this; -		nd->last_type = type; -		return 0; +		if (!can_lookup(nd->inode)) { +			err = -ENOTDIR;  +			break; +		}  	}  	terminate_walk(nd);  	return err; @@ -1859,7 +1842,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,  	if (flags & LOOKUP_ROOT) {  		struct inode *inode = nd->root.dentry->d_inode;  		if (*name) { -			if (!inode->i_op->lookup) +			if (!can_lookup(inode))  				return -ENOTDIR;  			retval = inode_permission(inode, MAY_EXEC);  			if (retval) @@ -1903,6 +1886,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,  			get_fs_pwd(current->fs, &nd->path);  		}  	} else { +		/* Caller must check execute permissions on the starting path component */  		struct fd f = fdget_raw(dfd);  		struct dentry *dentry; @@ -1912,16 +1896,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,  		dentry = f.file->f_path.dentry;  		if (*name) { -			if (!S_ISDIR(dentry->d_inode->i_mode)) { +			if (!can_lookup(dentry->d_inode)) {  				fdput(f);  				return -ENOTDIR;  			} - -			retval = inode_permission(dentry->d_inode, MAY_EXEC); -			if (retval) { -				fdput(f); -				return retval; -			}  		}  		nd->path = f.file->f_path; @@ -1946,8 +1924,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)  		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;  	nd->flags &= ~LOOKUP_PARENT; -	return walk_component(nd, path, &nd->last, nd->last_type, -					nd->flags & LOOKUP_FOLLOW); +	return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);  }  /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -2189,15 +2166,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,   *     path-walking is complete.   
*/  static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd) +user_path_parent(int dfd, const char __user *path, struct nameidata *nd, +		 unsigned int flags)  {  	struct filename *s = getname(path);  	int error; +	/* only LOOKUP_REVAL is allowed in extra flags */ +	flags &= LOOKUP_REVAL; +  	if (IS_ERR(s))  		return s; -	error = filename_lookup(dfd, s, LOOKUP_PARENT, nd); +	error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);  	if (error) {  		putname(s);  		return ERR_PTR(error); @@ -2742,7 +2723,7 @@ static int do_last(struct nameidata *nd, struct path *path,  		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))  			symlink_ok = true;  		/* we _can_ be in RCU mode here */ -		error = lookup_fast(nd, &nd->last, path, &inode); +		error = lookup_fast(nd, path, &inode);  		if (likely(!error))  			goto finish_lookup; @@ -2788,7 +2769,7 @@ retry_lookup:  			goto out;  		if ((*opened & FILE_CREATED) || -		    !S_ISREG(file->f_path.dentry->d_inode->i_mode)) +		    !S_ISREG(file_inode(file)->i_mode))  			will_truncate = false;  		audit_inode(name, file->f_path.dentry, 0); @@ -2951,8 +2932,8 @@ static struct file *path_openat(int dfd, struct filename *pathname,  	int error;  	file = get_empty_filp(); -	if (!file) -		return ERR_PTR(-ENFILE); +	if (IS_ERR(file)) +		return file;  	file->f_flags = op->open_flag; @@ -3044,12 +3025,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,  	return file;  } -struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) +struct dentry *kern_path_create(int dfd, const char *pathname, +				struct path *path, unsigned int lookup_flags)  {  	struct dentry *dentry = ERR_PTR(-EEXIST);  	struct nameidata nd;  	int err2; -	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); +	int error; +	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); + +	/* +	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any +	 * other flags passed in are ignored! 
+	 */ +	lookup_flags &= LOOKUP_REVAL; + +	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);  	if (error)  		return ERR_PTR(error); @@ -3113,13 +3104,14 @@ void done_path_create(struct path *path, struct dentry *dentry)  }  EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) +struct dentry *user_path_create(int dfd, const char __user *pathname, +				struct path *path, unsigned int lookup_flags)  {  	struct filename *tmp = getname(pathname);  	struct dentry *res;  	if (IS_ERR(tmp))  		return ERR_CAST(tmp); -	res = kern_path_create(dfd, tmp->name, path, is_dir); +	res = kern_path_create(dfd, tmp->name, path, lookup_flags);  	putname(tmp);  	return res;  } @@ -3175,12 +3167,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,  	struct dentry *dentry;  	struct path path;  	int error; +	unsigned int lookup_flags = 0;  	error = may_mknod(mode);  	if (error)  		return error; - -	dentry = user_path_create(dfd, filename, &path, 0); +retry: +	dentry = user_path_create(dfd, filename, &path, lookup_flags);  	if (IS_ERR(dentry))  		return PTR_ERR(dentry); @@ -3203,6 +3196,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,  	}  out:  	done_path_create(&path, dentry); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -3241,8 +3238,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)  	struct dentry *dentry;  	struct path path;  	int error; +	unsigned int lookup_flags = LOOKUP_DIRECTORY; -	dentry = user_path_create(dfd, pathname, &path, 1); +retry: +	dentry = user_path_create(dfd, pathname, &path, lookup_flags);  	if (IS_ERR(dentry))  		return PTR_ERR(dentry); @@ -3252,6 +3251,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)  	if (!error)  		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);  	done_path_create(&path, dentry); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -3327,8 +3330,9 @@ static long do_rmdir(int dfd, const char __user *pathname)  	struct filename *name;  	struct dentry *dentry;  	struct nameidata nd; - -	name = user_path_parent(dfd, pathname, &nd); +	unsigned int lookup_flags = 0; +retry: +	name = user_path_parent(dfd, pathname, &nd, lookup_flags);  	if (IS_ERR(name))  		return PTR_ERR(name); @@ -3370,6 +3374,10 @@ exit2:  exit1:  	path_put(&nd.path);  	putname(name); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -3423,8 +3431,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)  	struct dentry *dentry;  	struct nameidata nd;  	struct inode *inode = NULL; - -	name = user_path_parent(dfd, pathname, &nd); +	unsigned int lookup_flags = 0; +retry: +	name = user_path_parent(dfd, pathname, &nd, lookup_flags);  	if (IS_ERR(name))  		return PTR_ERR(name); @@ -3462,6 +3471,11 @@ exit2:  exit1:  	path_put(&nd.path);  	putname(name); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		inode = NULL; +		goto retry; +	}  	return error;  slashes: @@ -3513,12 +3527,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,  	struct filename *from;  	struct dentry *dentry;  	struct path path; +	unsigned int lookup_flags = 0;  	from = getname(oldname);  	if (IS_ERR(from))  		return PTR_ERR(from); - -	dentry = 
user_path_create(newdfd, newname, &path, 0); +retry: +	dentry = user_path_create(newdfd, newname, &path, lookup_flags);  	error = PTR_ERR(dentry);  	if (IS_ERR(dentry))  		goto out_putname; @@ -3527,6 +3542,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,  	if (!error)  		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);  	done_path_create(&path, dentry); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out_putname:  	putname(from);  	return error; @@ -3613,12 +3632,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,  	if (flags & AT_SYMLINK_FOLLOW)  		how |= LOOKUP_FOLLOW; - +retry:  	error = user_path_at(olddfd, oldname, how, &old_path);  	if (error)  		return error; -	new_dentry = user_path_create(newdfd, newname, &new_path, 0); +	new_dentry = user_path_create(newdfd, newname, &new_path, +					(how & LOOKUP_REVAL));  	error = PTR_ERR(new_dentry);  	if (IS_ERR(new_dentry))  		goto out; @@ -3635,6 +3655,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,  	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);  out_dput:  	done_path_create(&new_path, new_dentry); +	if (retry_estale(error, how)) { +		how |= LOOKUP_REVAL; +		goto retry; +	}  out:  	path_put(&old_path); @@ -3807,15 +3831,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,  	struct nameidata oldnd, newnd;  	struct filename *from;  	struct filename *to; +	unsigned int lookup_flags = 0; +	bool should_retry = false;  	int error; - -	from = user_path_parent(olddfd, oldname, &oldnd); +retry: +	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);  	if (IS_ERR(from)) {  		error = PTR_ERR(from);  		goto exit;  	} -	to = user_path_parent(newdfd, newname, &newnd); +	to = user_path_parent(newdfd, newname, &newnd, lookup_flags);  	if (IS_ERR(to)) {  		error = PTR_ERR(to);  		goto exit1; @@ -3887,11 +3913,18 @@ exit3:  	unlock_rename(new_dir, old_dir);  	mnt_drop_write(oldnd.path.mnt);  exit2: +	if (retry_estale(error, lookup_flags)) +		should_retry = true;  	path_put(&newnd.path);  	putname(to);  exit1:  	path_put(&oldnd.path);  	putname(from); +	if (should_retry) { +		should_retry = false; +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  exit:  	return error;  } diff --git a/fs/namespace.c b/fs/namespace.c index 398a50ff243..50ca17d3cb4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)  	 * incremented count after it has set MNT_WRITE_HOLD.  	 */  	smp_mb(); -	while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) +	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)  		cpu_relax();  	/*  	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -384,7 +384,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);   */  int __mnt_want_write_file(struct file *file)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))  		return __mnt_want_write(file->f_path.mnt); @@ -1237,6 +1237,14 @@ static int do_umount(struct mount *mnt, int flags)  	return retval;  } +/*  + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ +	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} +  /*   * Now umount can handle mount points as well as block devices.   * This is important for filesystems which use unnamed block devices. 
@@ -1255,6 +1263,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)  	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))  		return -EINVAL; +	if (!may_mount()) +		return -EPERM; +  	if (!(flags & UMOUNT_NOFOLLOW))  		lookup_flags |= LOOKUP_FOLLOW; @@ -1268,10 +1279,6 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)  	if (!check_mnt(mnt))  		goto dput_and_out; -	retval = -EPERM; -	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) -		goto dput_and_out; -  	retval = do_umount(mnt, flags);  dput_and_out:  	/* we mustn't call path_put() as that would clear mnt_expiry_mark */ @@ -1293,24 +1300,6 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)  #endif -static int mount_is_safe(struct path *path) -{ -	if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) -		return 0; -	return -EPERM; -#ifdef notyet -	if (S_ISLNK(path->dentry->d_inode->i_mode)) -		return -EPERM; -	if (path->dentry->d_inode->i_mode & S_ISVTX) { -		if (current_uid() != path->dentry->d_inode->i_uid) -			return -EPERM; -	} -	if (inode_permission(path->dentry->d_inode, MAY_WRITE)) -		return -EPERM; -	return 0; -#endif -} -  static bool mnt_ns_loop(struct path *path)  {  	/* Could bind mounting the mount namespace inode cause a @@ -1633,9 +1622,6 @@ static int do_change_type(struct path *path, int flag)  	int type;  	int err = 0; -	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) -		return -EPERM; -  	if (path->dentry != path->mnt->mnt_root)  		return -EINVAL; @@ -1669,9 +1655,7 @@ static int do_loopback(struct path *path, const char *old_name,  	LIST_HEAD(umount_list);  	struct path old_path;  	struct mount *mnt = NULL, *old; -	int err = mount_is_safe(path); -	if (err) -		return err; +	int err;  	if (!old_name || !*old_name)  		return -EINVAL;  	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); @@ -1748,9 +1732,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,  	struct super_block *sb = path->mnt->mnt_sb;  	struct mount *mnt = real_mount(path->mnt); -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -  	if (!check_mnt(mnt))  		return -EINVAL; @@ -1764,6 +1745,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags,  	down_write(&sb->s_umount);  	if (flags & MS_BIND)  		err = change_mount_flags(path->mnt, flags); +	else if (!capable(CAP_SYS_ADMIN)) +		err = -EPERM;  	else  		err = do_remount_sb(sb, flags, data, 0);  	if (!err) { @@ -1796,9 +1779,7 @@ static int do_move_mount(struct path *path, const char *old_name)  	struct path old_path, parent_path;  	struct mount *p;  	struct mount *old; -	int err = 0; -	if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) -		return -EPERM; +	int err;  	if (!old_name || !*old_name)  		return -EINVAL;  	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); @@ -1933,18 +1914,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,  			int mnt_flags, const char *name, void *data)  {  	struct file_system_type *type; -	struct user_namespace *user_ns; +	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;  	struct vfsmount *mnt;  	int err;  	if (!fstype)  		return -EINVAL; -	/* we need capabilities... 
*/ -	user_ns = real_mount(path->mnt)->mnt_ns->user_ns; -	if (!ns_capable(user_ns, CAP_SYS_ADMIN)) -		return -EPERM; -  	type = get_fs_type(fstype);  	if (!type)  		return -ENODEV; @@ -2258,6 +2234,9 @@ long do_mount(const char *dev_name, const char *dir_name,  	if (retval)  		goto dput_out; +	if (!may_mount()) +		return -EPERM; +  	/* Default to relatime unless overriden */  	if (!(flags & MS_NOATIME))  		mnt_flags |= MNT_RELATIME; @@ -2567,7 +2546,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,  	struct mount *new_mnt, *root_mnt;  	int error; -	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) +	if (!may_mount())  		return -EPERM;  	error = user_path_dir(new_root, &new); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 4117e7b377b..81632609365 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -593,14 +593,10 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,  		return 1; /* I'm not sure */  	qname.name = __name; -	qname.hash = full_name_hash(qname.name, qname.len); - -	if (dentry->d_op && dentry->d_op->d_hash) -		if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0) -			goto end_advance; - -	newdent = d_lookup(dentry, &qname); +	newdent = d_hash_and_lookup(dentry, &qname); +	if (unlikely(IS_ERR(newdent))) +		goto end_advance;  	if (!newdent) {  		newdent = d_alloc(dentry, &qname);  		if (!newdent) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d7e9fe77188..7dafd6899a6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -331,12 +331,15 @@ static int  ncp_show_options(struct seq_file *seq, struct dentry *root)  	struct ncp_server *server = NCP_SBP(root->d_sb);  	unsigned int tmp; -	if (server->m.uid != 0) -		seq_printf(seq, ",uid=%u", server->m.uid); -	if (server->m.gid != 0) -		seq_printf(seq, ",gid=%u", server->m.gid); -	if (server->m.mounted_uid != 0) -		seq_printf(seq, ",owner=%u", server->m.mounted_uid); +	if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID)) +		seq_printf(seq, ",uid=%u", +			   from_kuid_munged(&init_user_ns, server->m.uid)); +	if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID)) +		seq_printf(seq, ",gid=%u", +			   from_kgid_munged(&init_user_ns, server->m.gid)); +	if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID)) +		seq_printf(seq, ",owner=%u", +			   from_kuid_munged(&init_user_ns, server->m.mounted_uid));  	tmp = server->m.file_mode & S_IALLUGO;  	if (tmp != NCP_DEFAULT_FILE_MODE)  		seq_printf(seq, ",mode=0%o", tmp); @@ -381,13 +384,13 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)  	data->flags = 0;  	data->int_flags = 0; -	data->mounted_uid = 0; +	data->mounted_uid = GLOBAL_ROOT_UID;  	data->wdog_pid = NULL;  	data->ncp_fd = ~0;  	data->time_out = NCP_DEFAULT_TIME_OUT;  	data->retry_count = NCP_DEFAULT_RETRY_COUNT; -	data->uid = 0; -	data->gid = 0; +	data->uid = GLOBAL_ROOT_UID; +	data->gid = GLOBAL_ROOT_GID;  	data->file_mode = NCP_DEFAULT_FILE_MODE;  	data->dir_mode = NCP_DEFAULT_DIR_MODE;  	data->info_fd = -1; @@ -399,13 +402,19 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)  			goto err;  		switch (optval) {  			case 'u': -				data->uid = optint; +				data->uid = make_kuid(current_user_ns(), optint); +				if (!uid_valid(data->uid)) +					goto err;  				break;  			case 'g': -				data->gid = optint; +				data->gid = make_kgid(current_user_ns(), optint); +				if (!gid_valid(data->gid)) +					goto err;  				break;  			case 'o': -				data->mounted_uid = optint; +				data->mounted_uid = make_kuid(current_user_ns(), optint); +				
if (!uid_valid(data->mounted_uid)) +					goto err;  				break;  			case 'm':  				data->file_mode = optint; @@ -480,13 +489,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)  				data.flags = md->flags;  				data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; -				data.mounted_uid = md->mounted_uid; +				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);  				data.wdog_pid = find_get_pid(md->wdog_pid);  				data.ncp_fd = md->ncp_fd;  				data.time_out = md->time_out;  				data.retry_count = md->retry_count; -				data.uid = md->uid; -				data.gid = md->gid; +				data.uid = make_kuid(current_user_ns(), md->uid); +				data.gid = make_kgid(current_user_ns(), md->gid);  				data.file_mode = md->file_mode;  				data.dir_mode = md->dir_mode;  				data.info_fd = -1; @@ -499,13 +508,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)  				struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;  				data.flags = md->flags; -				data.mounted_uid = md->mounted_uid; +				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);  				data.wdog_pid = find_get_pid(md->wdog_pid);  				data.ncp_fd = md->ncp_fd;  				data.time_out = md->time_out;  				data.retry_count = md->retry_count; -				data.uid = md->uid; -				data.gid = md->gid; +				data.uid = make_kuid(current_user_ns(), md->uid); +				data.gid = make_kgid(current_user_ns(), md->gid);  				data.file_mode = md->file_mode;  				data.dir_mode = md->dir_mode;  				data.info_fd = -1; @@ -520,12 +529,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)  				goto out;  			break;  	} +	error = -EINVAL; +	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || +	    !gid_valid(data.gid)) +		goto out;  	error = -EBADF;  	ncp_filp = fget(data.ncp_fd);  	if (!ncp_filp)  		goto out;  	error = -ENOTSOCK; -	sock_inode = ncp_filp->f_path.dentry->d_inode; +	sock_inode = file_inode(ncp_filp);  	if (!S_ISSOCK(sock_inode->i_mode))  		goto out_fput;  	sock = SOCKET_I(sock_inode); @@ -564,7 +577,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)  		if (!server->info_filp)  			goto out_bdi;  		error = -ENOTSOCK; -		sock_inode = server->info_filp->f_path.dentry->d_inode; +		sock_inode = file_inode(server->info_filp);  		if (!S_ISSOCK(sock_inode->i_mode))  			goto out_fput2;  		info_sock = SOCKET_I(sock_inode); @@ -886,12 +899,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)  		goto out;  	result = -EPERM; -	if (((attr->ia_valid & ATTR_UID) && -	     (attr->ia_uid != server->m.uid))) +	if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid))  		goto out; -	if (((attr->ia_valid & ATTR_GID) && -	     (attr->ia_gid != server->m.gid))) +	if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid))  		goto out;  	if (((attr->ia_valid & ATTR_MODE) && @@ -976,9 +987,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)  			goto out;  		if (attr->ia_size != i_size_read(inode)) { -			result = vmtruncate(inode, attr->ia_size); -			if (result) -				goto out; +			truncate_setsize(inode, attr->ia_size);  			mark_inode_dirty(inode);  		}  	} diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6958adfaff0..60426ccb3b6 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -45,7 +45,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,  		return -EINVAL;  	}  	/* TODO: info.addr = server->m.serv_addr; */ -	SET_UID(info.mounted_uid, 
server->m.mounted_uid); +	SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));  	info.connection		= server->connection;  	info.buffer_size	= server->buffer_size;  	info.volume_number	= NCP_FINFO(inode)->volNumber; @@ -69,7 +69,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,  		DPRINTK("info.version invalid: %d\n", info2.version);  		return -EINVAL;  	} -	info2.mounted_uid   = server->m.mounted_uid; +	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);  	info2.connection    = server->connection;  	info2.buffer_size   = server->buffer_size;  	info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -135,7 +135,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,  		DPRINTK("info.version invalid: %d\n", info2.version);  		return -EINVAL;  	} -	info2.mounted_uid   = server->m.mounted_uid; +	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);  	info2.connection    = server->connection;  	info2.buffer_size   = server->buffer_size;  	info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -348,22 +348,25 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg  		{  			u16 uid; -			SET_UID(uid, server->m.mounted_uid); +			SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));  			if (put_user(uid, (u16 __user *)argp))  				return -EFAULT;  			return 0;  		}  	case NCP_IOC_GETMOUNTUID32: -		if (put_user(server->m.mounted_uid, -			     (u32 __user *)argp)) +	{ +		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); +		if (put_user(uid, (u32 __user *)argp))  			return -EFAULT;  		return 0; +	}  	case NCP_IOC_GETMOUNTUID64: -		if (put_user(server->m.mounted_uid, -			     (u64 __user *)argp)) +	{ +		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); +		if (put_user(uid, (u64 __user *)argp))  			return -EFAULT;  		return 0; - +	}  	case NCP_IOC_GETROOT:  		{  			struct ncp_setroot_ioctl sr; @@ -808,9 +811,9 @@ outrel:  long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ncp_server *server = NCP_SERVER(inode); -	uid_t uid = current_uid(); +	kuid_t uid = current_uid();  	int need_drop_write = 0;  	long ret; @@ -819,12 +822,12 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  	case NCP_IOC_CONN_LOGGED_IN:  	case NCP_IOC_SETROOT:  		if (!capable(CAP_SYS_ADMIN)) { -			ret = -EACCES; +			ret = -EPERM;  			goto out;  		}  		break;  	} -	if (server->m.mounted_uid != uid) { +	if (!uid_eq(server->m.mounted_uid, uid)) {  		switch (cmd) {  		/*  		 * Only mount owner can issue these ioctls.  
Information diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 63d14a99483..ee24df5af1f 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -105,7 +105,7 @@ static const struct vm_operations_struct ncp_file_mmap =  /* This is used for a general mmap of a ncp file */  int ncp_mmap(struct file *file, struct vm_area_struct *vma)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	DPRINTK("ncp_mmap: called\n"); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 54cc0cdb3dc..c51b2c54353 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -23,15 +23,15 @@ struct ncp_mount_data_kernel {  	unsigned long    flags;		/* NCP_MOUNT_* flags */  	unsigned int	 int_flags;	/* internal flags */  #define NCP_IMOUNT_LOGGEDIN_POSSIBLE	0x0001 -	uid_t		 mounted_uid;	/* Who may umount() this filesystem? */ +	kuid_t		 mounted_uid;	/* Who may umount() this filesystem? */  	struct pid      *wdog_pid;	/* Who cares for our watchdog packets? */  	unsigned int     ncp_fd;	/* The socket to the ncp port */  	unsigned int     time_out;	/* How long should I wait after  					   sending a NCP request? */  	unsigned int     retry_count;	/* And how often should I retry? */  	unsigned char	 mounted_vol[NCP_VOLNAME_LEN + 1]; -	uid_t		 uid; -	gid_t		 gid; +	kuid_t		 uid; +	kgid_t		 gid;  	umode_t		 file_mode;  	umode_t		 dir_mode;  	int		 info_fd; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c93f4..434b93ec097 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = {  static struct pnfs_layoutdriver_type blocklayout_type = {  	.id				= LAYOUT_BLOCK_VOLUME,  	.name				= "LAYOUT_BLOCK_VOLUME", +	.owner				= THIS_MODULE,  	.read_pagelist			= bl_read_pagelist,  	.write_pagelist			= bl_write_pagelist,  	.alloc_layout_hdr		= bl_alloc_layout_hdr, diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 862a2f16db6..5f7b053720e 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd)  	struct super_block *pipefs_sb;  	int ret = 0; +	sunrpc_init_cache_detail(cd);  	pipefs_sb = rpc_get_sb_net(net);  	if (pipefs_sb) {  		ret = nfs_cache_register_sb(pipefs_sb, cd);  		rpc_put_sb_net(net); +		if (ret) +			sunrpc_destroy_cache_detail(cd);  	}  	return ret;  } @@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)  		nfs_cache_unregister_sb(pipefs_sb, cd);  		rpc_put_sb_net(net);  	} -} - -void nfs_cache_init(struct cache_detail *cd) -{ -	sunrpc_init_cache_detail(cd); -} - -void nfs_cache_destroy(struct cache_detail *cd) -{  	sunrpc_destroy_cache_detail(cd);  } diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 317db95e37f..4116d2c3f52 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);  extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);  extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern void nfs_cache_init(struct cache_detail *cd); -extern void nfs_cache_destroy(struct cache_detail *cd);  extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);  extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);  extern int nfs_cache_register_sb(struct super_block *sb, diff --git a/fs/nfs/callback_proc.c 
b/fs/nfs/callback_proc.c index c89b26bc975..2960512792c 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,  static u32 initiate_bulk_draining(struct nfs_client *clp,  				  struct cb_layoutrecallargs *args)  { -	struct nfs_server *server; -	struct pnfs_layout_hdr *lo; -	struct inode *ino; -	u32 rv = NFS4ERR_NOMATCHING_LAYOUT; -	struct pnfs_layout_hdr *tmp; -	LIST_HEAD(recall_list); -	LIST_HEAD(free_me_list); -	struct pnfs_layout_range range = { -		.iomode = IOMODE_ANY, -		.offset = 0, -		.length = NFS4_MAX_UINT64, -	}; - -	spin_lock(&clp->cl_lock); -	rcu_read_lock(); -	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { -		if ((args->cbl_recall_type == RETURN_FSID) && -		    memcmp(&server->fsid, &args->cbl_fsid, -			   sizeof(struct nfs_fsid))) -			continue; - -		list_for_each_entry(lo, &server->layouts, plh_layouts) { -			ino = igrab(lo->plh_inode); -			if (!ino) -				continue; -			spin_lock(&ino->i_lock); -			/* Is this layout in the process of being freed? */ -			if (NFS_I(ino)->layout != lo) { -				spin_unlock(&ino->i_lock); -				iput(ino); -				continue; -			} -			pnfs_get_layout_hdr(lo); -			spin_unlock(&ino->i_lock); -			list_add(&lo->plh_bulk_recall, &recall_list); -		} -	} -	rcu_read_unlock(); -	spin_unlock(&clp->cl_lock); +	int stat; -	list_for_each_entry_safe(lo, tmp, -				 &recall_list, plh_bulk_recall) { -		ino = lo->plh_inode; -		spin_lock(&ino->i_lock); -		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); -		if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) -			rv = NFS4ERR_DELAY; -		list_del_init(&lo->plh_bulk_recall); -		spin_unlock(&ino->i_lock); -		pnfs_free_lseg_list(&free_me_list); -		pnfs_put_layout_hdr(lo); -		iput(ino); -	} -	return rv; +	if (args->cbl_recall_type == RETURN_FSID) +		stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); +	else +		stat = pnfs_destroy_layouts_byclid(clp, true); +	if (stat != 0) +		return NFS4ERR_DELAY; +	return NFS4ERR_NOMATCHING_LAYOUT;  }  static u32 do_callback_layoutrecall(struct nfs_client *clp, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9f3c66438d0..84d8eae203a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -197,7 +197,6 @@ error_0:  EXPORT_SYMBOL_GPL(nfs_alloc_client);  #if IS_ENABLED(CONFIG_NFS_V4) -/* idr_remove_all is not needed as all id's are removed by nfs_put_client */  void nfs_cleanup_cb_ident_idr(struct net *net)  {  	struct nfs_net *nn = net_generic(net, nfs_net_id); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec3cf3..6390a4b5fee 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -55,7 +56,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)  	flags &= FMODE_READ|FMODE_WRITE;  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(inode)->delegation); -	if (delegation != NULL && (delegation->type & flags) == flags) { +	if (delegation != NULL && (delegation->type & flags) == flags && +	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {  		nfs_mark_delegation_referenced(delegation);  		ret = 1;  	} @@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_  	int status = 0;  	if (inode->i_flock == NULL) -		goto out; +		return 0; +  	/* Protect inode->i_flock using the file locks lock */  	lock_flocks();  	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode 
*inode, const nfs4_stateid *s  {  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_open_context *ctx; +	struct nfs4_state_owner *sp;  	struct nfs4_state *state; +	unsigned int seq;  	int err;  again: @@ -109,9 +114,16 @@ again:  			continue;  		get_nfs_open_context(ctx);  		spin_unlock(&inode->i_lock); +		sp = state->owner; +		/* Block nfs4_proc_unlck */ +		mutex_lock(&sp->so_delegreturn_mutex); +		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);  		err = nfs4_open_delegation_recall(ctx, state, stateid); -		if (err >= 0) +		if (!err)  			err = nfs_delegation_claim_locks(ctx, state); +		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) +			err = -EAGAIN; +		mutex_unlock(&sp->so_delegreturn_mutex);  		put_nfs_open_context(ctx);  		if (err != 0)  			return err; @@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation  }  static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ +	struct nfs_delegation *ret = NULL; +	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + +	if (delegation == NULL) +		goto out; +	spin_lock(&delegation->lock); +	if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) +		ret = delegation; +	spin_unlock(&delegation->lock); +out: +	return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ +	struct nfs_delegation *delegation; + +	rcu_read_lock(); +	delegation = nfs_start_delegation_return_locked(nfsi); +	rcu_read_unlock(); +	return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, +		struct nfs_client *clp) +{ + +	spin_lock(&delegation->lock); +	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); +	set_bit(NFS_DELEGATION_RETURN, &delegation->flags); +	spin_unlock(&delegation->lock); +	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation *  nfs_detach_delegation_locked(struct nfs_inode *nfsi, -			     struct nfs_server *server) +		struct nfs_delegation *delegation, +		struct nfs_client *clp)  { -	struct nfs_delegation *delegation = +	struct nfs_delegation *deleg_cur =  		rcu_dereference_protected(nfsi->delegation, -				lockdep_is_held(&server->nfs_client->cl_lock)); +				lockdep_is_held(&clp->cl_lock)); -	if (delegation == NULL) -		goto nomatch; +	if (deleg_cur == NULL || delegation != deleg_cur) +		return NULL;  	spin_lock(&delegation->lock); +	set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);  	list_del_rcu(&delegation->super_list);  	delegation->inode = NULL;  	nfsi->delegation_state = 0;  	rcu_assign_pointer(nfsi->delegation, NULL);  	spin_unlock(&delegation->lock);  	return delegation; -nomatch: -	return NULL;  }  static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, -						    struct nfs_server *server) +		struct nfs_delegation *delegation, +		struct nfs_server *server)  {  	struct nfs_client *clp = server->nfs_client; -	struct nfs_delegation *delegation;  	spin_lock(&clp->cl_lock); -	delegation = nfs_detach_delegation_locked(nfsi, server); +	delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);  	spin_unlock(&clp->cl_lock);  	return delegation;  } +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ +	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_delegation *delegation; + +	delegation = nfs_start_delegation_return(nfsi); +	if (delegation == NULL) +		return NULL; +	return 
nfs_detach_delegation(nfsi, delegation, server); +} +  /**   * nfs_inode_set_delegation - set up a delegation on an inode   * @inode: inode to which delegation applies @@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  			delegation = NULL;  			goto out;  		} -		freeme = nfs_detach_delegation_locked(nfsi, server); +		freeme = nfs_detach_delegation_locked(nfsi,  +				old_delegation, clp); +		if (freeme == NULL) +			goto out;  	}  	list_add_rcu(&delegation->super_list, &server->delegations);  	nfsi->delegation_state = delegation->type; @@ -292,19 +359,29 @@ out:  /*   * Basic procedure for returning a delegation to the server   */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)  { +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;  	struct nfs_inode *nfsi = NFS_I(inode);  	int err; -	/* -	 * Guard against new delegated open/lock/unlock calls and against -	 * state recovery -	 */ -	down_write(&nfsi->rwsem); -	err = nfs_delegation_claim_opens(inode, &delegation->stateid); -	up_write(&nfsi->rwsem); -	if (err) +	if (delegation == NULL) +		return 0; +	do { +		err = nfs_delegation_claim_opens(inode, &delegation->stateid); +		if (!issync || err != -EAGAIN) +			break; +		/* +		 * Guard against state recovery +		 */ +		err = nfs4_wait_clnt_recover(clp); +	} while (err == 0); + +	if (err) { +		nfs_abort_delegation_return(delegation, clp); +		goto out; +	} +	if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode)))  		goto out;  	err = nfs_do_return_delegation(inode, delegation, issync); @@ -340,13 +417,10 @@ restart:  			inode = nfs_delegation_grab_inode(delegation);  			if (inode == NULL)  				continue; -			delegation = nfs_detach_delegation(NFS_I(inode), -								server); +			delegation = nfs_start_delegation_return_locked(NFS_I(inode));  			rcu_read_unlock(); -			if (delegation != NULL) -				err = __nfs_inode_return_delegation(inode, -								delegation, 0); +			err = nfs_end_delegation_return(inode, delegation, 0);  			iput(inode);  			if (!err)  				goto restart; @@ -367,15 +441,11 @@ restart:   */  void nfs_inode_return_delegation_noreclaim(struct inode *inode)  { -	struct nfs_server *server = NFS_SERVER(inode); -	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation; -	if (rcu_access_pointer(nfsi->delegation) != NULL) { -		delegation = nfs_detach_delegation(nfsi, server); -		if (delegation != NULL) -			nfs_do_return_delegation(inode, delegation, 0); -	} +	delegation = nfs_inode_detach_delegation(inode); +	if (delegation != NULL) +		nfs_do_return_delegation(inode, delegation, 0);  }  /** @@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)   */  int nfs4_inode_return_delegation(struct inode *inode)  { -	struct nfs_server *server = NFS_SERVER(inode);  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation;  	int err = 0;  	nfs_wb_all(inode); -	if (rcu_access_pointer(nfsi->delegation) != NULL) { -		delegation = nfs_detach_delegation(nfsi, server); -		if (delegation != NULL) { -			err = __nfs_inode_return_delegation(inode, delegation, 1); -		} -	} +	delegation = nfs_start_delegation_return(nfsi); +	if (delegation != NULL) +		err = nfs_end_delegation_return(inode, delegation, 1);  	return err;  } @@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode)  {  	struct nfs_delegation 
*delegation; -	delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); +	delegation = nfs_inode_detach_delegation(inode);  	if (delegation) {  		nfs_inode_find_state_and_recover(inode, &delegation->stateid);  		nfs_free_delegation(delegation); @@ -649,7 +715,7 @@ restart:  			if (inode == NULL)  				continue;  			delegation = nfs_detach_delegation(NFS_I(inode), -								server); +					delegation, server);  			rcu_read_unlock();  			if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4dba0d..d54d4fca679 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,6 +29,7 @@ enum {  	NFS_DELEGATION_NEED_RECLAIM = 0,  	NFS_DELEGATION_RETURN,  	NFS_DELEGATION_REFERENCED, +	NFS_DELEGATION_RETURNING,  };  int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32e6c53520e..f23f455be42 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -281,7 +281,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des  	for (i = 0; i < array->size; i++) {  		if (array->array[i].cookie == *desc->dir_cookie) { -			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); +			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));  			struct nfs_open_dir_context *ctx = desc->file->private_data;  			new_pos = desc->current_index + i; @@ -629,7 +629,7 @@ out:  static  int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)  { -	struct inode	*inode = desc->file->f_path.dentry->d_inode; +	struct inode	*inode = file_inode(desc->file);  	int ret;  	ret = nfs_readdir_xdr_to_array(desc, page, inode); @@ -660,7 +660,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)  static  struct page *get_cache_page(nfs_readdir_descriptor_t *desc)  { -	return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, +	return read_cache_page(file_inode(desc->file)->i_mapping,  			desc->page_index, (filler_t *)nfs_readdir_filler, desc);  } @@ -764,7 +764,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,  {  	struct page	*page = NULL;  	int		status; -	struct inode *inode = desc->file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(desc->file);  	struct nfs_open_dir_context *ctx = desc->file->private_data;  	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", @@ -1136,6 +1136,45 @@ out_error:  }  /* + * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * when we don't really care about the dentry name. This is called when a + * pathwalk ends on a dentry that was not found via a normal lookup in the + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). + * + * In this situation, we just want to verify that the inode itself is OK + * since the dentry might have changed on the server. + */ +static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ +	int error; +	struct inode *inode = dentry->d_inode; + +	/* +	 * I believe we can only get a negative dentry here in the case of a +	 * procfs-style symlink. Just assume it's correct for now, but we may +	 * eventually need to do something more here. 
+	 */ +	if (!inode) { +		dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n", +				__func__, dentry->d_parent->d_name.name, +				dentry->d_name.name); +		return 1; +	} + +	if (is_bad_inode(inode)) { +		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", +				__func__, dentry->d_parent->d_name.name, +				dentry->d_name.name); +		return 0; +	} + +	error = nfs_revalidate_inode(NFS_SERVER(inode), inode); +	dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", +			__func__, inode->i_ino, error ? "invalid" : "valid"); +	return !error; +} + +/*   * This is called from dput() when d_count is going to 0.   */  static int nfs_dentry_delete(const struct dentry *dentry) @@ -1202,6 +1241,7 @@ static void nfs_d_release(struct dentry *dentry)  const struct dentry_operations nfs_dentry_operations = {  	.d_revalidate	= nfs_lookup_revalidate, +	.d_weak_revalidate	= nfs_weak_revalidate,  	.d_delete	= nfs_dentry_delete,  	.d_iput		= nfs_dentry_iput,  	.d_automount	= nfs_d_automount, @@ -2153,12 +2193,16 @@ static int nfs_open_permission_mask(int openflags)  {  	int mask = 0; -	if ((openflags & O_ACCMODE) != O_WRONLY) -		mask |= MAY_READ; -	if ((openflags & O_ACCMODE) != O_RDONLY) -		mask |= MAY_WRITE; -	if (openflags & __FMODE_EXEC) -		mask |= MAY_EXEC; +	if (openflags & __FMODE_EXEC) { +		/* ONLY check exec rights */ +		mask = MAY_EXEC; +	} else { +		if ((openflags & O_ACCMODE) != O_WRONLY) +			mask |= MAY_READ; +		if ((openflags & O_ACCMODE) != O_RDONLY) +			mask |= MAY_WRITE; +	} +  	return mask;  } diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index ca4b11ec87a..94552709229 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -10,6 +10,7 @@  #include <linux/module.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/dns_resolver.h>  #include "dns_resolve.h" @@ -42,6 +43,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);  #include <linux/seq_file.h>  #include <linux/inet.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/svcauth.h>  #include <linux/sunrpc/rpc_pipe_fs.h> @@ -142,7 +144,7 @@ static int nfs_dns_upcall(struct cache_detail *cd,  	ret = nfs_cache_upcall(cd, key->hostname);  	if (ret) -		ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); +		ret = sunrpc_cache_pipe_upcall(cd, ch);  	return ret;  } @@ -351,60 +353,47 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,  }  EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); +static struct cache_detail nfs_dns_resolve_template = { +	.owner		= THIS_MODULE, +	.hash_size	= NFS_DNS_HASHTBL_SIZE, +	.name		= "dns_resolve", +	.cache_put	= nfs_dns_ent_put, +	.cache_upcall	= nfs_dns_upcall, +	.cache_request	= nfs_dns_request, +	.cache_parse	= nfs_dns_parse, +	.cache_show	= nfs_dns_show, +	.match		= nfs_dns_match, +	.init		= nfs_dns_ent_init, +	.update		= nfs_dns_ent_update, +	.alloc		= nfs_dns_ent_alloc, +}; + +  int nfs_dns_resolver_cache_init(struct net *net)  { -	int err = -ENOMEM; +	int err;  	struct nfs_net *nn = net_generic(net, nfs_net_id); -	struct cache_detail *cd; -	struct cache_head **tbl; -	cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); -	if (cd == NULL) -		goto err_cd; +	nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net); +	if (IS_ERR(nn->nfs_dns_resolve)) +		return PTR_ERR(nn->nfs_dns_resolve); -	tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *), -			GFP_KERNEL); -	if (tbl == NULL) -		goto err_tbl; - -	cd->owner = THIS_MODULE, -	cd->hash_size = NFS_DNS_HASHTBL_SIZE, -	
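
Review note: nfs_open_permission_mask() above now treats __FMODE_EXEC as exclusive — an exec-time open asks the server for MAY_EXEC only, instead of also demanding the read/write bits that the execve() path never needs. The resulting logic, as a self-contained sketch:

	#include <linux/fs.h>	/* O_ACCMODE, __FMODE_EXEC, MAY_* */

	static int open_permission_mask(int openflags)
	{
		int mask = 0;

		if (openflags & __FMODE_EXEC)
			return MAY_EXEC;	/* ONLY check exec rights */
		if ((openflags & O_ACCMODE) != O_WRONLY)
			mask |= MAY_READ;
		if ((openflags & O_ACCMODE) != O_RDONLY)
			mask |= MAY_WRITE;
		return mask;
	}
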
cd->hash_table = tbl, -	cd->name = "dns_resolve", -	cd->cache_put = nfs_dns_ent_put, -	cd->cache_upcall = nfs_dns_upcall, -	cd->cache_parse = nfs_dns_parse, -	cd->cache_show = nfs_dns_show, -	cd->match = nfs_dns_match, -	cd->init = nfs_dns_ent_init, -	cd->update = nfs_dns_ent_update, -	cd->alloc = nfs_dns_ent_alloc, - -	nfs_cache_init(cd); -	err = nfs_cache_register_net(net, cd); +	err = nfs_cache_register_net(net, nn->nfs_dns_resolve);  	if (err)  		goto err_reg; -	nn->nfs_dns_resolve = cd;  	return 0;  err_reg: -	nfs_cache_destroy(cd); -	kfree(cd->hash_table); -err_tbl: -	kfree(cd); -err_cd: +	cache_destroy_net(nn->nfs_dns_resolve, net);  	return err;  }  void nfs_dns_resolver_cache_destroy(struct net *net)  {  	struct nfs_net *nn = net_generic(net, nfs_net_id); -	struct cache_detail *cd = nn->nfs_dns_resolve; -	nfs_cache_unregister_net(net, cd); -	nfs_cache_destroy(cd); -	kfree(cd->hash_table); -	kfree(cd); +	nfs_cache_unregister_net(net, nn->nfs_dns_resolve); +	cache_destroy_net(nn->nfs_dns_resolve, net);  }  static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3c2b893665b..29f4a48a0ee 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -292,7 +292,7 @@ static int  nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)  {  	int ret; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	do {  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c817787fbdb..24d1d1c5fca 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)  		nfs_fscache_inode_unlock(inode);  	}  } +EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);  /*   * Replace a per-inode cookie due to revalidation detecting a file having diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index c5b11b53ff3..4ecb76652eb 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,  }  /* + * Invalidate the contents of fscache for this inode.  This will not sleep. + */ +static inline void nfs_fscache_invalidate(struct inode *inode) +{ +	fscache_invalidate(NFS_I(inode)->fscache); +} + +/* + * Wait for an object to finish being invalidated. 
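
Review note: the dns_resolve conversion above replaces a hand-built cache_detail — a kzalloc'd struct plus a separately allocated hash table, with the bug-prone comma-spliced field "assignments" visible in the removed lines — with a static template that cache_create_net() clones per network namespace. The per-net setup then reduces to the shape below; my_template stands in for nfs_dns_resolve_template:

	static int my_cache_init_net(struct net *net)
	{
		struct nfs_net *nn = net_generic(net, nfs_net_id);
		int err;

		nn->nfs_dns_resolve = cache_create_net(&my_template, net);
		if (IS_ERR(nn->nfs_dns_resolve))
			return PTR_ERR(nn->nfs_dns_resolve);
		err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
		if (err)
			cache_destroy_net(nn->nfs_dns_resolve, net);
		return err;
	}
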
+ */ +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +{ +	fscache_wait_on_invalidate(NFS_I(inode)->fscache); +} + +/*   * indicate the client caching state as readable text   */  static inline const char *nfs_server_fscache_state(struct nfs_server *server) @@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)  	return "no ";  } -  #else /* CONFIG_NFS_FSCACHE */  static inline int nfs_fscache_register(void) { return 0; }  static inline void nfs_fscache_unregister(void) {} @@ -205,6 +220,10 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,  static inline void nfs_readpage_to_fscache(struct inode *inode,  					   struct page *page, int sync) {} + +static inline void nfs_fscache_invalidate(struct inode *inode) {} +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} +  static inline const char *nfs_server_fscache_state(struct nfs_server *server)  {  	return "no "; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c3664..44efaa8c5f7 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,  	}  	spin_unlock(&ret->d_lock);  out: -	if (name) -		kfree(name); +	kfree(name);  	nfs_free_fattr(fsinfo.fattr);  	return ret;  } diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index bc3968fa81e..dc0f98dfa71 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -97,7 +97,7 @@ static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)  static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)  {  	struct nfs4_string *owner = fattr->owner_name; -	__u32 uid; +	kuid_t uid;  	if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))  		return false; @@ -111,7 +111,7 @@ static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr  static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)  {  	struct nfs4_string *group = fattr->group_name; -	__u32 gid; +	kgid_t gid;  	if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))  		return false; @@ -193,7 +193,8 @@ static int nfs_idmap_init_keyring(void)  	if (!cred)  		return -ENOMEM; -	keyring = keyring_alloc(".id_resolver", 0, 0, cred, +	keyring = keyring_alloc(".id_resolver", +				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,  				(KEY_POS_ALL & ~KEY_POS_SETATTR) |  				KEY_USR_VIEW | KEY_USR_READ,  				KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -764,7 +765,7 @@ out:  static ssize_t  idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)  { -	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); +	struct rpc_inode *rpci = RPC_I(file_inode(filp));  	struct idmap *idmap = (struct idmap *)rpci->private;  	struct key_construction *cons;  	struct idmap_msg im; @@ -836,43 +837,61 @@ idmap_release_pipe(struct inode *inode)  	nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);  } -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)  {  	struct idmap *idmap = server->nfs_client->cl_idmap; +	__u32 id = -1; +	int ret = 0; -	if (nfs_map_string_to_numeric(name, namelen, uid)) -		return 0; -	return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap); +	if (!nfs_map_string_to_numeric(name, namelen, &id)) +		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); +	if (ret == 0) { +		*uid = make_kuid(&init_user_ns, id); +		if 
(!uid_valid(*uid)) +			ret = -ERANGE; +	} +	return ret;  } -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)  {  	struct idmap *idmap = server->nfs_client->cl_idmap; +	__u32 id = -1; +	int ret = 0; -	if (nfs_map_string_to_numeric(name, namelen, gid)) -		return 0; -	return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap); +	if (!nfs_map_string_to_numeric(name, namelen, &id)) +		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); +	if (ret == 0) { +		*gid = make_kgid(&init_user_ns, id); +		if (!gid_valid(*gid)) +			ret = -ERANGE; +	} +	return ret;  } -int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)  {  	struct idmap *idmap = server->nfs_client->cl_idmap;  	int ret = -EINVAL; +	__u32 id; +	id = from_kuid(&init_user_ns, uid);  	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap); +		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);  	if (ret < 0) -		ret = nfs_map_numeric_to_string(uid, buf, buflen); +		ret = nfs_map_numeric_to_string(id, buf, buflen);  	return ret;  } -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)  {  	struct idmap *idmap = server->nfs_client->cl_idmap;  	int ret = -EINVAL; +	__u32 id; +	id = from_kgid(&init_user_ns, gid);  	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap); +		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);  	if (ret < 0) -		ret = nfs_map_numeric_to_string(gid, buf, buflen); +		ret = nfs_map_numeric_to_string(id, buf, buflen);  	return ret;  } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2faae14d89f..1f941674b08 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)  	nfsi->attrtimeo_timestamp = jiffies;  	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); -	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) +	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {  		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; -	else +		nfs_fscache_invalidate(inode); +	} else {  		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +	}  }  void nfs_zap_caches(struct inode *inode) @@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)  	if (mapping->nrpages != 0) {  		spin_lock(&inode->i_lock);  		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; +		nfs_fscache_invalidate(inode);  		spin_unlock(&inode->i_lock);  	}  } @@ -234,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque)  	if (NFS_FILEID(inode) != fattr->fileid)  		return 0; +	if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) +		return 0;  	if (nfs_compare_fh(NFS_FH(inode), fh))  		return 0;  	if (is_bad_inode(inode) || NFS_STALE(inode)) @@ -329,8 +334,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  		inode->i_version = 0;  		inode->i_size = 0;  		clear_nlink(inode); -		inode->i_uid = -2; -		
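
Review note: the idmap conversion above never stores a raw __u32 in a kuid_t/kgid_t — every id goes through make_kuid()/make_kgid() against &init_user_ns and is checked with uid_valid()/gid_valid(), failing with -ERANGE when the id has no mapping in the kernel's view. Minimal sketch of the name-to-kuid direction; string_to_numeric() and lookup_id() are stand-ins for the nfs_idmap helpers:

	#include <linux/uidgid.h>

	static int map_name_to_uid(const char *name, size_t len, kuid_t *uid)
	{
		__u32 id = (__u32)-1;
		int ret = 0;

		if (!string_to_numeric(name, len, &id))		/* stand-in */
			ret = lookup_id(name, len, "uid", &id);	/* stand-in */
		if (ret == 0) {
			*uid = make_kuid(&init_user_ns, id);
			if (!uid_valid(*uid))
				ret = -ERANGE;	/* id has no mapping here */
		}
		return ret;
	}
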
inode->i_gid = -2; +		inode->i_uid = make_kuid(&init_user_ns, -2); +		inode->i_gid = make_kgid(&init_user_ns, -2);  		inode->i_blocks = 0;  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));  		nfsi->write_io = 0; @@ -691,10 +696,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)  	if (ctx->cred != NULL)  		put_rpccred(ctx->cred);  	dput(ctx->dentry); -	if (is_sync) -		nfs_sb_deactive(sb); -	else -		nfs_sb_deactive_async(sb); +	nfs_sb_deactive(sb);  	kfree(ctx->mdsthreshold);  	kfree(ctx);  } @@ -711,7 +713,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);   */  void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct nfs_inode *nfsi = NFS_I(inode);  	filp->private_data = get_nfs_open_context(ctx); @@ -744,7 +746,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c  static void nfs_file_clear_open_context(struct file *filp)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct nfs_open_context *ctx = nfs_file_open_context(filp);  	if (ctx) { @@ -881,7 +883,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));  	spin_unlock(&inode->i_lock);  	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); -	nfs_fscache_reset_inode_cookie(inode); +	nfs_fscache_wait_on_invalidate(inode);  	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",  			inode->i_sb->s_id, (long long)NFS_FILEID(inode));  	return 0; @@ -957,6 +959,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr  		i_size_write(inode, nfs_size_to_loff_t(fattr->size));  		ret |= NFS_INO_INVALID_ATTR;  	} + +	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +		nfs_fscache_invalidate(inode); +  	return ret;  } @@ -1002,9 +1008,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  	/* Have any file permissions changed? */  	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; -	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) +	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; -	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) +	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;  	/* Has the link count changed? 
*/ @@ -1205,8 +1211,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr  	struct nfs_inode *nfsi = NFS_I(inode);  	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; -	if (S_ISDIR(inode->i_mode)) +	if (S_ISDIR(inode->i_mode)) {  		nfsi->cache_validity |= NFS_INO_INVALID_DATA; +		nfs_fscache_invalidate(inode); +	}  	if ((fattr->valid & NFS_ATTR_FATTR) == 0)  		return 0;  	return nfs_refresh_inode_locked(inode, fattr); @@ -1431,7 +1439,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_OWNER) { -		if (inode->i_uid != fattr->uid) { +		if (!uid_eq(inode->i_uid, fattr->uid)) {  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  			inode->i_uid = fattr->uid;  		} @@ -1442,7 +1450,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_GROUP) { -		if (inode->i_gid != fattr->gid) { +		if (!gid_eq(inode->i_gid, fattr->gid)) {  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  			inode->i_gid = fattr->gid;  		} @@ -1494,6 +1502,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			(save_cache_validity & NFS_INO_REVAL_FORCED))  		nfsi->cache_validity |= invalid; +	if (invalid & NFS_INO_INVALID_DATA) +		nfs_fscache_invalidate(inode); +  	return 0;   out_err:  	/* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7df1a0..541c9ebdbc5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void);  extern void __exit unregister_nfs_fs(void);  extern void nfs_sb_active(struct super_block *sb);  extern void nfs_sb_deactive(struct super_block *sb); -extern void nfs_sb_deactive_async(struct super_block *sb);  /* namespace.c */  #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65..fc8dc20fdeb 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree:  	return mnt;  } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ +	if (NFS_FH(dentry->d_inode)->size != 0) +		return nfs_getattr(mnt, dentry, stat); +	generic_fillattr(dentry->d_inode, stat); +	return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ +	if (NFS_FH(dentry->d_inode)->size != 0) +		return nfs_setattr(dentry, attr); +	return -EACCES; +} +  const struct inode_operations nfs_mountpoint_inode_operations = {  	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr,  };  const struct inode_operations nfs_referral_inode_operations = { +	.getattr	= nfs_namespace_getattr, +	.setattr	= nfs_namespace_setattr,  };  static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 06b9df49f7f..62db136339e 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -290,8 +290,13 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)  	fattr->mode = be32_to_cpup(p++);  	fattr->nlink = be32_to_cpup(p++); -	fattr->uid = be32_to_cpup(p++); -	fattr->gid = be32_to_cpup(p++); +	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); +	if (!uid_valid(fattr->uid)) +		goto out_uid; +	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); +	if (!gid_valid(fattr->gid)) +		goto out_gid; +		  	fattr->size = be32_to_cpup(p++);  	fattr->du.nfs2.blocksize = 
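
Review note: the same kuid discipline reaches the XDR decoders — decode_fattr() here (and its nfs3 counterpart just below) now fails with -EINVAL when the server hands back a uid or gid that cannot be represented, rather than caching a bogus identity. Compressed sketch of the validation step, assuming p points into an already length-checked XDR buffer:

	static int decode_ids(__be32 *p, struct nfs_fattr *fattr)
	{
		fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
		if (!uid_valid(fattr->uid))
			return -EINVAL;		/* "returned invalid uid" */
		fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
		if (!gid_valid(fattr->gid))
			return -EINVAL;		/* "returned invalid gid" */
		return 0;
	}
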
be32_to_cpup(p++); @@ -313,6 +318,12 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)  	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);  	return 0; +out_uid: +	dprintk("NFS: returned invalid uid\n"); +	return -EINVAL; +out_gid: +	dprintk("NFS: returned invalid gid\n"); +	return -EINVAL;  out_overflow:  	print_overflow_msg(__func__, xdr);  	return -EIO; @@ -351,11 +362,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)  	else  		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);  	if (attr->ia_valid & ATTR_UID) -		*p++ = cpu_to_be32(attr->ia_uid); +		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));  	else  		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);  	if (attr->ia_valid & ATTR_GID) -		*p++ = cpu_to_be32(attr->ia_gid); +		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));  	else  		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);  	if (attr->ia_valid & ATTR_SIZE) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 70efb63b1e4..43ea96ced28 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -872,7 +872,7 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess  static int  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);  } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index bffc32406fb..fa6d72131c1 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -592,13 +592,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)  	if (attr->ia_valid & ATTR_UID) {  		*p++ = xdr_one; -		*p++ = cpu_to_be32(attr->ia_uid); +		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));  	} else  		*p++ = xdr_zero;  	if (attr->ia_valid & ATTR_GID) {  		*p++ = xdr_one; -		*p++ = cpu_to_be32(attr->ia_gid); +		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));  	} else  		*p++ = xdr_zero; @@ -657,8 +657,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)  	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;  	fattr->nlink = be32_to_cpup(p++); -	fattr->uid = be32_to_cpup(p++); -	fattr->gid = be32_to_cpup(p++); +	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); +	if (!uid_valid(fattr->uid)) +		goto out_uid; +	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); +	if (!gid_valid(fattr->gid)) +		goto out_gid;  	p = xdr_decode_size3(p, &fattr->size);  	p = xdr_decode_size3(p, &fattr->du.nfs3.used); @@ -675,6 +679,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)  	fattr->valid |= NFS_ATTR_FATTR_V3;  	return 0; +out_uid: +	dprintk("NFS: returned invalid uid\n"); +	return -EINVAL; +out_gid: +	dprintk("NFS: returned invalid gid\n"); +	return -EINVAL;  out_overflow:  	print_overflow_msg(__func__, xdr);  	return -EIO; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b074a..944c9a5c103 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,6 +13,8 @@  #define NFS4_MAX_LOOP_ON_RECOVER (10) +#include <linux/seqlock.h> +  struct idmap;  enum nfs4_client_state { @@ -90,6 +92,8 @@ struct nfs4_state_owner {  	unsigned long	     so_flags;  	struct list_head     so_states;  	struct nfs_seqid_counter so_seqid; +	seqcount_t	     so_reclaim_seqcount; +	struct mutex	     so_delegreturn_mutex;  };  enum { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc34726812..ac4fc9a8fdb 100644 --- a/fs/nfs/nfs4client.c +++ 
b/fs/nfs/nfs4client.c @@ -6,6 +6,7 @@  #include <linux/nfs_fs.h>  #include <linux/nfs_idmap.h>  #include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/auth.h>  #include <linux/sunrpc/xprt.h>  #include <linux/sunrpc/bc_xprt.h> @@ -29,15 +30,14 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)  	if (clp->rpc_ops->version != 4 || minorversion != 0)  		return ret; -retry: -	if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) -		return -ENOMEM; +	idr_preload(GFP_KERNEL);  	spin_lock(&nn->nfs_client_lock); -	ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); +	ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); +	if (ret >= 0) +		clp->cl_cb_ident = ret;  	spin_unlock(&nn->nfs_client_lock); -	if (ret == -EAGAIN) -		goto retry; -	return ret; +	idr_preload_end(); +	return ret < 0 ? ret : 0;  }  #ifdef CONFIG_NFS_V4_1 @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,  	error = nfs4_discover_server_trunking(clp, &old);  	if (error < 0)  		goto error; +	nfs_put_client(clp);  	if (clp != old) {  		clp->cl_preserve_clid = true; -		nfs_put_client(clp);  		clp = old; -		atomic_inc(&clp->cl_count);  	}  	return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new,  		.clientid	= new->cl_clientid,  		.confirm	= new->cl_confirm,  	}; -	int status; +	int status = -NFS4ERR_STALE_CLIENTID;  	spin_lock(&nn->nfs_client_lock);  	list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new,  		if (prev)  			nfs_put_client(prev); +		prev = pos;  		status = nfs4_proc_setclientid_confirm(pos, &clid, cred); -		if (status == 0) { +		switch (status) { +		case -NFS4ERR_STALE_CLIENTID: +			break; +		case 0:  			nfs4_swap_callback_idents(pos, new); -			nfs_put_client(pos); +			prev = NULL;  			*result = pos;  			dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",  				__func__, pos, atomic_read(&pos->cl_count)); -			return 0; -		} -		if (status != -NFS4ERR_STALE_CLIENTID) { -			nfs_put_client(pos); -			dprintk("NFS: <-- %s status = %d, no result\n", -				__func__, status); -			return status; +		default: +			goto out;  		}  		spin_lock(&nn->nfs_client_lock); -		prev = pos;  	} +	spin_unlock(&nn->nfs_client_lock); -	/* -	 * No matching nfs_client found.  This should be impossible, -	 * because the new nfs_client has already been added to -	 * nfs_client_list by nfs_get_client(). -	 * -	 * Don't BUG(), since the caller is holding a mutex. -	 */ +	/* No match found. 
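
Review note: nfs_get_cb_ident_idr() above is ported from the retrying idr_pre_get()/idr_get_new() pair to the idr_preload()/idr_alloc() API. idr_preload() pins preallocated nodes per-cpu (and disables preemption), so the allocation itself can run under a spinlock with GFP_NOWAIT and no retry loop. A generic sketch of the new pattern:

	#include <linux/idr.h>
	#include <linux/spinlock.h>

	static DEFINE_IDR(ident_idr);
	static DEFINE_SPINLOCK(ident_lock);

	/* Returns the new id (>= 0) or a negative errno. */
	static int alloc_ident(void *ptr)
	{
		int id;

		idr_preload(GFP_KERNEL);	/* may sleep; preallocates */
		spin_lock(&ident_lock);
		/* start = 0, end = 0: any available non-negative id */
		id = idr_alloc(&ident_idr, ptr, 0, 0, GFP_NOWAIT);
		spin_unlock(&ident_lock);
		idr_preload_end();
		return id;
	}
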
The server lost our clientid */ +out:  	if (prev)  		nfs_put_client(prev); -	spin_unlock(&nn->nfs_client_lock); -	pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); -	return -NFS4ERR_STALE_CLIENTID; +	dprintk("NFS: <-- %s status = %d\n", __func__, status); +	return status;  }  #ifdef CONFIG_NFS_V4_1 @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new,  {  	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);  	struct nfs_client *pos, *n, *prev = NULL; -	int error; +	int status = -NFS4ERR_STALE_CLIENTID;  	spin_lock(&nn->nfs_client_lock);  	list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new,  				nfs_put_client(prev);  			prev = pos; -			error = nfs_wait_client_init_complete(pos); -			if (error < 0) { +			nfs4_schedule_lease_recovery(pos); +			status = nfs_wait_client_init_complete(pos); +			if (status < 0) {  				nfs_put_client(pos);  				spin_lock(&nn->nfs_client_lock);  				continue;  			} - +			status = pos->cl_cons_state;  			spin_lock(&nn->nfs_client_lock); +			if (status < 0) +				continue;  		}  		if (pos->rpc_ops != new->rpc_ops) @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new,  		if (!nfs4_match_serverowners(pos, new))  			continue; +		atomic_inc(&pos->cl_count);  		spin_unlock(&nn->nfs_client_lock);  		dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",  			__func__, pos, atomic_read(&pos->cl_count)); @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new,  		return 0;  	} -	/* -	 * No matching nfs_client found.  This should be impossible, -	 * because the new nfs_client has already been added to -	 * nfs_client_list by nfs_get_client(). -	 * -	 * Don't BUG(), since the caller is holding a mutex. -	 */ +	/* No matching nfs_client found. 
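
Review note: the nfs41_walk_client_list() change above adds atomic_inc(&pos->cl_count) before dropping nfs_client_lock, so the matched client cannot be freed between the unlock and the caller taking ownership. The general pattern, with placeholder types:

	struct obj {
		struct list_head link;
		atomic_t	 refcount;
		int		 key;
	};

	static DEFINE_SPINLOCK(obj_lock);

	static struct obj *lookup_and_get(struct list_head *head, int key)
	{
		struct obj *pos;

		spin_lock(&obj_lock);
		list_for_each_entry(pos, head, link) {
			if (pos->key != key)
				continue;
			/* take the reference while the lock still
			 * guarantees pos is alive */
			atomic_inc(&pos->refcount);
			spin_unlock(&obj_lock);
			return pos;
		}
		spin_unlock(&obj_lock);
		return NULL;
	}
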
*/  	spin_unlock(&nn->nfs_client_lock); -	pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); -	return -NFS4ERR_STALE_CLIENTID; +	dprintk("NFS: <-- %s status = %d\n", __func__, status); +	return status;  }  #endif	/* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e7699308364..13e6bb3e3fe 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -5,6 +5,7 @@   */  #include <linux/nfs_fs.h>  #include "internal.h" +#include "fscache.h"  #include "pnfs.h"  #define NFSDBG_FACILITY		NFSDBG_FILE @@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));  	nfs_file_set_open_context(filp, ctx); +	nfs_fscache_set_inode_cookie(inode, filp);  	err = 0;  out_put_ctx: @@ -92,7 +94,7 @@ static int  nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)  {  	int ret; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	do {  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 194c4841033..49eeb044c10 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data)  		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,  							&hdr->pages, -							hdr->completion_ops); +							hdr->completion_ops, +							hdr->dreq);  	}  } @@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data)  		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,  							&hdr->pages, -							hdr->completion_ops); +							hdr->completion_ops, +							hdr->dreq);  	}  } diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 8c07241fe52..b8da95548d3 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -36,7 +36,7 @@   * Default data server connection timeout and retrans vaules.   * Set by module paramters dataserver_timeo and dataserver_retrans.   
*/ -#define NFS4_DEF_DS_TIMEO   60 +#define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */  #define NFS4_DEF_DS_RETRANS 5  /* diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b720064bcd7..1fe284f01f8 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -31,6 +31,7 @@  #include <linux/nfs_fs.h>  #include <linux/vmalloc.h>  #include <linux/module.h> +#include <linux/sunrpc/addr.h>  #include "internal.h"  #include "nfs4session.h" diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 1e09eb78543..0dd766079e1 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -14,6 +14,7 @@  #include <linux/slab.h>  #include <linux/string.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/vfs.h>  #include <linux/inet.h>  #include "internal.h" diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 493f0f41c55..b2671cb0f90 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -64,7 +64,7 @@  #include "pnfs.h"  #include "netns.h"  #include "nfs4session.h" - +#include "fscache.h"  #define NFSDBG_FACILITY		NFSDBG_PROC @@ -93,6 +93,8 @@ static int nfs4_map_errors(int err)  		return err;  	switch (err) {  	case -NFS4ERR_RESOURCE: +	case -NFS4ERR_LAYOUTTRYLATER: +	case -NFS4ERR_RECALLCONFLICT:  		return -EREMOTEIO;  	case -NFS4ERR_WRONGSEC:  		return -EPERM; @@ -734,6 +736,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)  	if (!cinfo->atomic || cinfo->before != dir->i_version)  		nfs_force_lookup_revalidate(dir);  	dir->i_version = cinfo->after; +	nfs_fscache_invalidate(dir);  	spin_unlock(&dir->i_lock);  } @@ -895,6 +898,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)  		return 0;  	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))  		return 0; +	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) +		return 0;  	nfs_mark_delegation_referenced(delegation);  	return 1;  } @@ -972,6 +977,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat  	spin_lock(&deleg_cur->lock);  	if (nfsi->delegation != deleg_cur || +	   test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||  	    (deleg_cur->type & fmode) != fmode)  		goto no_delegation_unlock; @@ -1154,6 +1160,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)  			data->o_arg.fmode);  	iput(inode);  out: +	nfs_release_seqid(data->o_arg.seqid);  	return state;  err_put_inode:  	iput(inode); @@ -1351,19 +1358,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state  			case -NFS4ERR_BAD_HIGH_SLOT:  			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  			case -NFS4ERR_DEADSESSION: +				set_bit(NFS_DELEGATED_STATE, &state->flags);  				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); +				err = -EAGAIN;  				goto out;  			case -NFS4ERR_STALE_CLIENTID:  			case -NFS4ERR_STALE_STATEID: +				set_bit(NFS_DELEGATED_STATE, &state->flags);  			case -NFS4ERR_EXPIRED:  				/* Don't recall a delegation if it was lost */  				nfs4_schedule_lease_recovery(server->nfs_client); +				err = -EAGAIN;  				goto out; -			case -ERESTARTSYS: -				/* -				 * The show must go on: exit, but mark the -				 * stateid as needing recovery. 
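
Review note: the NFS4_DEF_DS_TIMEO change above adjusts value and units together. The constant is consumed in tenths of a second, so the old default of 60 worked out to only 6 seconds; 600 raises the effective dataserver timeout to 60 seconds and documents the unit in the comment. Assuming the usual conversion when such a value feeds an RPC timeout (this mirrors how the filelayout DS connect path consumes it, but is shown here only as an illustration):

	/* tenths of a second -> jiffies: 60 => 6 s, 600 => 60 s */
	static unsigned long ds_timeo_to_jiffies(unsigned int tenths)
	{
		return (unsigned long)tenths * HZ / 10;
	}
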
-				 */  			case -NFS4ERR_DELEG_REVOKED:  			case -NFS4ERR_ADMIN_REVOKED:  			case -NFS4ERR_BAD_STATEID: @@ -1374,6 +1380,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state  				err = 0;  				goto out;  		} +		set_bit(NFS_DELEGATED_STATE, &state->flags);  		err = nfs4_handle_exception(server, err, &exception);  	} while (exception.retry);  out: @@ -1462,7 +1469,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)  	struct nfs4_state_owner *sp = data->owner;  	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) -		return; +		goto out_wait;  	/*  	 * Check if we still need to send an OPEN call, or if we can use  	 * a delegation instead. @@ -1497,6 +1504,7 @@ unlock_no_action:  	rcu_read_unlock();  out_no_action:  	task->tk_action = NULL; +out_wait:  	nfs4_sequence_done(task, &data->o_res.seq_res);  } @@ -1625,7 +1633,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)  static int nfs4_opendata_access(struct rpc_cred *cred,  				struct nfs4_opendata *opendata, -				struct nfs4_state *state, fmode_t fmode) +				struct nfs4_state *state, fmode_t fmode, +				int openflags)  {  	struct nfs_access_entry cache;  	u32 mask; @@ -1637,11 +1646,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,  	mask = 0;  	/* don't check MAY_WRITE - a newly created file may not have -	 * write mode bits, but POSIX allows the creating process to write */ -	if (fmode & FMODE_READ) -		mask |= MAY_READ; -	if (fmode & FMODE_EXEC) -		mask |= MAY_EXEC; +	 * write mode bits, but POSIX allows the creating process to write. +	 * use openflags to check for exec, because fmode won't +	 * always have FMODE_EXEC set when file open for exec. */ +	if (openflags & __FMODE_EXEC) { +		/* ONLY check for exec rights */ +		mask = MAY_EXEC; +	} else if (fmode & FMODE_READ) +		mask = MAY_READ;  	cache.cred = cred;  	cache.jiffies = jiffies; @@ -1840,6 +1852,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct  		sattr->ia_valid |= ATTR_MTIME;  } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, +		fmode_t fmode, +		int flags, +		struct nfs4_state **res) +{ +	struct nfs4_state_owner *sp = opendata->owner; +	struct nfs_server *server = sp->so_server; +	struct nfs4_state *state; +	unsigned int seq; +	int ret; + +	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + +	ret = _nfs4_proc_open(opendata); +	if (ret != 0) +		goto out; + +	state = nfs4_opendata_to_nfs4_state(opendata); +	ret = PTR_ERR(state); +	if (IS_ERR(state)) +		goto out; +	if (server->caps & NFS_CAP_POSIX_LOCK) +		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + +	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); +	if (ret != 0) +		goto out; + +	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { +		nfs4_schedule_stateid_recovery(server, state); +		nfs4_wait_clnt_recover(server->nfs_client); +	} +	*res = state; +out: +	return ret; +} +  /*   * Returns a referenced nfs4_state   */ @@ -1884,18 +1933,7 @@ static int _nfs4_do_open(struct inode *dir,  	if (dentry->d_inode != NULL)  		opendata->state = nfs4_get_open_state(dentry->d_inode, sp); -	status = _nfs4_proc_open(opendata); -	if (status != 0) -		goto err_opendata_put; - -	state = nfs4_opendata_to_nfs4_state(opendata); -	status = PTR_ERR(state); -	if (IS_ERR(state)) -		goto err_opendata_put; -	if (server->caps & NFS_CAP_POSIX_LOCK) -		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - -	status = nfs4_opendata_access(cred, opendata, state, fmode); +	status = 
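
Review note: _nfs4_open_and_get_state() above samples the owner's so_reclaim_seqcount with raw_seqcount_begin() before issuing the OPEN; if read_seqcount_retry() shows that state recovery ran while the RPC was in flight, the brand-new stateid is treated as suspect and recovery is scheduled for it. Reader-side shape — send_open() and recover() are stand-ins, and the matching writer appears in the nfs4state.c hunks further down:

	#include <linux/seqlock.h>

	static seqcount_t reclaim_seq; /* seqcount_init() at setup;
					* writer runs under its own lock */

	static int open_with_reclaim_check(void)
	{
		unsigned int seq;
		int ret;

		seq = raw_seqcount_begin(&reclaim_seq);
		ret = send_open();		/* stand-in for the OPEN RPC */
		if (ret)
			return ret;
		if (read_seqcount_retry(&reclaim_seq, seq))
			recover();	/* stand-in: stateid may be stale */
		return 0;
	}
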
_nfs4_open_and_get_state(opendata, fmode, flags, &state);  	if (status != 0)  		goto err_opendata_put; @@ -2083,7 +2121,7 @@ static void nfs4_free_closedata(void *data)  	nfs4_put_open_state(calldata->state);  	nfs_free_seqid(calldata->arg.seqid);  	nfs4_put_state_owner(sp); -	nfs_sb_deactive_async(sb); +	nfs_sb_deactive(sb);  	kfree(calldata);  } @@ -2145,7 +2183,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  	dprintk("%s: begin!\n", __func__);  	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) -		return; +		goto out_wait;  	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];  	calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -2167,16 +2205,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  	if (!call_close) {  		/* Note: exit _without_ calling nfs4_close_done */ -		task->tk_action = NULL; -		nfs4_sequence_done(task, &calldata->res.seq_res); -		goto out; +		goto out_no_action;  	}  	if (calldata->arg.fmode == 0) {  		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];  		if (calldata->roc &&  		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) -			goto out; +			goto out_wait;  	}  	nfs_fattr_init(calldata->res.fattr); @@ -2186,8 +2222,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  				&calldata->res.seq_res,  				task) != 0)  		nfs_release_seqid(calldata->arg.seqid); -out:  	dprintk("%s: done!\n", __func__); +	return; +out_no_action: +	task->tk_action = NULL; +out_wait: +	nfs4_sequence_done(task, &calldata->res.seq_res);  }  static const struct rpc_call_ops nfs4_close_ops = { @@ -4418,12 +4458,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)  	struct nfs4_unlockdata *calldata = data;  	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) -		return; +		goto out_wait;  	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {  		/* Note: exit _without_ running nfs4_locku_done */ -		task->tk_action = NULL; -		nfs4_sequence_done(task, &calldata->res.seq_res); -		return; +		goto out_no_action;  	}  	calldata->timestamp = jiffies;  	if (nfs4_setup_sequence(calldata->server, @@ -4431,6 +4469,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)  				&calldata->res.seq_res,  				task) != 0)  		nfs_release_seqid(calldata->arg.seqid); +	return; +out_no_action: +	task->tk_action = NULL; +out_wait: +	nfs4_sequence_done(task, &calldata->res.seq_res);  }  static const struct rpc_call_ops nfs4_locku_ops = { @@ -4477,7 +4520,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,  static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)  { -	struct nfs_inode *nfsi = NFS_I(state->inode); +	struct inode *inode = state->inode; +	struct nfs4_state_owner *sp = state->owner; +	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_seqid *seqid;  	struct nfs4_lock_state *lsp;  	struct rpc_task *task; @@ -4487,12 +4532,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *  	status = nfs4_set_lock_state(state, request);  	/* Unlock _before_ we do the RPC call */  	request->fl_flags |= FL_EXISTS; +	/* Exclude nfs_delegation_claim_locks() */ +	mutex_lock(&sp->so_delegreturn_mutex); +	/* Exclude nfs4_reclaim_open_stateid() - note nesting! 
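
Review note: several rpc_call_prepare callbacks in this file (open and close above, locku here, lock just below) converge on the same label discipline — every early exit funnels through an out_wait label so nfs4_sequence_done() is always called on the abandoned sequence, instead of returning with the session-slot accounting unbalanced. The pattern, with struct calldata as a stand-in:

	static void example_prepare(struct rpc_task *task, void *data)
	{
		struct calldata *d = data;		/* stand-in */

		if (nfs_wait_on_sequence(d->seqid, task) != 0)
			goto out_wait;
		if (nothing_to_do(d))			/* stand-in */
			goto out_no_action;
		/* ... set up and queue the RPC ... */
		return;
	out_no_action:
		task->tk_action = NULL;
	out_wait:
		nfs4_sequence_done(task, &d->seq_res);
	}
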
*/  	down_read(&nfsi->rwsem);  	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {  		up_read(&nfsi->rwsem); +		mutex_unlock(&sp->so_delegreturn_mutex);  		goto out;  	}  	up_read(&nfsi->rwsem); +	mutex_unlock(&sp->so_delegreturn_mutex);  	if (status != 0)  		goto out;  	/* Is this a delegated lock? */ @@ -4571,7 +4621,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)  	dprintk("%s: begin!\n", __func__);  	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) -		return; +		goto out_wait;  	/* Do we need to do an open_to_lock_owner? */  	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {  		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { @@ -4591,6 +4641,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)  	nfs_release_seqid(data->arg.open_seqid);  out_release_lock_seqid:  	nfs_release_seqid(data->arg.lock_seqid); +out_wait: +	nfs4_sequence_done(task, &data->res.seq_res);  	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);  } @@ -4808,8 +4860,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques  static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)  { +	struct nfs4_state_owner *sp = state->owner;  	struct nfs_inode *nfsi = NFS_I(state->inode);  	unsigned char fl_flags = request->fl_flags; +	unsigned int seq;  	int status = -ENOLCK;  	if ((fl_flags & FL_POSIX) && @@ -4831,9 +4885,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock  		status = do_vfs_lock(request->fl_file, request);  		goto out_unlock;  	} +	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); +	up_read(&nfsi->rwsem);  	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);  	if (status != 0) +		goto out; +	down_read(&nfsi->rwsem); +	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { +		status = -NFS4ERR_DELAY;  		goto out_unlock; +	}  	/* Note: we always want to sleep here! */  	request->fl_flags = fl_flags | FL_SLEEP;  	if (do_vfs_lock(request->fl_file, request) < 0) @@ -4940,24 +5001,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)  			case 0:  			case -ESTALE:  				goto out; -			case -NFS4ERR_EXPIRED: -				nfs4_schedule_stateid_recovery(server, state);  			case -NFS4ERR_STALE_CLIENTID:  			case -NFS4ERR_STALE_STATEID: +				set_bit(NFS_DELEGATED_STATE, &state->flags); +			case -NFS4ERR_EXPIRED:  				nfs4_schedule_lease_recovery(server->nfs_client); +				err = -EAGAIN;  				goto out;  			case -NFS4ERR_BADSESSION:  			case -NFS4ERR_BADSLOT:  			case -NFS4ERR_BAD_HIGH_SLOT:  			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  			case -NFS4ERR_DEADSESSION: +				set_bit(NFS_DELEGATED_STATE, &state->flags);  				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); +				err = -EAGAIN;  				goto out; -			case -ERESTARTSYS: -				/* -				 * The show must go on: exit, but mark the -				 * stateid as needing recovery. 
-				 */  			case -NFS4ERR_DELEG_REVOKED:  			case -NFS4ERR_ADMIN_REVOKED:  			case -NFS4ERR_BAD_STATEID: @@ -4970,9 +5029,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)  				/* kill_proc(fl->fl_pid, SIGLOST, 1); */  				err = 0;  				goto out; -			case -NFS4ERR_DELAY: -				break;  		} +		set_bit(NFS_DELEGATED_STATE, &state->flags);  		err = nfs4_handle_exception(server, err, &exception);  	} while (exception.retry);  out: @@ -5990,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)  	struct nfs_server *server = NFS_SERVER(inode);  	struct pnfs_layout_hdr *lo;  	struct nfs4_state *state = NULL; +	unsigned long timeo, giveup;  	dprintk("--> %s\n", __func__); @@ -6001,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)  		goto out;  	case -NFS4ERR_LAYOUTTRYLATER:  	case -NFS4ERR_RECALLCONFLICT: -		task->tk_status = -NFS4ERR_DELAY; +		timeo = rpc_get_timeout(task->tk_client); +		giveup = lgp->args.timestamp + timeo; +		if (time_after(giveup, jiffies)) +			task->tk_status = -NFS4ERR_DELAY;  		break;  	case -NFS4ERR_EXPIRED:  	case -NFS4ERR_BAD_STATEID: @@ -6074,11 +6136,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)  static void nfs4_layoutget_release(void *calldata)  {  	struct nfs4_layoutget *lgp = calldata; -	struct nfs_server *server = NFS_SERVER(lgp->args.inode); +	struct inode *inode = lgp->args.inode; +	struct nfs_server *server = NFS_SERVER(inode);  	size_t max_pages = max_response_pages(server);  	dprintk("--> %s\n", __func__);  	nfs4_free_pages(lgp->args.layout.pages, max_pages); +	pnfs_put_layout_hdr(NFS_I(inode)->layout);  	put_nfs_open_context(lgp->args.ctx);  	kfree(calldata);  	dprintk("<-- %s\n", __func__); @@ -6093,7 +6157,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {  struct pnfs_layout_segment *  nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)  { -	struct nfs_server *server = NFS_SERVER(lgp->args.inode); +	struct inode *inode = lgp->args.inode; +	struct nfs_server *server = NFS_SERVER(inode);  	size_t max_pages = max_response_pages(server);  	struct rpc_task *task;  	struct rpc_message msg = { @@ -6119,17 +6184,23 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)  		return ERR_PTR(-ENOMEM);  	}  	lgp->args.layout.pglen = max_pages * PAGE_SIZE; +	lgp->args.timestamp = jiffies;  	lgp->res.layoutp = &lgp->args.layout;  	lgp->res.seq_res.sr_slot = NULL;  	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + +	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */ +	pnfs_get_layout_hdr(NFS_I(inode)->layout); +  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return ERR_CAST(task);  	status = nfs4_wait_for_completion_rpc_task(task);  	if (status == 0)  		status = task->tk_status; -	if (status == 0) +	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ +	if (status == 0 && lgp->res.layoutp->len)  		lseg = pnfs_layout_process(lgp);  	rpc_put_task(task);  	dprintk("<-- %s status=%d\n", __func__, status); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41..6ace365c633 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,  	clp->cl_confirm = clid.confirm;  	status = nfs40_walk_client_list(clp, result, cred); -	switch (status) { -	case -NFS4ERR_STALE_CLIENTID: -		set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); -	case 0: +	if (status == 0) {  		/* Sustain the 
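
Review note: nfs4_layoutget_done() above stops retrying LAYOUTTRYLATER/RECALLCONFLICT forever — lgp->args.timestamp is stamped with jiffies at submission, and -NFS4ERR_DELAY (which requeues the RPC) is only set while the deadline derived from the client's RPC timeout is still ahead. The deadline test, isolated:

	#include <linux/jiffies.h>

	/* timestamp: jiffies at submission; timeo: rpc_get_timeout() value */
	static bool layoutget_may_retry(unsigned long timestamp,
					unsigned long timeo)
	{
		unsigned long giveup = timestamp + timeo;

		/* time_after() is wraparound-safe */
		return time_after(giveup, jiffies);
	}
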
lease, even if it's empty.  If the clientid4  		 * goes stale it's of no use for trunking discovery. */  		nfs4_schedule_state_renewal(*result); -		break;  	} -  out:  	return status;  } @@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server,  	nfs4_init_seqid_counter(&sp->so_seqid);  	atomic_set(&sp->so_count, 1);  	INIT_LIST_HEAD(&sp->so_lru); +	seqcount_init(&sp->so_reclaim_seqcount); +	mutex_init(&sp->so_delegreturn_mutex);  	return sp;  } @@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs  	 * recovering after a network partition or a reboot from a  	 * server that doesn't support a grace period.  	 */ -restart:  	spin_lock(&sp->so_lock); +	write_seqcount_begin(&sp->so_reclaim_seqcount); +restart:  	list_for_each_entry(state, &sp->so_states, open_states) {  		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))  			continue; @@ -1417,6 +1415,7 @@ restart:  				}  				spin_unlock(&state->state_lock);  				nfs4_put_open_state(state); +				spin_lock(&sp->so_lock);  				goto restart;  			}  		} @@ -1454,12 +1453,17 @@ restart:  				goto out_err;  		}  		nfs4_put_open_state(state); +		spin_lock(&sp->so_lock);  		goto restart;  	} +	write_seqcount_end(&sp->so_reclaim_seqcount);  	spin_unlock(&sp->so_lock);  	return 0;  out_err:  	nfs4_put_open_state(state); +	spin_lock(&sp->so_lock); +	write_seqcount_end(&sp->so_reclaim_seqcount); +	spin_unlock(&sp->so_lock);  	return status;  } @@ -1863,6 +1867,7 @@ again:  	case -ETIMEDOUT:  	case -EAGAIN:  		ssleep(1); +	case -NFS4ERR_STALE_CLIENTID:  		dprintk("NFS: %s after status %d, retrying\n",  			__func__, status);  		goto again; @@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp)  	nfs4_begin_drain_session(clp);  	cred = nfs4_get_exchange_id_cred(clp);  	status = nfs4_proc_destroy_session(clp->cl_session, cred); -	if (status && status != -NFS4ERR_BADSESSION && -	    status != -NFS4ERR_DEADSESSION) { +	switch (status) { +	case 0: +	case -NFS4ERR_BADSESSION: +	case -NFS4ERR_DEADSESSION: +		break; +	case -NFS4ERR_BACK_CHAN_BUSY: +	case -NFS4ERR_DELAY: +		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); +		status = 0; +		ssleep(1); +		goto out; +	default:  		status = nfs4_recovery_handle_error(clp, status);  		goto out;  	} diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 84d2e9e2f31..569b166cc05 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -28,7 +28,7 @@ static struct file_system_type nfs4_remote_fs_type = {  	.name		= "nfs4",  	.mount		= nfs4_remote_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  static struct file_system_type nfs4_remote_referral_fs_type = { @@ -36,7 +36,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {  	.name		= "nfs4",  	.mount		= nfs4_remote_referral_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  struct file_system_type nfs4_referral_fs_type = { @@ -44,7 +44,7 @@ struct file_system_type nfs4_referral_fs_type = {  	.name		= "nfs4",  	.mount		= nfs4_referral_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  static const struct super_operations nfs4_sops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 26b14392043..e3edda554ac 
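
Review note: the nfs4_reclaim_open_state() hunks above are the writer half of the so_reclaim_seqcount scheme sketched earlier — reclaim brackets its walk with write_seqcount_begin()/end() inside so_lock, every goto restart re-acquires so_lock first, and the error path must close the write section under the lock as well, or concurrent openers would retry indefinitely. Skeleton of that bracketing:

	static int reclaim_open_state(struct nfs4_state_owner *sp)
	{
		int status = 0;

		spin_lock(&sp->so_lock);
		write_seqcount_begin(&sp->so_reclaim_seqcount);
		/* ... walk sp->so_states; any restart re-takes so_lock
		 * before jumping back into the loop ... */
		write_seqcount_end(&sp->so_reclaim_seqcount);
		spin_unlock(&sp->so_lock);
		return status;
	}
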
100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1002,7 +1002,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const  		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);  		if (owner_namelen < 0) {  			dprintk("nfs: couldn't resolve uid %d to string\n", -					iap->ia_uid); +					from_kuid(&init_user_ns, iap->ia_uid));  			/* XXX */  			strcpy(owner_name, "nobody");  			owner_namelen = sizeof("nobody") - 1; @@ -1014,7 +1014,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const  		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);  		if (owner_grouplen < 0) {  			dprintk("nfs: couldn't resolve gid %d to string\n", -					iap->ia_gid); +					from_kgid(&init_user_ns, iap->ia_gid));  			strcpy(owner_group, "nobody");  			owner_grouplen = sizeof("nobody") - 1;  			/* goto out; */ @@ -3778,14 +3778,14 @@ out_overflow:  }  static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, -		const struct nfs_server *server, uint32_t *uid, +		const struct nfs_server *server, kuid_t *uid,  		struct nfs4_string *owner_name)  {  	uint32_t len;  	__be32 *p;  	int ret = 0; -	*uid = -2; +	*uid = make_kuid(&init_user_ns, -2);  	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))  		return -EIO;  	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { @@ -3813,7 +3813,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,  					__func__, len);  		bitmap[1] &= ~FATTR4_WORD1_OWNER;  	} -	dprintk("%s: uid=%d\n", __func__, (int)*uid); +	dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));  	return ret;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -3821,14 +3821,14 @@ out_overflow:  }  static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, -		const struct nfs_server *server, uint32_t *gid, +		const struct nfs_server *server, kgid_t *gid,  		struct nfs4_string *group_name)  {  	uint32_t len;  	__be32 *p;  	int ret = 0; -	*gid = -2; +	*gid = make_kgid(&init_user_ns, -2);  	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))  		return -EIO;  	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { @@ -3856,7 +3856,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,  					__func__, len);  		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;  	} -	dprintk("%s: gid=%d\n", __func__, (int)*gid); +	dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));  	return ret;  out_overflow:  	print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f990656f8..88f9611a945 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {  	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |  				   PNFS_LAYOUTRET_ON_ERROR, +	.owner		       	 = THIS_MODULE,  	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,  	.free_layout_hdr         = objlayout_free_layout_hdr, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e7165d91536..48ac5aad625 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -254,7 +254,7 @@ static void  pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)  {  	lo->plh_retry_timestamp = jiffies; -	if (test_and_set_bit(fail_bit, &lo->plh_flags)) +	if (!test_and_set_bit(fail_bit, &lo->plh_flags))  		atomic_inc(&lo->plh_refcount);  } @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)  }  
EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. - */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, +		struct list_head *layout_list)  { -	struct nfs_server *server;  	struct pnfs_layout_hdr *lo; -	LIST_HEAD(tmp_list); +	bool ret = false; -	nfs4_deviceid_mark_client_invalid(clp); -	nfs4_deviceid_purge_client(clp); +	spin_lock(&inode->i_lock); +	lo = NFS_I(inode)->layout; +	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { +		pnfs_get_layout_hdr(lo); +		list_add(&lo->plh_bulk_destroy, layout_list); +		ret = true; +	} +	spin_unlock(&inode->i_lock); +	return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, +		struct nfs_server *server, +		struct list_head *layout_list) +{ +	struct pnfs_layout_hdr *lo, *next; +	struct inode *inode; + +	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { +		inode = igrab(lo->plh_inode); +		if (inode == NULL) +			continue; +		list_del_init(&lo->plh_layouts); +		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) +			continue; +		rcu_read_unlock(); +		spin_unlock(&clp->cl_lock); +		iput(inode); +		spin_lock(&clp->cl_lock); +		rcu_read_lock(); +		return -EAGAIN; +	} +	return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, +		bool is_bulk_recall) +{ +	struct pnfs_layout_hdr *lo; +	struct inode *inode; +	struct pnfs_layout_range range = { +		.iomode = IOMODE_ANY, +		.offset = 0, +		.length = NFS4_MAX_UINT64, +	}; +	LIST_HEAD(lseg_list); +	int ret = 0; + +	while (!list_empty(layout_list)) { +		lo = list_entry(layout_list->next, struct pnfs_layout_hdr, +				plh_bulk_destroy); +		dprintk("%s freeing layout for inode %lu\n", __func__, +			lo->plh_inode->i_ino); +		inode = lo->plh_inode; +		spin_lock(&inode->i_lock); +		list_del_init(&lo->plh_bulk_destroy); +		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ +		if (is_bulk_recall) +			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) +			ret = -EAGAIN; +		spin_unlock(&inode->i_lock); +		pnfs_free_lseg_list(&lseg_list); +		pnfs_put_layout_hdr(lo); +		iput(inode); +	} +	return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, +		struct nfs_fsid *fsid, +		bool is_recall) +{ +	struct nfs_server *server; +	LIST_HEAD(layout_list);  	spin_lock(&clp->cl_lock);  	rcu_read_lock(); +restart:  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { -		if (!list_empty(&server->layouts)) -			list_splice_init(&server->layouts, &tmp_list); +		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) +			continue; +		if (pnfs_layout_bulk_destroy_byserver_locked(clp, +				server, +				&layout_list) != 0) +			goto restart;  	}  	rcu_read_unlock();  	spin_unlock(&clp->cl_lock); -	while (!list_empty(&tmp_list)) { -		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, -				plh_layouts); -		dprintk("%s freeing layout for inode %lu\n", __func__, -			lo->plh_inode->i_ino); -		list_del_init(&lo->plh_layouts); -		pnfs_destroy_layout(NFS_I(lo->plh_inode)); +	if (list_empty(&layout_list)) +		return 0; +	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, +		bool is_recall) +{ +	struct nfs_server *server; +	LIST_HEAD(layout_list); + +	
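
Review note: the pnfs rework above replaces the old splice-and-destroy of server->layouts with a two-phase bulk destroy — layout headers are collected onto a private list via plh_bulk_destroy under clp->cl_lock and RCU, and freed afterwards without those locks held. Whenever the collector must drop the locks (to iput() an inode it could not enqueue), it returns -EAGAIN and the caller rescans from the top, since the iteration is no longer valid. The restartable scan, schematically; collect_locked() stands in for pnfs_layout_bulk_destroy_byserver_locked():

	static void collect_all(struct nfs_client *clp, struct list_head *out)
	{
		struct nfs_server *server;

		spin_lock(&clp->cl_lock);
		rcu_read_lock();
	restart:
		list_for_each_entry_rcu(server, &clp->cl_superblocks,
					client_link) {
			/* helper may have dropped both locks: rescan */
			if (collect_locked(clp, server, out) != 0)
				goto restart;
		}
		rcu_read_unlock();
		spin_unlock(&clp->cl_lock);
	}
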
spin_lock(&clp->cl_lock); +	rcu_read_lock(); +restart: +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		if (pnfs_layout_bulk_destroy_byserver_locked(clp, +					server, +					&layout_list) != 0) +			goto restart;  	} +	rcu_read_unlock(); +	spin_unlock(&clp->cl_lock); + +	if (list_empty(&layout_list)) +		return 0; +	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +	nfs4_deviceid_mark_client_invalid(clp); +	nfs4_deviceid_purge_client(clp); + +	pnfs_destroy_layouts_byclid(clp, false);  }  /* @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino,  	atomic_set(&lo->plh_refcount, 1);  	INIT_LIST_HEAD(&lo->plh_layouts);  	INIT_LIST_HEAD(&lo->plh_segs); -	INIT_LIST_HEAD(&lo->plh_bulk_recall); +	INIT_LIST_HEAD(&lo->plh_bulk_destroy);  	lo->plh_inode = ino;  	lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);  	return lo; @@ -1071,7 +1181,7 @@ pnfs_update_layout(struct inode *ino,  	struct nfs_client *clp = server->nfs_client;  	struct pnfs_layout_hdr *lo;  	struct pnfs_layout_segment *lseg = NULL; -	bool first = false; +	bool first;  	if (!pnfs_enabled_sb(NFS_SERVER(ino)))  		goto out; @@ -1105,10 +1215,9 @@ pnfs_update_layout(struct inode *ino,  		goto out_unlock;  	atomic_inc(&lo->plh_outstanding); -	if (list_empty(&lo->plh_segs)) -		first = true; - +	first = list_empty(&lo->plh_layouts) ? true : false;  	spin_unlock(&ino->i_lock); +  	if (first) {  		/* The lo must be on the clp list if there is any  		 * chance of a CB_LAYOUTRECALL(FILE) coming in. @@ -1312,13 +1421,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);  int pnfs_write_done_resend_to_mds(struct inode *inode,  				struct list_head *head, -				const struct nfs_pgio_completion_ops *compl_ops) +				const struct nfs_pgio_completion_ops *compl_ops, +				struct nfs_direct_req *dreq)  {  	struct nfs_pageio_descriptor pgio;  	LIST_HEAD(failed);  	/* Resend all requests through the MDS */  	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); +	pgio.pg_dreq = dreq;  	while (!list_empty(head)) {  		struct nfs_page *req = nfs_list_entry(head->next); @@ -1353,7 +1464,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))  		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,  							&hdr->pages, -							hdr->completion_ops); +							hdr->completion_ops, +							hdr->dreq);  }  /* @@ -1468,13 +1580,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);  int pnfs_read_done_resend_to_mds(struct inode *inode,  				struct list_head *head, -				const struct nfs_pgio_completion_ops *compl_ops) +				const struct nfs_pgio_completion_ops *compl_ops, +				struct nfs_direct_req *dreq)  {  	struct nfs_pageio_descriptor pgio;  	LIST_HEAD(failed);  	/* Resend all requests through the MDS */  	nfs_pageio_init_read(&pgio, inode, compl_ops); +	pgio.pg_dreq = dreq;  	while (!list_empty(head)) {  		struct nfs_page *req = nfs_list_entry(head->next); @@ -1505,7 +1619,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))  		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,  							&hdr->pages, -							hdr->completion_ops); +							hdr->completion_ops, +							hdr->dreq);  }  /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba52da..94ba8041774 100644 --- a/fs/nfs/pnfs.h 
+++ b/fs/nfs/pnfs.h @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type {  struct pnfs_layout_hdr {  	atomic_t		plh_refcount;  	struct list_head	plh_layouts;   /* other client layouts */ -	struct list_head	plh_bulk_recall; /* clnt list of bulk recalls */ +	struct list_head	plh_bulk_destroy;  	struct list_head	plh_segs;      /* layout segments list */  	nfs4_stateid		plh_stateid;  	atomic_t		plh_outstanding; /* number of RPCs out */ @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);  void pnfs_free_lseg_list(struct list_head *tmp_list);  void pnfs_destroy_layout(struct nfs_inode *);  void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, +		struct nfs_fsid *fsid, +		bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, +		bool is_recall);  void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);  void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,  			     const nfs4_stateid *new, @@ -225,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,  void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);  int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, -			const struct nfs_pgio_completion_ops *compl_ops); +			const struct nfs_pgio_completion_ops *compl_ops, +			struct nfs_direct_req *dreq);  int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, -			const struct nfs_pgio_completion_ops *compl_ops); +			const struct nfs_pgio_completion_ops *compl_ops, +			struct nfs_direct_req *dreq);  struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);  /* nfs4_deviceid_flags */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index d35b62e83ea..6da209bd940 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -77,9 +77,8 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,  		 long hash)  {  	struct nfs4_deviceid_node *d; -	struct hlist_node *n; -	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) +	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)  		if (d->ld == ld && d->nfs_client == clp &&  		    !memcmp(&d->deviceid, id, sizeof(*id))) {  			if (atomic_read(&d->ref)) @@ -248,12 +247,11 @@ static void  _deviceid_purge_client(const struct nfs_client *clp, long hash)  {  	struct nfs4_deviceid_node *d; -	struct hlist_node *n;  	HLIST_HEAD(tmp);  	spin_lock(&nfs4_deviceid_lock);  	rcu_read_lock(); -	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) +	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)  		if (d->nfs_client == clp && atomic_read(&d->ref)) {  			hlist_del_init_rcu(&d->node);  			hlist_add_head(&d->tmpnode, &tmp); @@ -291,12 +289,11 @@ void  nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)  {  	struct nfs4_deviceid_node *d; -	struct hlist_node *n;  	int i;  	rcu_read_lock();  	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ -		hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) +		hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node)  			if (d->nfs_client == clp)  				set_bit(NFS_DEVICEID_INVALID, &d->flags);  	} diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f084dac948e..fc8de9016ac 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -662,7 +662,7 @@ nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)  static int  nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	
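/*
 * [Editorial note -- not part of the patch above.] The proc.c hunk is one
 * site in the 3.9-era tree-wide switch to the file_inode() helper. As
 * introduced, the helper was essentially the open-coded chain it replaces;
 * it was soon changed to return a value cached in struct file at open time.
 * Roughly (named _sketch here to avoid claiming the exact definition):
 */
#include <linux/fs.h>

static inline struct inode *file_inode_sketch(const struct file *f)
{
	return f->f_path.dentry->d_inode;	/* later: return f->f_inode; */
}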
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);  } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6bdb18e892..a5e5d9899d5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata)  	put_nfs_open_context(rdata->args.context);  	if (rdata->pages.pagevec != rdata->pages.page_array)  		kfree(rdata->pages.pagevec); -	if (rdata != &read_header->rpc_data) -		kfree(rdata); -	else +	if (rdata == &read_header->rpc_data) {  		rdata->header = NULL; +		rdata = NULL; +	}  	if (atomic_dec_and_test(&hdr->refcnt))  		hdr->completion_ops->completion(hdr); +	/* Note: we only free the rpc_task after callbacks are done. +	 * See the comment in rpc_free_task() for why +	 */ +	kfree(rdata);  }  EXPORT_SYMBOL_GPL(nfs_readdata_release); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aa5315bb366..17b32b72245 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -31,6 +31,7 @@  #include <linux/errno.h>  #include <linux/unistd.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/stats.h>  #include <linux/sunrpc/metrics.h>  #include <linux/sunrpc/xprtsock.h> @@ -54,7 +55,6 @@  #include <linux/parser.h>  #include <linux/nsproxy.h>  #include <linux/rcupdate.h> -#include <linux/kthread.h>  #include <asm/uaccess.h> @@ -292,7 +292,7 @@ struct file_system_type nfs_fs_type = {  	.name		= "nfs",  	.mount		= nfs_fs_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  EXPORT_SYMBOL_GPL(nfs_fs_type); @@ -301,7 +301,7 @@ struct file_system_type nfs_xdev_fs_type = {  	.name		= "nfs",  	.mount		= nfs_xdev_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  const struct super_operations nfs_sops = { @@ -331,7 +331,7 @@ struct file_system_type nfs4_fs_type = {  	.name		= "nfs4",  	.mount		= nfs_fs_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  };  EXPORT_SYMBOL_GPL(nfs4_fs_type); @@ -418,54 +418,6 @@ void nfs_sb_deactive(struct super_block *sb)  }  EXPORT_SYMBOL_GPL(nfs_sb_deactive); -static int nfs_deactivate_super_async_work(void *ptr) -{ -	struct super_block *sb = ptr; - -	deactivate_super(sb); -	module_put_and_exit(0); -	return 0; -} - -/* - * same effect as deactivate_super, but will do final unmount in kthread - * context - */ -static void nfs_deactivate_super_async(struct super_block *sb) -{ -	struct task_struct *task; -	char buf[INET6_ADDRSTRLEN + 1]; -	struct nfs_server *server = NFS_SB(sb); -	struct nfs_client *clp = server->nfs_client; - -	if (!atomic_add_unless(&sb->s_active, -1, 1)) { -		rcu_read_lock(); -		snprintf(buf, sizeof(buf), -			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); -		rcu_read_unlock(); - -		__module_get(THIS_MODULE); -		task = kthread_run(nfs_deactivate_super_async_work, sb, -				"%s-deactivate-super", buf); -		if (IS_ERR(task)) { -			pr_err("%s: kthread_run: %ld\n", -				__func__, PTR_ERR(task)); -			/* make synchronous call and hope for the best */ -			deactivate_super(sb); -			module_put(THIS_MODULE); -		} -	} -} - -void nfs_sb_deactive_async(struct super_block *sb) -{ -	struct nfs_server *server = NFS_SB(sb); - -	if (atomic_dec_and_test(&server->active)) -		nfs_deactivate_super_async(sb); -} 
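/*
 * [Editorial note -- not part of the patch above.] The nfs_readdata_release()
 * hunk reorders teardown so that a separately allocated rdata is freed only
 * after the header's completion callback has run; an rdata embedded in the
 * header is NULLed instead, turning the trailing kfree() into a harmless
 * kfree(NULL). The shape of the fix, with hypothetical demo types:
 */
#include <linux/atomic.h>
#include <linux/slab.h>

struct demo_hdr {
	atomic_t refcnt;
	void (*completion)(struct demo_hdr *hdr);
	char embedded[64];		/* stands in for the inline rpc_data */
};

static void demo_release(void *data, struct demo_hdr *hdr)
{
	if (data == (void *)hdr->embedded)
		data = NULL;		/* freed together with hdr, not here */
	if (atomic_dec_and_test(&hdr->refcnt))
		hdr->completion(hdr);	/* callback runs before the free */
	kfree(data);			/* kfree(NULL) is a no-op */
}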
-EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); -  /*   * Deliver file system statistics to userspace   */ @@ -1152,7 +1104,7 @@ static int nfs_get_option_str(substring_t args[], char **option)  {  	kfree(*option);  	*option = match_strdup(args); -	return !option; +	return !*option;  }  static int nfs_get_option_ul(substring_t args[], unsigned long *option) @@ -2375,19 +2327,30 @@ static void nfs_get_cache_cookie(struct super_block *sb,  				 struct nfs_parsed_mount_data *parsed,  				 struct nfs_clone_mount *cloned)  { +	struct nfs_server *nfss = NFS_SB(sb);  	char *uniq = NULL;  	int ulen = 0; -	if (parsed && parsed->fscache_uniq) { -		uniq = parsed->fscache_uniq; -		ulen = strlen(parsed->fscache_uniq); +	nfss->fscache_key = NULL; +	nfss->fscache = NULL; + +	if (parsed) { +		if (!(parsed->options & NFS_OPTION_FSCACHE)) +			return; +		if (parsed->fscache_uniq) { +			uniq = parsed->fscache_uniq; +			ulen = strlen(parsed->fscache_uniq); +		}  	} else if (cloned) {  		struct nfs_server *mnt_s = NFS_SB(cloned->sb); +		if (!(mnt_s->options & NFS_OPTION_FSCACHE)) +			return;  		if (mnt_s->fscache_key) {  			uniq = mnt_s->fscache_key->key.uniquifier;  			ulen = mnt_s->fscache_key->key.uniq_len;  		}; -	} +	} else +		return;  	nfs_fscache_get_super_cookie(sb, uniq, ulen);  } @@ -2578,27 +2541,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,  	struct nfs_server *server;  	struct dentry *mntroot = ERR_PTR(-ENOMEM);  	struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; -	int error; -	dprintk("--> nfs_xdev_mount_common()\n"); +	dprintk("--> nfs_xdev_mount()\n");  	mount_info.mntfh = mount_info.cloned->fh;  	/* create a new volume representation */  	server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out_err; -	} -	mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); -	dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: -	return mntroot; +	if (IS_ERR(server)) +		mntroot = ERR_CAST(server); +	else +		mntroot = nfs_fs_mount_common(server, flags, +				dev_name, &mount_info, nfs_mod); -out_err: -	dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); -	goto out; +	dprintk("<-- nfs_xdev_mount() = %ld\n", +			IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); +	return mntroot;  }  #if IS_ENABLED(CONFIG_NFS_V4) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3f79c77153b..1f1f38f0c5d 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)  	nfs_dec_sillycount(data->dir);  	nfs_free_unlinkdata(data); -	nfs_sb_deactive_async(sb); +	nfs_sb_deactive(sb);  }  static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)  	 * point dentry is definitely not a root, so we won't need  	 * that anymore.  	 
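/*
 * [Editorial note -- not part of the patch above.] The one-character fix in
 * nfs_get_option_str() deserves a gloss: 'option' is a char **, so the old
 * 'return !option' tested the address of the caller's pointer -- which is
 * never NULL -- and always reported success, silently swallowing
 * match_strdup() allocation failures. Testing the strdup result itself is
 * what was intended:
 */
#include <linux/parser.h>
#include <linux/slab.h>

static int get_option_str_fixed(substring_t args[], char **option)
{
	kfree(*option);
	*option = match_strdup(args);	/* NULL on allocation failure */
	return !*option;		/* nonzero now signals the error */
}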
*/ -	if (devname_garbage) -		kfree(devname_garbage); +	kfree(devname_garbage);  	return 0;  out_unlock:  	spin_unlock(&dentry->d_lock); @@ -336,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)  	struct inode *old_dir = data->old_dir;  	struct inode *new_dir = data->new_dir;  	struct dentry *old_dentry = data->old_dentry; -	struct dentry *new_dentry = data->new_dentry;  	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {  		rpc_restart_call_prepare(task);  		return;  	} -	if (task->tk_status != 0) { +	if (task->tk_status != 0)  		nfs_cancel_async_unlink(old_dentry); -		return; -	} - -	d_drop(old_dentry); -	d_drop(new_dentry);  }  /** @@ -550,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	error = rpc_wait_for_completion_task(task);  	if (error == 0)  		error = task->tk_status; +	switch (error) { +	case 0: +		/* The rename succeeded */ +		nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +		d_move(dentry, sdentry); +		break; +	case -ERESTARTSYS: +		/* The result of the rename is unknown. Play it safe by +		 * forcing a new lookup */ +		d_drop(dentry); +		d_drop(sdentry); +	}  	rpc_put_task(task);  out_dput:  	dput(sdentry); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5209916e122..c483cc50b82 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata)  	put_nfs_open_context(wdata->args.context);  	if (wdata->pages.pagevec != wdata->pages.page_array)  		kfree(wdata->pages.pagevec); -	if (wdata != &write_header->rpc_data) -		kfree(wdata); -	else +	if (wdata == &write_header->rpc_data) {  		wdata->header = NULL; +		wdata = NULL; +	}  	if (atomic_dec_and_test(&hdr->refcnt))  		hdr->completion_ops->completion(hdr); +	/* Note: we only free the rpc_task after callbacks are done. +	 * See the comment in rpc_free_task() for why +	 */ +	kfree(wdata);  }  EXPORT_SYMBOL_GPL(nfs_writedata_release); @@ -1794,7 +1798,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,  	if (PagePrivate(page))  		return -EBUSY; -	nfs_fscache_release_page(page, GFP_KERNEL); +	if (!nfs_fscache_release_page(page, GFP_KERNEL)) +		return -EBUSY;  	return migrate_page(mapping, newpage, page, mode);  } diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 6940439bd60..ed628f71274 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -38,8 +38,8 @@ struct nfsacl_encode_desc {  	unsigned int count;  	struct posix_acl *acl;  	int typeflag; -	uid_t uid; -	gid_t gid; +	kuid_t uid; +	kgid_t gid;  };  struct nfsacl_simple_acl { @@ -60,14 +60,16 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)  	*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);  	switch(entry->e_tag) {  		case ACL_USER_OBJ: -			*p++ = htonl(nfsacl_desc->uid); +			*p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid));  			break;  		case ACL_GROUP_OBJ: -			*p++ = htonl(nfsacl_desc->gid); +			*p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid));  			break;  		case ACL_USER: +			*p++ = htonl(from_kuid(&init_user_ns, entry->e_uid)); +			break;  		case ACL_GROUP: -			*p++ = htonl(entry->e_id); +			*p++ = htonl(from_kgid(&init_user_ns, entry->e_gid));  			break;  		default:  /* Solaris depends on that! 
*/  			*p++ = 0; @@ -148,6 +150,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)  		(struct nfsacl_decode_desc *) desc;  	__be32 *p = elem;  	struct posix_acl_entry *entry; +	unsigned int id;  	if (!nfsacl_desc->acl) {  		if (desc->array_len > NFS_ACL_MAX_ENTRIES) @@ -160,14 +163,22 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)  	entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];  	entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; -	entry->e_id = ntohl(*p++); +	id = ntohl(*p++);  	entry->e_perm = ntohl(*p++);  	switch(entry->e_tag) { -		case ACL_USER_OBJ:  		case ACL_USER: -		case ACL_GROUP_OBJ: +			entry->e_uid = make_kuid(&init_user_ns, id); +			if (!uid_valid(entry->e_uid)) +				return -EINVAL; +			break;  		case ACL_GROUP: +			entry->e_gid = make_kgid(&init_user_ns, id); +			if (!gid_valid(entry->e_gid)) +				return -EINVAL; +			break; +		case ACL_USER_OBJ: +		case ACL_GROUP_OBJ:  		case ACL_OTHER:  			if (entry->e_perm & ~S_IRWXO)  				return -EINVAL; @@ -190,9 +201,13 @@ cmp_acl_entry(const void *x, const void *y)  	if (a->e_tag != b->e_tag)  		return a->e_tag - b->e_tag; -	else if (a->e_id > b->e_id) +	else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid)) +		return 1; +	else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid)) +		return -1; +	else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid))  		return 1; -	else if (a->e_id < b->e_id) +	else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid))  		return -1;  	else  		return 0; @@ -213,22 +228,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)  	sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),  	     cmp_acl_entry, NULL); -	/* Clear undefined identifier fields and find the ACL_GROUP_OBJ -	   and ACL_MASK entries. */ +	/* Find the ACL_GROUP_OBJ and ACL_MASK entries. */  	FOREACH_ACL_ENTRY(pa, acl, pe) {  		switch(pa->e_tag) {  			case ACL_USER_OBJ: -				pa->e_id = ACL_UNDEFINED_ID;  				break;  			case ACL_GROUP_OBJ: -				pa->e_id = ACL_UNDEFINED_ID;  				group_obj = pa;  				break;  			case ACL_MASK:  				mask = pa;  				/* fall through */  			case ACL_OTHER: -				pa->e_id = ACL_UNDEFINED_ID;  				break;  		}  	} diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 8df1ea4a6ff..430b6872806 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -65,8 +65,8 @@ config NFSD_V3_ACL  	  If unsure, say N.  
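/*
 * [Editorial sketch -- not part of the patch above.] The nfsacl.c hunks are
 * part of the user-namespace conversion: the old numeric e_id field is gone,
 * ACL entries carry typed e_uid/e_gid, and kuid_t/kgid_t are opaque structs,
 * so ordering and equality must go through the uidgid helpers while raw u32
 * ids are converted only at the XDR boundary:
 */
#include <linux/errno.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

static int cmp_kuid(kuid_t a, kuid_t b)
{
	if (uid_lt(a, b))		/* no <, ==, > directly on kuid_t */
		return -1;
	return uid_gt(a, b) ? 1 : 0;
}

/* The wire format still carries plain u32 ids; translate at the edge. */
static u32 kuid_to_wire(kuid_t uid)
{
	return from_kuid(&init_user_ns, uid);
}

static int wire_to_kuid(u32 id, kuid_t *out)
{
	*out = make_kuid(&init_user_ns, id);
	/* reject ids with no mapping, as xdr_nfsace_decode() now does */
	return uid_valid(*out) ? 0 : -EINVAL;
}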
config NFSD_V4 -	bool "NFS server support for NFS version 4 (EXPERIMENTAL)" -	depends on NFSD && PROC_FS && EXPERIMENTAL +	bool "NFS server support for NFS version 4" +	depends on NFSD && PROC_FS  	select NFSD_V3  	select FS_POSIX_ACL  	select SUNRPC_GSS diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 34e5c40af5e..8b186a4955c 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -44,8 +44,6 @@  struct nfs4_acl *nfs4_acl_new(int);  int nfs4_acl_get_whotype(char *, u32);  int nfs4_acl_write_who(int who, char *p); -int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, -		                        uid_t who, u32 mask);  #define NFS4_ACL_TYPE_DEFAULT	0x01  #define NFS4_ACL_DIR		0x02 diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 34a10d78b83..06cddd57226 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -47,9 +47,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)  		if (!gi)  			goto oom;  	} else if (flags & NFSEXP_ROOTSQUASH) { -		if (!new->fsuid) +		if (uid_eq(new->fsuid, GLOBAL_ROOT_UID))  			new->fsuid = exp->ex_anon_uid; -		if (!new->fsgid) +		if (gid_eq(new->fsgid, GLOBAL_ROOT_GID))  			new->fsgid = exp->ex_anon_gid;  		gi = groups_alloc(rqgi->ngroups); @@ -58,7 +58,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)  		for (i = 0; i < rqgi->ngroups; i++) {  			if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) -				GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid); +				GROUP_AT(gi, i) = exp->ex_anon_gid;  			else  				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);  		} @@ -66,9 +66,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)  		gi = get_group_info(rqgi);  	} -	if (new->fsuid == (uid_t) -1) +	if (uid_eq(new->fsuid, INVALID_UID))  		new->fsuid = exp->ex_anon_uid; -	if (new->fsgid == (gid_t) -1) +	if (gid_eq(new->fsgid, INVALID_GID))  		new->fsgid = exp->ex_anon_gid;  	ret = set_groups(new, gi); @@ -76,7 +76,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)  	if (ret < 0)  		goto error; -	if (new->fsuid) +	if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))  		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);  	else  		new->cap_effective = cap_raise_nfsd_set(new->cap_effective, diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index 78b3c0e9382..53325a12ba6 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -1,6 +1,5 @@  /*   * nfsd-specific authentication stuff. - * uid/gid mapping not yet implemented.   *   * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>   */ @@ -8,11 +7,6 @@  #ifndef LINUX_NFSD_AUTH_H  #define LINUX_NFSD_AUTH_H -#define nfsd_luid(rq, uid)	((u32)(uid)) -#define nfsd_lgid(rq, gid)	((u32)(gid)) -#define nfsd_ruid(rq, uid)	((u32)(uid)) -#define nfsd_rgid(rq, gid)	((u32)(gid)) -  /*   * Set the current process's fsuid/fsgid etc to those of the NFS   * client user diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 93cc9d34c45..87fd1410b73 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -12,6 +12,10 @@  /*   * Representation of a reply cache entry. + * + * Note that we use a sockaddr_in6 to hold the address instead of the more + * typical sockaddr_storage. This is for space reasons, since sockaddr_storage + * is much larger than a sockaddr_in6.   
*/  struct svc_cacherep {  	struct hlist_node	c_hash; @@ -20,11 +24,13 @@ struct svc_cacherep {  	unsigned char		c_state,	/* unused, inprog, done */  				c_type,		/* status, buffer */  				c_secure : 1;	/* req came from port < 1024 */ -	struct sockaddr_in	c_addr; +	struct sockaddr_in6	c_addr;  	__be32			c_xid;  	u32			c_prot;  	u32			c_proc;  	u32			c_vers; +	unsigned int		c_len; +	__wsum			c_csum;  	unsigned long		c_timestamp;  	union {  		struct kvec	u_vec; @@ -46,8 +52,7 @@ enum {  enum {  	RC_DROPIT,  	RC_REPLY, -	RC_DOIT, -	RC_INTR +	RC_DOIT  };  /* @@ -67,6 +72,12 @@ enum {   */  #define RC_DELAY		(HZ/5) +/* Cache entries expire after this time period */ +#define RC_EXPIRE		(120 * HZ) + +/* Checksum this amount of the request */ +#define RC_CSUMLEN		(256U) +  int	nfsd_reply_cache_init(void);  void	nfsd_reply_cache_shutdown(void);  int	nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index a3946cf13fc..5f38ea36e26 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd,  	(*bpp)[-1] = '\n';  } -static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) -{ -	return sunrpc_cache_pipe_upcall(cd, h, expkey_request); -} -  static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,  					    struct svc_expkey *old);  static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); @@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = {  	.hash_size	= EXPKEY_HASHMAX,  	.name		= "nfsd.fh",  	.cache_put	= expkey_put, -	.cache_upcall	= expkey_upcall, +	.cache_request	= expkey_request,  	.cache_parse	= expkey_parse,  	.cache_show	= expkey_show,  	.match		= expkey_match, @@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref)  	path_put(&exp->ex_path);  	auth_domain_put(exp->ex_client);  	nfsd4_fslocs_free(&exp->ex_fslocs); +	kfree(exp->ex_uuid);  	kfree(exp);  } @@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd,  	(*bpp)[-1] = '\n';  } -static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) -{ -	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); -} -  static struct svc_export *svc_export_update(struct svc_export *new,  					    struct svc_export *old);  static struct svc_export *svc_export_lookup(struct svc_export *); @@ -544,13 +535,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)  		err = get_int(&mesg, &an_int);  		if (err)  			goto out3; -		exp.ex_anon_uid= an_int; +		exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); +		if (!uid_valid(exp.ex_anon_uid)) +			goto out3;  		/* anon gid */  		err = get_int(&mesg, &an_int);  		if (err)  			goto out3; -		exp.ex_anon_gid= an_int; +		exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); +		if (!gid_valid(exp.ex_anon_gid)) +			goto out3;  		/* fsid */  		err = get_int(&mesg, &an_int); @@ -613,7 +608,7 @@ out:  }  static void exp_flags(struct seq_file *m, int flag, int fsid, -		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); +		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);  static void show_secinfo(struct seq_file *m, struct svc_export *exp);  static int svc_export_show(struct seq_file *m, @@ -670,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)  	new->ex_fslocs.locations = NULL;  	new->ex_fslocs.locations_count = 0;  	new->ex_fslocs.migrated = 0; +	new->ex_uuid = NULL;  	new->cd = item->cd; 
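/*
 * [Editorial sketch -- not part of the patch above.] The new c_len and c_csum
 * fields in svc_cacherep let the duplicate reply cache match retransmitted
 * requests on more than (xid, address, procedure): a checksum over at most
 * RC_CSUMLEN (256) bytes of the request body guards against XID collisions.
 * The idea, ignoring the kvec/page walking the real cache code must do:
 */
#include <net/checksum.h>

#define DEMO_CSUMLEN 256U

static __wsum request_csum(const void *buf, unsigned int len)
{
	/* checksum only the head of the request, capped at DEMO_CSUMLEN */
	unsigned int n = len < DEMO_CSUMLEN ? len : DEMO_CSUMLEN;

	return csum_partial(buf, n, 0);
}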
 } @@ -711,7 +707,7 @@ static struct cache_detail svc_export_cache_template = {  	.hash_size	= EXPORT_HASHMAX,  	.name		= "nfsd.export",  	.cache_put	= svc_export_put, -	.cache_upcall	= svc_export_upcall, +	.cache_request	= svc_export_request,  	.cache_parse	= svc_export_parse,  	.cache_show	= svc_export_show,  	.match		= svc_export_match, @@ -1179,15 +1175,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp)  }  static void exp_flags(struct seq_file *m, int flag, int fsid, -		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) +		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)  {  	show_expflags(m, flag, NFSEXP_ALLFLAGS);  	if (flag & NFSEXP_FSID)  		seq_printf(m, ",fsid=%d", fsid); -	if (anonu != (uid_t)-2 && anonu != (0x10000-2)) -		seq_printf(m, ",anonuid=%u", anonu); -	if (anong != (gid_t)-2 && anong != (0x10000-2)) -		seq_printf(m, ",anongid=%u", anong); +	if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) && +	    !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2))) +		seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu)); +	if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) && +	    !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2))) +		seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong));  	if (fsloc && fsloc->locations_count > 0) {  		char *loctype = (fsloc->migrated) ? "refer" : "replicas";  		int i; diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index e6c38159622..d620e7f8142 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -8,61 +8,144 @@  #include <linux/fs.h>  #include <linux/debugfs.h>  #include <linux/module.h> +#include <linux/nsproxy.h> +#include <linux/sunrpc/addr.h> +#include <asm/uaccess.h>  #include "state.h" -#include "fault_inject.h" +#include "netns.h"  struct nfsd_fault_inject_op {  	char *file; -	void (*func)(u64); +	u64 (*forget)(struct nfs4_client *, u64); +	u64 (*print)(struct nfs4_client *, u64);  };  static struct nfsd_fault_inject_op inject_ops[] = {  	{  		.file   = "forget_clients", -		.func   = nfsd_forget_clients, +		.forget = nfsd_forget_client, +		.print  = nfsd_print_client,  	},  	{  		.file   = "forget_locks", -		.func   = nfsd_forget_locks, +		.forget = nfsd_forget_client_locks, +		.print  = nfsd_print_client_locks,  	},  	{  		.file   = "forget_openowners", -		.func   = nfsd_forget_openowners, +		.forget = nfsd_forget_client_openowners, +		.print  = nfsd_print_client_openowners,  	},  	{  		.file   = "forget_delegations", -		.func   = nfsd_forget_delegations, +		.forget = nfsd_forget_client_delegations, +		.print  = nfsd_print_client_delegations,  	},  	{  		.file   = "recall_delegations", -		.func   = nfsd_recall_delegations, +		.forget = nfsd_recall_client_delegations, +		.print  = nfsd_print_client_delegations,  	},  };  static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);  static struct dentry *debug_dir; -static int nfsd_inject_set(void *op_ptr, u64 val) +static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)  { -	struct nfsd_fault_inject_op *op = op_ptr; +	u64 count = 0;  	if (val == 0)  		printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);  	else  		printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); -	op->func(val); -	return 0; +	nfs4_lock_state(); +	count = nfsd_for_n_state(val, op->forget); +	nfs4_unlock_state(); +	printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);  } -static int nfsd_inject_get(void *data, u64 *val) +static void 
nfsd_inject_set_client(struct nfsd_fault_inject_op *op, +				   struct sockaddr_storage *addr, +				   size_t addr_size)  { -	*val = 0; -	return 0; +	char buf[INET6_ADDRSTRLEN]; +	struct nfs4_client *clp; +	u64 count; + +	nfs4_lock_state(); +	clp = nfsd_find_client(addr, addr_size); +	if (clp) { +		count = op->forget(clp, 0); +		rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); +		printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count); +	} +	nfs4_unlock_state(); +} + +static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val) +{ +	nfs4_lock_state(); +	*val = nfsd_for_n_state(0, op->print); +	nfs4_unlock_state();  } -DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); +static ssize_t fault_inject_read(struct file *file, char __user *buf, +				 size_t len, loff_t *ppos) +{ +	static u64 val; +	char read_buf[25]; +	size_t size, ret; +	loff_t pos = *ppos; + +	if (!pos) +		nfsd_inject_get(file_inode(file)->i_private, &val); +	size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); + +	if (pos < 0) +		return -EINVAL; +	if (pos >= size || !len) +		return 0; +	if (len > size - pos) +		len = size - pos; +	ret = copy_to_user(buf, read_buf + pos, len); +	if (ret == len) +		return -EFAULT; +	len -= ret; +	*ppos = pos + len; +	return len; +} + +static ssize_t fault_inject_write(struct file *file, const char __user *buf, +				  size_t len, loff_t *ppos) +{ +	char write_buf[INET6_ADDRSTRLEN]; +	size_t size = min(sizeof(write_buf) - 1, len); +	struct net *net = current->nsproxy->net_ns; +	struct sockaddr_storage sa; +	u64 val; + +	if (copy_from_user(write_buf, buf, size)) +		return -EFAULT; +	write_buf[size] = '\0'; + +	size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); +	if (size > 0) +		nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); +	else { +		val = simple_strtoll(write_buf, NULL, 0); +		nfsd_inject_set(file_inode(file)->i_private, val); +	} +	return len; /* on success, claim we got the whole input */ +} + +static const struct file_operations fops_nfsd = { +	.owner   = THIS_MODULE, +	.read    = fault_inject_read, +	.write   = fault_inject_write, +};  void nfsd_fault_inject_cleanup(void)  { diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h deleted file mode 100644 index 90bd0570956..00000000000 --- a/fs/nfsd/fault_inject.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> - * - * Function definitions for fault injection - */ - -#ifndef LINUX_NFSD_FAULT_INJECT_H -#define LINUX_NFSD_FAULT_INJECT_H - -#ifdef CONFIG_NFSD_FAULT_INJECTION -int nfsd_fault_inject_init(void); -void nfsd_fault_inject_cleanup(void); -void nfsd_forget_clients(u64); -void nfsd_forget_locks(u64); -void nfsd_forget_openowners(u64); -void nfsd_forget_delegations(u64); -void nfsd_recall_delegations(u64); -#else /* CONFIG_NFSD_FAULT_INJECTION */ -static inline int nfsd_fault_inject_init(void) { return 0; } -static inline void nfsd_fault_inject_cleanup(void) {} -static inline void nfsd_forget_clients(u64 num) {} -static inline void nfsd_forget_locks(u64 num) {} -static inline void nfsd_forget_openowners(u64 num) {} -static inline void nfsd_forget_delegations(u64 num) {} -static inline void nfsd_recall_delegations(u64 num) {} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ - -#endif /* LINUX_NFSD_FAULT_INJECT_H */ diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index 9d513efc01b..bf95f6b817a 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -54,9 +54,9 @@ 
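/*
 * [Editorial note -- not part of the patch above.] fault_inject_read() above
 * open-codes the usual "format into a stack buffer, copy the slice at *ppos"
 * sequence (with the quirk that it only re-queries the value when *ppos is
 * zero, so repeated partial reads see a stable number). The common shortcut
 * for this pattern is simple_read_from_buffer() from fs/libfs.c:
 */
#include <linux/fs.h>
#include <linux/kernel.h>

static ssize_t demo_read_u64(struct file *file, char __user *buf,
			     size_t len, loff_t *ppos)
{
	char tmp[25];
	u64 val = 0;	/* a real file would fetch its counter here */
	size_t size = scnprintf(tmp, sizeof(tmp), "%llu\n", val);

	return simple_read_from_buffer(buf, len, ppos, tmp, size);
}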
static inline void nfsd_idmap_shutdown(struct net *net)  }  #endif -__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); -__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); +int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);  #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 65c2431ea32..1051bebff1b 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -24,7 +24,18 @@  #include <net/net_namespace.h>  #include <net/netns/generic.h> +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS                 4 +#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1) + +#define LOCKOWNER_INO_HASH_BITS		8 +#define LOCKOWNER_INO_HASH_SIZE		(1 << LOCKOWNER_INO_HASH_BITS) + +#define SESSION_HASH_SIZE	512 +  struct cld_net; +struct nfsd4_client_tracking_ops;  struct nfsd_net {  	struct cld_net *cld_net; @@ -38,7 +49,62 @@ struct nfsd_net {  	struct lock_manager nfsd4_manager;  	bool grace_ended;  	time_t boot_time; + +	/* +	 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot +	 * used in reboot/reset lease grace period processing +	 * +	 * conf_id_hashtbl[], and conf_name_tree hold confirmed +	 * setclientid_confirmed info. +	 * +	 * unconf_str_hashtbl[] and unconf_name_tree hold unconfirmed +	 * setclientid info. +	 */ +	struct list_head *reclaim_str_hashtbl; +	int reclaim_str_hashtbl_size; +	struct list_head *conf_id_hashtbl; +	struct rb_root conf_name_tree; +	struct list_head *unconf_id_hashtbl; +	struct rb_root unconf_name_tree; +	struct list_head *ownerstr_hashtbl; +	struct list_head *lockowner_ino_hashtbl; +	struct list_head *sessionid_hashtbl; +	/* +	 * client_lru holds client queue ordered by nfs4_client.cl_time +	 * for lease renewal. +	 * +	 * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time +	 * for last close replay. +	 * +	 * All of the above fields are protected by the client_mutex.
+	 */ +	struct list_head client_lru; +	struct list_head close_lru; + +	struct delayed_work laundromat_work; + +	/* client_lock protects the client lru list and session hash table */ +	spinlock_t client_lock; + +	struct file *rec_file; +	bool in_grace; +	struct nfsd4_client_tracking_ops *client_tracking_ops; + +	time_t nfsd4_lease; +	time_t nfsd4_grace; + +	bool nfsd_net_up; + +	/* +	 * Time of server startup +	 */ +	struct timeval nfssvc_boot; + +	struct svc_serv *nfsd_serv;  }; +/* Simple check to find out if a given net was properly initialized */ +#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) +  extern int nfsd_net_id;  #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b314888825d..95d76dc6c5d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -45,6 +45,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,  		RETURN_STATUS(nfserr_inval);  	resp->mask = argp->mask; +	nfserr = fh_getattr(fh, &resp->stat); +	if (nfserr) +		goto fail; +  	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {  		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);  		if (IS_ERR(acl)) { @@ -115,6 +119,9 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,  		nfserr = nfserrno( nfsd_set_posix_acl(  			fh, ACL_TYPE_DEFAULT, argp->acl_default) );  	} +	if (!nfserr) { +		nfserr = fh_getattr(fh, &resp->stat); +	}  	/* argp->acl_{access,default} may have been allocated in  	   nfssvc_decode_setaclargs. */ @@ -129,10 +136,15 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,  static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,  		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)  { +	__be32 nfserr;  	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));  	fh_copy(&resp->fh, &argp->fh); -	return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); +	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); +	if (nfserr) +		return nfserr; +	nfserr = fh_getattr(&resp->fh, &resp->stat); +	return nfserr;  }  /* @@ -150,6 +162,9 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg  	fh_copy(&resp->fh, &argp->fh);  	resp->access = argp->access;  	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); +	if (nfserr) +		return nfserr; +	nfserr = fh_getattr(&resp->fh, &resp->stat);  	return nfserr;  } @@ -243,7 +258,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,  		return 0;  	inode = dentry->d_inode; -	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); +	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);  	*p++ = htonl(resp->mask);  	if (!xdr_ressize_check(rqstp, p))  		return 0; @@ -253,7 +268,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,  		(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,  		(resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL);  	while (w > 0) { -		if (!rqstp->rq_respages[rqstp->rq_resused++]) +		if (!*(rqstp->rq_next_page++))  			return 0;  		w -= PAGE_SIZE;  	} @@ -274,7 +289,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,  static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,  		struct nfsd_attrstat *resp)  { -	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); +	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);  	return xdr_ressize_check(rqstp, p);  } @@ -282,7 +297,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,  static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,  		struct nfsd3_accessres *resp)  { -	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); +	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);  	*p++ = htonl(resp->access);  	return xdr_ressize_check(rqstp, p);  } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a596e9d987e..9cbc1a841f8 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,  			(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,  			(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);  		while (w > 0) { -			if (!rqstp->rq_respages[rqstp->rq_resused++]) +			if (!*(rqstp->rq_next_page++))  				return 0;  			w -= PAGE_SIZE;  		} diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 97d90d1c860..40128991313 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -43,7 +43,6 @@ static __be32  nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,  					   struct nfsd3_attrstat *resp)  { -	int	err;  	__be32	nfserr;  	dprintk("nfsd: GETATTR(3)  %s\n", @@ -55,9 +54,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,  	if (nfserr)  		RETURN_STATUS(nfserr); -	err = vfs_getattr(resp->fh.fh_export->ex_path.mnt, -			  resp->fh.fh_dentry, &resp->stat); -	nfserr = nfserrno(err); +	nfserr = fh_getattr(&resp->fh, &resp->stat);  	RETURN_STATUS(nfserr);  } @@ -460,7 +457,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,  	__be32	nfserr;  	int	count = 0;  	loff_t	offset; -	int	i; +	struct page **p;  	caddr_t	page_addr = NULL;  	dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", @@ -484,8 +481,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,  				     &resp->common,  				     nfs3svc_encode_entry_plus);  	memcpy(resp->verf, argp->verf, 8); -	for (i=1; i<rqstp->rq_resused ; i++) { -		page_addr = page_address(rqstp->rq_respages[i]); +	for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { +		page_addr = page_address(*p);  		if (((caddr_t)resp->buffer >= page_addr) &&  		    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 43f46cd9ede..14d9ecb96cf 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -7,8 +7,11 @@   */  #include <linux/namei.h> +#include <linux/sunrpc/svc_xprt.h>  #include "xdr3.h"  #include "auth.h" +#include "netns.h" +#include "vfs.h"  #define NFSDDBG_FACILITY		NFSDDBG_XDR @@ -103,12 +106,14 @@ decode_sattr3(__be32 *p, struct iattr *iap)  		iap->ia_mode = ntohl(*p++);  	}  	if (*p++) { -		iap->ia_valid |= ATTR_UID; -		iap->ia_uid = ntohl(*p++); +		iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++)); +		if (uid_valid(iap->ia_uid)) +			iap->ia_valid |= ATTR_UID;  	}  	if (*p++) { -		iap->ia_valid |= ATTR_GID; -		iap->ia_gid = ntohl(*p++); +		iap->ia_gid = 
make_kgid(&init_user_ns, ntohl(*p++)); +		if (gid_valid(iap->ia_gid)) +			iap->ia_valid |= ATTR_GID;  	}  	if (*p++) {  		u64	newsize; @@ -165,8 +170,8 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,  	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);  	*p++ = htonl((u32) stat->mode);  	*p++ = htonl((u32) stat->nlink); -	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); -	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); +	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); +	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));  	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {  		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);  	} else { @@ -202,10 +207,10 @@ encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)  {  	struct dentry *dentry = fhp->fh_dentry;  	if (dentry && dentry->d_inode) { -	        int err; +	        __be32 err;  		struct kstat stat; -		err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat); +		err = fh_getattr(fhp, &stat);  		if (!err) {  			*p++ = xdr_one;		/* attributes follow */  			lease_get_mtime(dentry->d_inode, &stat.mtime); @@ -252,13 +257,12 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)   */  void fill_post_wcc(struct svc_fh *fhp)  { -	int err; +	__be32 err;  	if (fhp->fh_post_saved)  		printk("nfsd: inode locked twice during operation.\n"); -	err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, -			&fhp->fh_post_attr); +	err = fh_getattr(fhp, &fhp->fh_post_attr);  	fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;  	if (err) {  		fhp->fh_post_saved = 0; @@ -323,7 +327,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,  					struct nfsd3_readargs *args)  {  	unsigned int len; -	int v,pn; +	int v;  	u32 max_blocksize = svc_max_payload(rqstp);  	if (!(p = decode_fh(p, &args->fh))) @@ -338,8 +342,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,  	/* set up the kvec */  	v=0;  	while (len > 0) { -		pn = rqstp->rq_resused++; -		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); +		struct page *p = *(rqstp->rq_next_page++); + +		rqstp->rq_vec[v].iov_base = page_address(p);  		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? 
len : PAGE_SIZE;  		len -= rqstp->rq_vec[v].iov_len;  		v++; @@ -461,8 +466,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,  	len = ntohl(*p++);  	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)  		return 0; -	args->tname = new = -		page_address(rqstp->rq_respages[rqstp->rq_resused++]); +	args->tname = new = page_address(*(rqstp->rq_next_page++));  	args->tlen = len;  	/* first copy and check from the first page */  	old = (char*)p; @@ -533,8 +537,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,  {  	if (!(p = decode_fh(p, &args->fh)))  		return 0; -	args->buffer = -		page_address(rqstp->rq_respages[rqstp->rq_resused++]); +	args->buffer = page_address(*(rqstp->rq_next_page++));  	return xdr_argsize_check(rqstp, p);  } @@ -565,8 +568,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,  	if (args->count > PAGE_SIZE)  		args->count = PAGE_SIZE; -	args->buffer = -		page_address(rqstp->rq_respages[rqstp->rq_resused++]); +	args->buffer = page_address(*(rqstp->rq_next_page++));  	return xdr_argsize_check(rqstp, p);  } @@ -575,7 +577,7 @@ int  nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,  					struct nfsd3_readdirargs *args)  { -	int len, pn; +	int len;  	u32 max_blocksize = svc_max_payload(rqstp);  	if (!(p = decode_fh(p, &args->fh))) @@ -590,9 +592,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,  	args->count = len;  	while (len > 0) { -		pn = rqstp->rq_resused++; +		struct page *p = *(rqstp->rq_next_page++);  		if (!args->buffer) -			args->buffer = page_address(rqstp->rq_respages[pn]); +			args->buffer = page_address(p);  		len -= PAGE_SIZE;  	} @@ -720,12 +722,14 @@ int  nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,  					struct nfsd3_writeres *resp)  { +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); +  	p = encode_wcc_data(rqstp, p, &resp->fh);  	if (resp->status == 0) {  		*p++ = htonl(resp->count);  		*p++ = htonl(resp->committed); -		*p++ = htonl(nfssvc_boot.tv_sec); -		*p++ = htonl(nfssvc_boot.tv_usec); +		*p++ = htonl(nn->nfssvc_boot.tv_sec); +		*p++ = htonl(nn->nfssvc_boot.tv_usec);  	}  	return xdr_ressize_check(rqstp, p);  } @@ -876,7 +880,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,  		       					common);  	__be32		*p = cd->buffer;  	caddr_t		curr_page_addr = NULL; -	int		pn;		/* current page number */ +	struct page **	page;  	int		slen;		/* string (name) length */  	int		elen;		/* estimated entry length in words */  	int		num_entry_words = 0;	/* actual number of words */ @@ -913,8 +917,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,  	}  	/* determine which page in rq_respages[] we are currently filling */ -	for (pn=1; pn < cd->rqstp->rq_resused; pn++) { -		curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); +	for (page = cd->rqstp->rq_respages + 1; +				page < cd->rqstp->rq_next_page; page++) { +		curr_page_addr = page_address(*page);  		if (((caddr_t)cd->buffer >= curr_page_addr) &&  		    ((caddr_t)cd->buffer <  curr_page_addr + PAGE_SIZE)) @@ -929,14 +934,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,  		if (plus)  			p = encode_entryplus_baggage(cd, p, name, namlen);  		num_entry_words = p - cd->buffer; -	} else if (cd->rqstp->rq_respages[pn+1] != NULL) { +	} else if (*(page+1) != NULL) {  		/* temporarily encode entry into next page, then move back to  		 * current and next page in rq_respages[] */  		__be32 *p1, *tmp;  		int len1, len2;  		/* grab next 
page for temporary storage of entry */ -		p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); +		p1 = tmp = page_address(*(page+1));  		p1 = encode_entry_baggage(cd, p1, name, namlen, ino); @@ -1082,11 +1087,13 @@ int  nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,  					struct nfsd3_commitres *resp)  { +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); +  	p = encode_wcc_data(rqstp, p, &resp->fh);  	/* Write verifier */  	if (resp->status == 0) { -		*p++ = htonl(nfssvc_boot.tv_sec); -		*p++ = htonl(nfssvc_boot.tv_usec); +		*p++ = htonl(nn->nfssvc_boot.tv_sec); +		*p++ = htonl(nn->nfssvc_boot.tv_usec);  	}  	return xdr_ressize_check(rqstp, p);  } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 9c51aff02ae..8a50b3c1809 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -264,7 +264,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,  			ace->flag = eflag;  			ace->access_mask = deny_mask_from_posix(deny, flags);  			ace->whotype = NFS4_ACL_WHO_NAMED; -			ace->who = pa->e_id; +			ace->who_uid = pa->e_uid;  			ace++;  			acl->naces++;  		} @@ -273,7 +273,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,  		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,  						   flags);  		ace->whotype = NFS4_ACL_WHO_NAMED; -		ace->who = pa->e_id; +		ace->who_uid = pa->e_uid;  		ace++;  		acl->naces++;  		pa++; @@ -300,7 +300,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,  		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,  						   flags);  		ace->whotype = NFS4_ACL_WHO_NAMED; -		ace->who = pa->e_id; +		ace->who_gid = pa->e_gid;  		ace++;  		acl->naces++;  		pa++; @@ -329,7 +329,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,  			ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;  			ace->access_mask = deny_mask_from_posix(deny, flags);  			ace->whotype = NFS4_ACL_WHO_NAMED; -			ace->who = pa->e_id; +			ace->who_gid = pa->e_gid;  			ace++;  			acl->naces++;  		} @@ -345,6 +345,18 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,  	acl->naces++;  } +static bool +pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) +{ +	if (pace1->e_tag != pace2->e_tag) +		return pace1->e_tag > pace2->e_tag; +	if (pace1->e_tag == ACL_USER) +		return uid_gt(pace1->e_uid, pace2->e_uid); +	if (pace1->e_tag == ACL_GROUP) +		return gid_gt(pace1->e_gid, pace2->e_gid); +	return false; +} +  static void  sort_pacl_range(struct posix_acl *pacl, int start, int end) {  	int sorted = 0, i; @@ -355,8 +367,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) {  	while (!sorted) {  		sorted = 1;  		for (i = start; i < end; i++) { -			if (pacl->a_entries[i].e_id -					> pacl->a_entries[i+1].e_id) { +			if (pace_gt(&pacl->a_entries[i], +				    &pacl->a_entries[i+1])) {  				sorted = 0;  				tmp = pacl->a_entries[i];  				pacl->a_entries[i] = pacl->a_entries[i+1]; @@ -398,7 +410,10 @@ struct posix_ace_state {  };  struct posix_user_ace_state { -	uid_t uid; +	union { +		kuid_t uid; +		kgid_t gid; +	};  	struct posix_ace_state perms;  }; @@ -521,7 +536,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)  	if (error)  		goto out_err;  	low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); -	pace->e_id = ACL_UNDEFINED_ID;  	for (i=0; i < state->users->n; i++) {  		pace++; @@ -531,7 +545,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)  			goto out_err;  		
low_mode_from_nfs4(state->users->aces[i].perms.allow,  					&pace->e_perm, flags); -		pace->e_id = state->users->aces[i].uid; +		pace->e_uid = state->users->aces[i].uid;  		add_to_mask(state, &state->users->aces[i].perms);  	} @@ -541,7 +555,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)  	if (error)  		goto out_err;  	low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); -	pace->e_id = ACL_UNDEFINED_ID;  	add_to_mask(state, &state->group);  	for (i=0; i < state->groups->n; i++) { @@ -552,14 +565,13 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)  			goto out_err;  		low_mode_from_nfs4(state->groups->aces[i].perms.allow,  					&pace->e_perm, flags); -		pace->e_id = state->groups->aces[i].uid; +		pace->e_gid = state->groups->aces[i].gid;  		add_to_mask(state, &state->groups->aces[i].perms);  	}  	pace++;  	pace->e_tag = ACL_MASK;  	low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); -	pace->e_id = ACL_UNDEFINED_ID;  	pace++;  	pace->e_tag = ACL_OTHER; @@ -567,7 +579,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)  	if (error)  		goto out_err;  	low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); -	pace->e_id = ACL_UNDEFINED_ID;  	return pacl;  out_err: @@ -587,12 +598,13 @@ static inline void deny_bits(struct posix_ace_state *astate, u32 mask)  	astate->deny |= mask & ~astate->allow;  } -static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +static int find_uid(struct posix_acl_state *state, kuid_t uid)  { +	struct posix_ace_state_array *a = state->users;  	int i;  	for (i = 0; i < a->n; i++) -		if (a->aces[i].uid == uid) +		if (uid_eq(a->aces[i].uid, uid))  			return i;  	/* Not found: */  	a->n++; @@ -603,6 +615,23 @@ static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array  	return i;  } +static int find_gid(struct posix_acl_state *state, kgid_t gid) +{ +	struct posix_ace_state_array *a = state->groups; +	int i; + +	for (i = 0; i < a->n; i++) +		if (gid_eq(a->aces[i].gid, gid)) +			return i; +	/* Not found: */ +	a->n++; +	a->aces[i].gid = gid; +	a->aces[i].perms.allow = state->everyone.allow; +	a->aces[i].perms.deny  = state->everyone.deny; + +	return i; +} +  static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)  {  	int i; @@ -636,7 +665,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,  		}  		break;  	case ACL_USER: -		i = find_uid(state, state->users, ace->who); +		i = find_uid(state, ace->who_uid);  		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {  			allow_bits(&state->users->aces[i].perms, mask);  		} else { @@ -658,7 +687,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,  		}  		break;  	case ACL_GROUP: -		i = find_uid(state, state->groups, ace->who); +		i = find_gid(state, ace->who_gid);  		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {  			allow_bits(&state->groups->aces[i].perms, mask);  		} else { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index bdf29c96e4c..99bc85ff021 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -36,6 +36,7 @@  #include <linux/slab.h>  #include "nfsd.h"  #include "state.h" +#include "netns.h"  #define NFSDDBG_FACILITY                NFSDDBG_PROC @@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {  	.pipe_dir_name		= "nfsd4_cb",  }; -static int max_cb_time(void) +static int max_cb_time(struct net *net)  { -	return max(nfsd4_lease/10, (time_t)1) * HZ; +	struct nfsd_net 
*nn = net_generic(net, nfsd_net_id); +	return max(nn->nfsd4_lease/10, (time_t)1) * HZ;  } +static struct rpc_cred *callback_cred; + +int set_callback_cred(void) +{ +	if (callback_cred) +		return 0; +	callback_cred = rpc_lookup_machine_cred("nfs"); +	if (!callback_cred) +		return -ENOMEM; +	return 0; +} + +static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) +{ +	if (clp->cl_minorversion == 0) { +		return get_rpccred(callback_cred); +	} else { +		struct rpc_auth *auth = client->cl_auth; +		struct auth_cred acred = {}; + +		acred.uid = ses->se_cb_sec.uid; +		acred.gid = ses->se_cb_sec.gid; +		return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0); +	} +}  static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)  {  	struct rpc_timeout	timeparms = { -		.to_initval	= max_cb_time(), +		.to_initval	= max_cb_time(clp->net),  		.to_retries	= 0,  	};  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= clp->net,  		.address	= (struct sockaddr *) &conn->cb_addr,  		.addrsize	= conn->cb_addrlen,  		.saddress	= (struct sockaddr *) &conn->cb_saddr, @@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c  		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),  	};  	struct rpc_clnt *client; +	struct rpc_cred *cred;  	if (clp->cl_minorversion == 0) {  		if (!clp->cl_cred.cr_principal && @@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c  		args.bc_xprt = conn->cb_xprt;  		args.prognumber = clp->cl_cb_session->se_cb_prog;  		args.protocol = XPRT_TRANSPORT_BC_TCP; -		args.authflavor = RPC_AUTH_UNIX; +		args.authflavor = ses->se_cb_sec.flavor;  	}  	/* Create RPC client */  	client = rpc_create(&args); @@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c  			PTR_ERR(client));  		return PTR_ERR(client);  	} +	cred = get_backchannel_cred(clp, client, ses); +	if (IS_ERR(cred)) { +		rpc_shutdown_client(client); +		return PTR_ERR(cred); +	}  	clp->cl_cb_client = client; +	clp->cl_cb_cred = cred;  	return 0; -  }  static void warn_no_callback_path(struct nfs4_client *clp, int reason) @@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {  	.rpc_call_done = nfsd4_cb_probe_done,  }; -static struct rpc_cred *callback_cred; - -int set_callback_cred(void) -{ -	if (callback_cred) -		return 0; -	callback_cred = rpc_lookup_machine_cred("nfs"); -	if (!callback_cred) -		return -ENOMEM; -	return 0; -} -  static struct workqueue_struct *callback_wq;  static void run_nfsd4_cb(struct nfsd4_callback *cb) @@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)  	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];  	cb->cb_msg.rpc_argp = NULL;  	cb->cb_msg.rpc_resp = NULL; -	cb->cb_msg.rpc_cred = callback_cred;  	cb->cb_ops = &nfsd4_cb_probe_ops; @@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)  	if (clp->cl_cb_client) {  		rpc_shutdown_client(clp->cl_cb_client);  		clp->cl_cb_client = NULL; +		put_rpccred(clp->cl_cb_cred); +		clp->cl_cb_cred = NULL;  	}  	if (clp->cl_cb_conn.cb_xprt) {  		svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)  		run_nfsd4_cb(cb);  } -void nfsd4_do_callback_rpc(struct work_struct *w) +static void nfsd4_do_callback_rpc(struct work_struct *w)  {  	struct 
nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);  	struct nfs4_client *clp = cb->cb_clp; @@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)  		nfsd4_release_cb(cb);  		return;  	} +	cb->cb_msg.rpc_cred = clp->cl_cb_cred;  	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,  			cb->cb_ops, cb);  } +void nfsd4_init_callback(struct nfsd4_callback *cb) +{ +	INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc); +} +  void nfsd4_cb_recall(struct nfs4_delegation *dp)  {  	struct nfsd4_callback *cb = &dp->dl_recall; @@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)  	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];  	cb->cb_msg.rpc_argp = cb;  	cb->cb_msg.rpc_resp = cb; -	cb->cb_msg.rpc_cred = callback_cred;  	cb->cb_ops = &nfsd4_cb_recall_ops; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a1f10c0a625..4832fd819f8 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -65,7 +65,7 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,  struct ent {  	struct cache_head h;  	int               type;		       /* User / Group */ -	uid_t             id; +	u32               id;  	char              name[IDMAP_NAMESZ];  	char              authname[IDMAP_NAMESZ];  }; @@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,  }  static int -idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) -{ -	return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); -} - -static int  idtoname_match(struct cache_head *ca, struct cache_head *cb)  {  	struct ent *a = container_of(ca, struct ent, h); @@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = {  	.hash_size	= ENT_HASHMAX,  	.name		= "nfs4.idtoname",  	.cache_put	= ent_put, -	.cache_upcall	= idtoname_upcall, +	.cache_request	= idtoname_request,  	.cache_parse	= idtoname_parse,  	.cache_show	= idtoname_show,  	.warn_no_listener = warn_no_idmapd, @@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,  }  static int -nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) -{ -	return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); -} - -static int  nametoid_match(struct cache_head *ca, struct cache_head *cb)  {  	struct ent *a = container_of(ca, struct ent, h); @@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = {  	.hash_size	= ENT_HASHMAX,  	.name		= "nfs4.nametoid",  	.cache_put	= ent_put, -	.cache_upcall	= nametoid_upcall, +	.cache_request	= nametoid_request,  	.cache_parse	= nametoid_parse,  	.cache_show	= nametoid_show,  	.warn_no_listener = warn_no_idmapd, @@ -540,7 +528,7 @@ rqst_authname(struct svc_rqst *rqstp)  static __be32  idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, -		uid_t *id) +		u32 *id)  {  	struct ent *item, key = {  		.type = type, @@ -564,7 +552,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen  }  static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)  {  	struct ent *item, key = {  		.id = id, @@ -587,7 +575,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)  }  static bool -numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)  {  	int ret;  	char buf[11]; @@ 
-603,7 +591,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel  }  static __be32 -do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)  {  	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)  		if (numeric_name_to_id(rqstp, type, name, namelen, id)) @@ -616,7 +604,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u  }  static int -do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)  {  	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)  		return sprintf(name, "%u", id); @@ -625,26 +613,40 @@ do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)  __be32  nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, -		__u32 *id) +		kuid_t *uid)  { -	return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); +	__be32 status; +	u32 id = -1; +	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); +	*uid = make_kuid(&init_user_ns, id); +	if (!uid_valid(*uid)) +		status = nfserr_badowner; +	return status;  }  __be32  nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, -		__u32 *id) +		kgid_t *gid)  { -	return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); +	__be32 status; +	u32 id = -1; +	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); +	*gid = make_kgid(&init_user_ns, id); +	if (!gid_valid(*gid)) +		status = nfserr_badowner; +	return status;  }  int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)  { +	u32 id = from_kuid(&init_user_ns, uid);  	return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);  }  int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)  { +	u32 id = from_kgid(&init_user_ns, gid);  	return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);  } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6c9a4b291db..ae73175e6e6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -40,6 +40,7 @@  #include "xdr4.h"  #include "vfs.h"  #include "current_stateid.h" +#include "netns.h"  #define NFSDDBG_FACILITY		NFSDDBG_PROC @@ -194,6 +195,7 @@ static __be32  do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)  {  	struct svc_fh *resfh; +	int accmode;  	__be32 status;  	resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); @@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o  	/* set reply cache */  	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,  			&resfh->fh_handle); -	if (!open->op_created) -		status = do_open_permission(rqstp, resfh, open, -					    NFSD_MAY_NOP); +	accmode = NFSD_MAY_NOP; +	if (open->op_created) +		accmode |= NFSD_MAY_OWNER_OVERRIDE; +	status = do_open_permission(rqstp, resfh, open, accmode);  	set_change_info(&open->op_cinfo, current_fh);  	fh_dup2(current_fh, resfh);  out: @@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  {  	__be32 status;  	struct nfsd4_compoundres *resp; +	struct net *net = SVC_NET(rqstp); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	dprintk("NFSD: nfsd4_open filename %.*s 
op_openowner %p\n",  		(int)open->op_fname.len, open->op_fname.data, @@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	/* check seqid for replay. set nfs4_owner */  	resp = rqstp->rq_resp; -	status = nfsd4_process_open1(&resp->cstate, open); +	status = nfsd4_process_open1(&resp->cstate, open, nn);  	if (status == nfserr_replay_me) {  		struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;  		fh_put(&cstate->current_fh); @@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	/* Openowner is now set, so sequence id will get bumped.  Now we need  	 * these checks before we do any creates: */  	status = nfserr_grace; -	if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) +	if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)  		goto out;  	status = nfserr_no_grace; -	if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) +	if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)  		goto out;  	switch (open->op_claim_type) { @@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  			break;  		case NFS4_OPEN_CLAIM_PREVIOUS:  			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; -			status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion); +			status = nfs4_check_open_reclaim(&open->op_clientid, +							 cstate->minorversion, +							 nn);  			if (status)  				goto out;  		case NFS4_OPEN_CLAIM_FH: @@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  			   &access->ac_supported);  } -static void gen_boot_verifier(nfs4_verifier *verifier) +static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)  {  	__be32 verf[2]; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); -	verf[0] = (__be32)nfssvc_boot.tv_sec; -	verf[1] = (__be32)nfssvc_boot.tv_usec; +	verf[0] = (__be32)nn->nfssvc_boot.tv_sec; +	verf[1] = (__be32)nn->nfssvc_boot.tv_usec;  	memcpy(verifier->data, verf, sizeof(verifier->data));  } @@ -503,7 +511,7 @@ static __be32  nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	     struct nfsd4_commit *commit)  { -	gen_boot_verifier(&commit->co_verf); +	gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));  	return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,  			     commit->co_count);  } @@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	if (read->rd_offset >= OFFSET_MAX)  		return nfserr_inval; +	/* +	 * If we do a zero copy read, then a client will see read data +	 * that reflects the state of the file *after* performing the +	 * following compound. 
+	 * +	 * To ensure proper ordering, we therefore turn off zero copy if +	 * the client wants us to do more in this compound: +	 */ +	if (!nfsd4_last_compound_op(rqstp)) +		rqstp->rq_splice_ok = false; +  	nfs4_lock_state();  	/* check stateid */  	if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), @@ -876,6 +895,24 @@ out:  	return status;  } +static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) +{ +        int i = 1; +        int buflen = write->wr_buflen; + +        vec[0].iov_base = write->wr_head.iov_base; +        vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); +        buflen -= vec[0].iov_len; + +        while (buflen) { +                vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); +                vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); +                buflen -= vec[i].iov_len; +                i++; +        } +        return i; +} +  static __be32  nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	    struct nfsd4_write *write) @@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	struct file *filp = NULL;  	__be32 status = nfs_ok;  	unsigned long cnt; +	int nvecs;  	/* no need to check permission - this will be done in nfsd_write() */ @@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	cnt = write->wr_buflen;  	write->wr_how_written = write->wr_stable_how; -	gen_boot_verifier(&write->wr_verifier); +	gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); + +	nvecs = fill_in_write_vector(rqstp->rq_vec, write); +	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));  	status =  nfsd_write(rqstp, &cstate->current_fh, filp, -			     write->wr_offset, rqstp->rq_vec, write->wr_vlen, +			     write->wr_offset, rqstp->rq_vec, nvecs,  			     &cnt, &write->wr_how_written);  	if (filp)  		fput(filp); @@ -952,14 +993,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	if (!buf)  		return nfserr_jukebox; +	p = buf;  	status = nfsd4_encode_fattr(&cstate->current_fh,  				    cstate->current_fh.fh_export, -				    cstate->current_fh.fh_dentry, buf, -				    &count, verify->ve_bmval, +				    cstate->current_fh.fh_dentry, &p, +				    count, verify->ve_bmval,  				    rqstp, 0);  	/* this means that nfsd4_encode_fattr() ran out of space */ -	if (status == nfserr_resource && count == 0) +	if (status == nfserr_resource)  		status = nfserr_not_same;  	if (status)  		goto out_kfree; @@ -1666,6 +1708,12 @@ static struct nfsd4_operation nfsd4_ops[] = {  		.op_name = "OP_EXCHANGE_ID",  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,  	}, +	[OP_BACKCHANNEL_CTL] = { +		.op_func = (nfsd4op_func)nfsd4_backchannel_ctl, +		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, +		.op_name = "OP_BACKCHANNEL_CTL", +		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, +	},  	[OP_BIND_CONN_TO_SESSION] = {  		.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,  		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP @@ -1719,6 +1767,7 @@ static struct nfsd4_operation nfsd4_ops[] = {  		.op_func = (nfsd4op_func)nfsd4_free_stateid,  		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,  		.op_name = "OP_FREE_STATEID", +		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,  	},  }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 43295d45cc2..899ca26dd19 100644 --- a/fs/nfsd/nfs4recover.c +++ 
b/fs/nfsd/nfs4recover.c @@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {  	void (*create)(struct nfs4_client *);  	void (*remove)(struct nfs4_client *);  	int (*check)(struct nfs4_client *); -	void (*grace_done)(struct net *, time_t); +	void (*grace_done)(struct nfsd_net *, time_t);  };  /* Globals */ -static struct file *rec_file;  static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static struct nfsd4_client_tracking_ops *client_tracking_ops;  static int  nfs4_save_creds(const struct cred **original_creds) @@ -75,8 +73,8 @@ nfs4_save_creds(const struct cred **original_creds)  	if (!new)  		return -ENOMEM; -	new->fsuid = 0; -	new->fsgid = 0; +	new->fsuid = GLOBAL_ROOT_UID; +	new->fsgid = GLOBAL_ROOT_GID;  	*original_creds = override_creds(new);  	put_cred(new);  	return 0; @@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)  	*out = '\0';  } -__be32 -nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) +static int +nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)  {  	struct xdr_netobj cksum;  	struct hash_desc desc;  	struct scatterlist sg; -	__be32 status = nfserr_jukebox; +	int status;  	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",  			clname->len, clname->data);  	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;  	desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); -	if (IS_ERR(desc.tfm)) +	if (IS_ERR(desc.tfm)) { +		status = PTR_ERR(desc.tfm);  		goto out_no_tfm; +	} +  	cksum.len = crypto_hash_digestsize(desc.tfm);  	cksum.data = kmalloc(cksum.len, GFP_KERNEL); -	if (cksum.data == NULL) +	if (cksum.data == NULL) { +		status = -ENOMEM;   		goto out; +	}  	sg_init_one(&sg, clname->data, clname->len); -	if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) +	status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data); +	if (status)  		goto out;  	md5_to_hex(dname, cksum.data); -	status = nfs_ok; +	status = 0;  out:  	kfree(cksum.data);  	crypto_free_hash(desc.tfm); @@ -136,29 +140,61 @@ out_no_tfm:  	return status;  } +/* + * If we had an error generating the recdir name for the legacy tracker + * then warn the admin. If the error doesn't appear to be transient, + * then disable recovery tracking. + */ +static void +legacy_recdir_name_error(int error) +{ +	printk(KERN_ERR "NFSD: unable to generate recoverydir " +			"name (%d).\n", error); + +	/* +	 * if the algorithm just doesn't exist, then disable the recovery +	 * tracker altogether. The crypto libs will generally return this if +	 * FIPS is enabled as well. +	 */ +	if (error == -ENOENT) { +		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. 
" +			"Reboot recovery will not function correctly!\n"); + +		/* the argument is ignored by the legacy exit function */ +		nfsd4_client_tracking_exit(NULL); +	} +} +  static void  nfsd4_create_clid_dir(struct nfs4_client *clp)  {  	const struct cred *original_cred; -	char *dname = clp->cl_recdir; +	char dname[HEXDIR_LEN];  	struct dentry *dir, *dentry; +	struct nfs4_client_reclaim *crp;  	int status; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);  	if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))  		return; -	if (!rec_file) +	if (!nn->rec_file)  		return; + +	status = nfs4_make_rec_clidname(dname, &clp->cl_name); +	if (status) +		return legacy_recdir_name_error(status); +  	status = nfs4_save_creds(&original_cred);  	if (status < 0)  		return; -	status = mnt_want_write_file(rec_file); +	status = mnt_want_write_file(nn->rec_file);  	if (status)  		return; -	dir = rec_file->f_path.dentry; +	dir = nn->rec_file->f_path.dentry;  	/* lock the parent */  	mutex_lock(&dir->d_inode->i_mutex); @@ -182,18 +218,24 @@ out_put:  	dput(dentry);  out_unlock:  	mutex_unlock(&dir->d_inode->i_mutex); -	if (status == 0) -		vfs_fsync(rec_file, 0); -	else +	if (status == 0) { +		if (nn->in_grace) { +			crp = nfs4_client_to_reclaim(dname, nn); +			if (crp) +				crp->cr_clp = clp; +		} +		vfs_fsync(nn->rec_file, 0); +	} else {  		printk(KERN_ERR "NFSD: failed to write recovery record"  				" (err %d); please check that %s exists"  				" and is writeable", status,  				user_recovery_dirname); -	mnt_drop_write_file(rec_file); +	} +	mnt_drop_write_file(nn->rec_file);  	nfs4_reset_creds(original_cred);  } -typedef int (recdir_func)(struct dentry *, struct dentry *); +typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);  struct name_list {  	char name[HEXDIR_LEN]; @@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,  }  static int -nfsd4_list_rec_dir(recdir_func *f) +nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)  {  	const struct cred *original_cred; -	struct dentry *dir = rec_file->f_path.dentry; +	struct dentry *dir = nn->rec_file->f_path.dentry;  	LIST_HEAD(names);  	int status; @@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)  	if (status < 0)  		return status; -	status = vfs_llseek(rec_file, 0, SEEK_SET); +	status = vfs_llseek(nn->rec_file, 0, SEEK_SET);  	if (status < 0) {  		nfs4_reset_creds(original_cred);  		return status;  	} -	status = vfs_readdir(rec_file, nfsd4_build_namelist, &names); +	status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);  	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);  	while (!list_empty(&names)) {  		struct name_list *entry; @@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)  				status = PTR_ERR(dentry);  				break;  			} -			status = f(dir, dentry); +			status = f(dir, dentry, nn);  			dput(dentry);  		}  		list_del(&entry->list); @@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)  }  static int -nfsd4_unlink_clid_dir(char *name, int namlen) +nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)  {  	struct dentry *dir, *dentry;  	int status;  	dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); -	dir = rec_file->f_path.dentry; +	dir = nn->rec_file->f_path.dentry;  	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);  	dentry = lookup_one_len(name, dir, namlen);  	if (IS_ERR(dentry)) { @@ -289,37 +331,52 @@ static void  nfsd4_remove_clid_dir(struct nfs4_client *clp)  {  	const struct cred *original_cred; +	struct nfs4_client_reclaim *crp; +	char dname[HEXDIR_LEN];  	int status; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); -	if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) +	if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))  		return; -	status = mnt_want_write_file(rec_file); +	status = nfs4_make_rec_clidname(dname, &clp->cl_name); +	if (status) +		return legacy_recdir_name_error(status); + +	status = mnt_want_write_file(nn->rec_file);  	if (status)  		goto out;  	clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);  	status = nfs4_save_creds(&original_cred);  	if (status < 0) -		goto out; +		goto out_drop_write; -	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); +	status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);  	nfs4_reset_creds(original_cred); -	if (status == 0) -		vfs_fsync(rec_file, 0); -	mnt_drop_write_file(rec_file); +	if (status == 0) { +		vfs_fsync(nn->rec_file, 0); +		if (nn->in_grace) { +			/* remove reclaim record */ +			crp = nfsd4_find_reclaim_client(dname, nn); +			if (crp) +				nfs4_remove_reclaim_record(crp, nn); +		} +	} +out_drop_write: +	mnt_drop_write_file(nn->rec_file);  out:  	if (status)  		printk("NFSD: Failed to remove expired client state directory" -				" %.*s\n", HEXDIR_LEN, clp->cl_recdir); +				" %.*s\n", HEXDIR_LEN, dname);  }  static int -purge_old(struct dentry *parent, struct dentry *child) +purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)  {  	int status; -	if (nfs4_has_reclaimed_state(child->d_name.name, false)) +	if (nfs4_has_reclaimed_state(child->d_name.name, nn))  		return 0;  	status = vfs_rmdir(parent->d_inode, child); @@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)  }  static void -nfsd4_recdir_purge_old(struct net *net, time_t boot_time) +nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)  {  	int status; -	if (!rec_file) +	nn->in_grace = false; +	if (!nn->rec_file)  		return; -	status = mnt_want_write_file(rec_file); +	status = mnt_want_write_file(nn->rec_file);  	if (status)  		goto out; -	status = nfsd4_list_rec_dir(purge_old); +	status = nfsd4_list_rec_dir(purge_old, nn);  	if (status == 0) -		vfs_fsync(rec_file, 0); -	mnt_drop_write_file(rec_file); +		vfs_fsync(nn->rec_file, 0); +	mnt_drop_write_file(nn->rec_file);  out: +	nfs4_release_reclaim(nn);  	if (status)  		printk("nfsd4: failed to purge old clients from recovery" -			" directory %s\n", rec_file->f_path.dentry->d_name.name); +			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);  }  static int -load_recdir(struct dentry *parent, struct dentry *child) +load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)  {  	if (child->d_name.len != HEXDIR_LEN - 1) {  		printk("nfsd4: illegal name %s in recovery directory\n", @@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)  		/* Keep trying; maybe the others are OK: */  		return 0;  	} -	nfs4_client_to_reclaim(child->d_name.name); +	nfs4_client_to_reclaim(child->d_name.name, nn);  	return 0;  }  static int -nfsd4_recdir_load(void) { +nfsd4_recdir_load(struct net *net) {  	int status; +	struct nfsd_net *nn =  
net_generic(net, nfsd_net_id); -	if (!rec_file) +	if (!nn->rec_file)  		return 0; -	status = nfsd4_list_rec_dir(load_recdir); +	status = nfsd4_list_rec_dir(load_recdir, nn);  	if (status)  		printk("nfsd4: failed loading clients from recovery" -			" directory %s\n", rec_file->f_path.dentry->d_name.name); +			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);  	return status;  } @@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {   */  static int -nfsd4_init_recdir(void) +nfsd4_init_recdir(struct net *net)  { +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	const struct cred *original_cred;  	int status;  	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",  			user_recovery_dirname); -	BUG_ON(rec_file); +	BUG_ON(nn->rec_file);  	status = nfs4_save_creds(&original_cred);  	if (status < 0) { @@ -400,23 +461,65 @@ nfsd4_init_recdir(void)  		return status;  	} -	rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); -	if (IS_ERR(rec_file)) { +	nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); +	if (IS_ERR(nn->rec_file)) {  		printk("NFSD: unable to find recovery directory %s\n",  				user_recovery_dirname); -		status = PTR_ERR(rec_file); -		rec_file = NULL; +		status = PTR_ERR(nn->rec_file); +		nn->rec_file = NULL;  	}  	nfs4_reset_creds(original_cred); +	if (!status) +		nn->in_grace = true;  	return status;  } + +static int +nfs4_legacy_state_init(struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	int i; + +	nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) * +					  CLIENT_HASH_SIZE, GFP_KERNEL); +	if (!nn->reclaim_str_hashtbl) +		return -ENOMEM; + +	for (i = 0; i < CLIENT_HASH_SIZE; i++) +		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); +	nn->reclaim_str_hashtbl_size = 0; + +	return 0; +} + +static void +nfs4_legacy_state_shutdown(struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	kfree(nn->reclaim_str_hashtbl); +} +  static int  nfsd4_load_reboot_recovery_data(struct net *net)  {  	int status; +	status = nfsd4_init_recdir(net); +	if (!status) +		status = nfsd4_recdir_load(net); +	if (status) +		printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); +	return status; +} + +static int +nfsd4_legacy_tracking_init(struct net *net) +{ +	int status; +  	/* XXX: The legacy code won't work in a container */  	if (net != &init_net) {  		WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " @@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)  		return -EINVAL;  	} -	nfs4_lock_state(); -	status = nfsd4_init_recdir(); -	if (!status) -		status = nfsd4_recdir_load(); -	nfs4_unlock_state(); +	status = nfs4_legacy_state_init(net);  	if (status) -		printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); +		return status; + +	status = nfsd4_load_reboot_recovery_data(net); +	if (status) +		goto err; +	return 0; + +err: +	nfs4_legacy_state_shutdown(net);  	return status;  }  static void -nfsd4_shutdown_recdir(void) +nfsd4_shutdown_recdir(struct nfsd_net *nn)  { -	if (!rec_file) +	if (!nn->rec_file)  		return; -	fput(rec_file); -	rec_file = NULL; +	fput(nn->rec_file); +	nn->rec_file = NULL;  }  static void  nfsd4_legacy_tracking_exit(struct net *net)  { -	nfs4_release_reclaim(); -	nfsd4_shutdown_recdir(); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	nfs4_release_reclaim(nn); +	nfsd4_shutdown_recdir(nn); +	nfs4_legacy_state_shutdown(net);  }  /* @@ -480,13 +590,26 @@ nfs4_recoverydir(void)  static int  
nfsd4_check_legacy_client(struct nfs4_client *clp)  { +	int status; +	char dname[HEXDIR_LEN]; +	struct nfs4_client_reclaim *crp; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +  	/* did we already find that this client is stable? */  	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))  		return 0; +	status = nfs4_make_rec_clidname(dname, &clp->cl_name); +	if (status) { +		legacy_recdir_name_error(status); +		return status; +	} +  	/* look for it in the reclaim hashtable otherwise */ -	if (nfsd4_find_reclaim_client(clp)) { +	crp = nfsd4_find_reclaim_client(dname, nn); +	if (crp) {  		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); +		crp->cr_clp = clp;  		return 0;  	} @@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)  }  static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { -	.init		= nfsd4_load_reboot_recovery_data, +	.init		= nfsd4_legacy_tracking_init,  	.exit		= nfsd4_legacy_tracking_exit,  	.create		= nfsd4_create_clid_dir,  	.remove		= nfsd4_remove_clid_dir, @@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)  {  	int ret;  	struct cld_upcall *cup; -	/* FIXME: determine net from clp */ -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	struct cld_net *cn = nn->cld_net;  	/* Don't upcall if it's already stored */ @@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)  {  	int ret;  	struct cld_upcall *cup; -	/* FIXME: determine net from clp */ -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	struct cld_net *cn = nn->cld_net;  	/* Don't upcall if it's already removed */ @@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)  {  	int ret;  	struct cld_upcall *cup; -	/* FIXME: determine net from clp */ -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	struct cld_net *cn = nn->cld_net;  	/* Don't upcall if one was already stored during this grace pd */ @@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)  }  static void -nfsd4_cld_grace_done(struct net *net, time_t boot_time) +nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)  {  	int ret;  	struct cld_upcall *cup; -	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	struct cld_net *cn = nn->cld_net;  	cup = alloc_cld_upcall(cn); @@ -926,28 +1045,267 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {  	.grace_done	= nfsd4_cld_grace_done,  }; +/* upcall via usermodehelper */ +static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; +module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), +			S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); + +static bool cltrack_legacy_disable; +module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_legacy_disable, +		"Disable legacy recoverydir conversion. 
Default: false"); + +#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" +#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" + +static char * +nfsd4_cltrack_legacy_topdir(void) +{ +	int copied; +	size_t len; +	char *result; + +	if (cltrack_legacy_disable) +		return NULL; + +	len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + +		strlen(nfs4_recoverydir()) + 1; + +	result = kmalloc(len, GFP_KERNEL); +	if (!result) +		return result; + +	copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", +				nfs4_recoverydir()); +	if (copied >= len) { +		/* just return nothing if output was truncated */ +		kfree(result); +		return NULL; +	} + +	return result; +} + +static char * +nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) +{ +	int copied; +	size_t len; +	char *result; + +	if (cltrack_legacy_disable) +		return NULL; + +	/* +1 is for '/' between "topdir" and "recdir" */ +	len = strlen(LEGACY_RECDIR_ENV_PREFIX) + +		strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; + +	result = kmalloc(len, GFP_KERNEL); +	if (!result) +		return result; + +	copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", +				nfs4_recoverydir()); +	if (copied > (len - HEXDIR_LEN)) { +		/* just return nothing if output will be truncated */ +		kfree(result); +		return NULL; +	} + +	copied = nfs4_make_rec_clidname(result + copied, name); +	if (copied) { +		kfree(result); +		return NULL; +	} + +	return result; +} + +static int +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) +{ +	char *envp[2]; +	char *argv[4]; +	int ret; + +	if (unlikely(!cltrack_prog[0])) { +		dprintk("%s: cltrack_prog is disabled\n", __func__); +		return -EACCES; +	} + +	dprintk("%s: cmd: %s\n", __func__, cmd); +	dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); +	dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); + +	envp[0] = legacy; +	envp[1] = NULL; + +	argv[0] = (char *)cltrack_prog; +	argv[1] = cmd; +	argv[2] = arg; +	argv[3] = NULL; + +	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); +	/* +	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES +	 * error. The admin can re-enable it on the fly by using sysfs +	 * once the problem has been fixed. +	 */ +	if (ret == -ENOENT || ret == -EACCES) { +		dprintk("NFSD: %s was not found or isn't executable (%d). " +			"Setting cltrack_prog to blank string!", +			cltrack_prog, ret); +		cltrack_prog[0] = '\0'; +	} +	dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); + +	return ret; +} + +static char * +bin_to_hex_dup(const unsigned char *src, int srclen) +{ +	int i; +	char *buf, *hex; + +	/* +1 for terminating NULL */ +	buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); +	if (!buf) +		return buf; + +	hex = buf; +	for (i = 0; i < srclen; i++) { +		sprintf(hex, "%2.2x", *src++); +		hex += 2; +	} +	return buf; +} + +static int +nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) +{ +	/* XXX: The usermode helper s not working in container yet. 
+static int +nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) +{ +	/* XXX: The usermode helper is not working in a container yet. */ +	if (net != &init_net) { +		WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " +			"tracking in a container!\n"); +		return -EINVAL; +	} +	return nfsd4_umh_cltrack_upcall("init", NULL, NULL); +} + +static void +nfsd4_umh_cltrack_create(struct nfs4_client *clp) +{ +	char *hexid; + +	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); +	if (!hexid) { +		dprintk("%s: can't allocate memory for upcall!\n", __func__); +		return; +	} +	nfsd4_umh_cltrack_upcall("create", hexid, NULL); +	kfree(hexid); +} + +static void +nfsd4_umh_cltrack_remove(struct nfs4_client *clp) +{ +	char *hexid; + +	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); +	if (!hexid) { +		dprintk("%s: can't allocate memory for upcall!\n", __func__); +		return; +	} +	nfsd4_umh_cltrack_upcall("remove", hexid, NULL); +	kfree(hexid); +} + +static int +nfsd4_umh_cltrack_check(struct nfs4_client *clp) +{ +	int ret; +	char *hexid, *legacy; + +	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); +	if (!hexid) { +		dprintk("%s: can't allocate memory for upcall!\n", __func__); +		return -ENOMEM; +	} +	legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); +	ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); +	kfree(legacy); +	kfree(hexid); +	return ret; +} + +static void +nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, +				time_t boot_time) +{ +	char *legacy; +	char timestr[22]; /* FIXME: better way to determine max size? */ + +	sprintf(timestr, "%ld", boot_time); +	legacy = nfsd4_cltrack_legacy_topdir(); +	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); +	kfree(legacy); +} + +static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { +	.init		= nfsd4_umh_cltrack_init, +	.exit		= NULL, +	.create		= nfsd4_umh_cltrack_create, +	.remove		= nfsd4_umh_cltrack_remove, +	.check		= nfsd4_umh_cltrack_check, +	.grace_done	= nfsd4_umh_cltrack_grace_done, +}; +  int  nfsd4_client_tracking_init(struct net *net)  {  	int status;  	struct path path; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); -	if (!client_tracking_ops) { -		client_tracking_ops = &nfsd4_cld_tracking_ops; -		status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); -		if (!status) { -			if (S_ISDIR(path.dentry->d_inode->i_mode)) -				client_tracking_ops = -						&nfsd4_legacy_tracking_ops; -			path_put(&path); -		} +	/* just run the init if the method is already decided */ +	if (nn->client_tracking_ops) +		goto do_init; + +	/* +	 * First, try a UMH upcall. It should succeed or fail quickly, so +	 * there's little harm in trying that first. +	 */ +	nn->client_tracking_ops = &nfsd4_umh_tracking_ops; +	status = nn->client_tracking_ops->init(net); +	if (!status) +		return status; + +	/* +	 * See if the recoverydir exists and is a directory. If it is, +	 * then use the legacy ops. +	 */ +	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; +	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); +	if (!status) { +		status = S_ISDIR(path.dentry->d_inode->i_mode); +		path_put(&path); +		if (status) +			goto do_init;  	} -	status = client_tracking_ops->init(net); +	/* Finally, try to use nfsdcld */ +	nn->client_tracking_ops = &nfsd4_cld_tracking_ops; +	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be " +			"removed in 3.10. Please transition to using " +			"nfsdcltrack.\n"); +do_init: +	status = nn->client_tracking_ops->init(net);  	if (status) {  		printk(KERN_WARNING "NFSD: Unable to initialize client "  				    "recovery tracking! 
(%d)\n", status); -		client_tracking_ops = NULL; +		nn->client_tracking_ops = NULL;  	}  	return status;  } @@ -955,40 +1313,49 @@ nfsd4_client_tracking_init(struct net *net)  void  nfsd4_client_tracking_exit(struct net *net)  { -	if (client_tracking_ops) { -		client_tracking_ops->exit(net); -		client_tracking_ops = NULL; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	if (nn->client_tracking_ops) { +		if (nn->client_tracking_ops->exit) +			nn->client_tracking_ops->exit(net); +		nn->client_tracking_ops = NULL;  	}  }  void  nfsd4_client_record_create(struct nfs4_client *clp)  { -	if (client_tracking_ops) -		client_tracking_ops->create(clp); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + +	if (nn->client_tracking_ops) +		nn->client_tracking_ops->create(clp);  }  void  nfsd4_client_record_remove(struct nfs4_client *clp)  { -	if (client_tracking_ops) -		client_tracking_ops->remove(clp); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + +	if (nn->client_tracking_ops) +		nn->client_tracking_ops->remove(clp);  }  int  nfsd4_client_record_check(struct nfs4_client *clp)  { -	if (client_tracking_ops) -		return client_tracking_ops->check(clp); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + +	if (nn->client_tracking_ops) +		return nn->client_tracking_ops->check(clp);  	return -EOPNOTSUPP;  }  void -nfsd4_record_grace_done(struct net *net, time_t boot_time) +nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)  { -	if (client_tracking_ops) -		client_tracking_ops->grace_done(net, boot_time); +	if (nn->client_tracking_ops) +		nn->client_tracking_ops->grace_done(nn, boot_time);  }  static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d0237f872cc..16d39c6c4fb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -40,20 +40,15 @@  #include <linux/pagemap.h>  #include <linux/ratelimit.h>  #include <linux/sunrpc/svcauth_gss.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include "xdr4.h"  #include "vfs.h"  #include "current_stateid.h" -#include "fault_inject.h"  #include "netns.h"  #define NFSDDBG_FACILITY                NFSDDBG_PROC -/* Globals */ -time_t nfsd4_lease = 90;     /* default lease time */ -time_t nfsd4_grace = 90; -  #define all_ones {{~0,~0},~0}  static const stateid_t one_stateid = {  	.si_generation = ~0, @@ -156,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)  }  static int num_delegations; -unsigned int max_delegations; +unsigned long max_delegations;  /*   * Open owner state (share locks) @@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)  	return ret & OWNER_HASH_MASK;  } -static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE]; -  /* hash table for nfs4_file */  #define FILE_HASH_BITS                   8  #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS) @@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];  static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)  { -	BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); +	WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));  	atomic_inc(&fp->fi_access[oflag]);  } @@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)  	 * preallocations that can exist at a time, but the state lock  	 * prevents anyone from using ours before we get here:  	 */ -	BUG_ON(error); +	WARN_ON_ONCE(error);  	/*  	 * It shouldn't be a problem to reuse an opaque stateid value.  	 * I don't think it is for 4.1.  
But with 4.0 I worry that, for @@ -268,33 +261,46 @@ static inline int get_new_stid(struct nfs4_stid *stid)  	return new_stid;  } -static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct +kmem_cache *slab)  { -	stateid_t *s = &stid->sc_stateid; +	struct idr *stateids = &cl->cl_stateids; +	static int min_stateid = 0; +	struct nfs4_stid *stid;  	int new_id; -	stid->sc_type = type; +	stid = kmem_cache_alloc(slab, GFP_KERNEL); +	if (!stid) +		return NULL; + +	if (!idr_pre_get(stateids, GFP_KERNEL)) +		goto out_free; +	if (idr_get_new_above(stateids, stid, min_stateid, &new_id)) +		goto out_free;  	stid->sc_client = cl; -	s->si_opaque.so_clid = cl->cl_clientid; -	new_id = get_new_stid(stid); -	s->si_opaque.so_id = (u32)new_id; +	stid->sc_type = 0; +	stid->sc_stateid.si_opaque.so_id = new_id; +	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;  	/* Will be incremented before return to client: */ -	s->si_generation = 0; -} +	stid->sc_stateid.si_generation = 0; -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) -{ -	struct idr *stateids = &cl->cl_stateids; - -	if (!idr_pre_get(stateids, GFP_KERNEL)) -		return NULL;  	/* -	 * Note: if we fail here (or any time between now and the time -	 * we actually get the new idr), we won't need to undo the idr -	 * preallocation, since the idr code caps the number of -	 * preallocated entries. +	 * It shouldn't be a problem to reuse an opaque stateid value. +	 * I don't think it is for 4.1.  But with 4.0 I worry that, for +	 * example, a stray write retransmission could be accepted by +	 * the server when it should have been rejected.  Therefore, +	 * adopt a trick from the sctp code to attempt to maximize the +	 * amount of time until an id is reused, by ensuring they always +	 * "increase" (mod INT_MAX):  	 */ -	return kmem_cache_alloc(slab, GFP_KERNEL); + +	min_stateid = new_id+1; +	if (min_stateid == INT_MAX) +		min_stateid = 0; +	return stid; +out_free: +	kfree(stid); +	return NULL;  }  static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) @@ -323,7 +329,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv  	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));  	if (dp == NULL)  		return dp; -	init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); +	dp->dl_stid.sc_type = NFS4_DELEG_STID;  	/*  	 * delegation seqid's are never incremented.  
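The allocation pattern in nfs4_alloc_stid() above is worth isolating. A condensed sketch, written against the pre-3.9 idr API this tree uses (alloc_monotonic_id is an invented name; error handling is simplified relative to the patch):

	#include <linux/gfp.h>
	#include <linux/idr.h>

	/* Allocate an id strictly above *cursor, wrapping before INT_MAX, so a
	 * just-freed stateid is not immediately handed back out. */
	static int alloc_monotonic_id(struct idr *ids, void *ptr, int *cursor)
	{
		int new_id;

		if (!idr_pre_get(ids, GFP_KERNEL))	/* preallocate a node */
			return -ENOMEM;
		if (idr_get_new_above(ids, ptr, *cursor, &new_id))
			return -ENOMEM;

		*cursor = new_id + 1;			/* ids "increase" mod INT_MAX */
		if (*cursor == INT_MAX)
			*cursor = 0;
		return new_id;
	}

This is the sctp-inspired trick the comment describes: maximizing the time before an id is reused narrows the window in which a stray NFSv4.0 retransmission could be matched against a recycled stateid.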
The 4.1 special  	 * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -340,17 +346,25 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv  	fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle);  	dp->dl_time = 0;  	atomic_set(&dp->dl_count, 1); -	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); +	nfsd4_init_callback(&dp->dl_recall);  	return dp;  } +static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab) +{ +	struct idr *stateids = &s->sc_client->cl_stateids; + +	idr_remove(stateids, s->sc_stateid.si_opaque.so_id); +	kmem_cache_free(slab, s); +} +  void  nfs4_put_delegation(struct nfs4_delegation *dp)  {  	if (atomic_dec_and_test(&dp->dl_count)) {  		dprintk("NFSD: freeing dp %p\n",dp);  		put_nfs4_file(dp->dl_file); -		kmem_cache_free(deleg_slab, dp); +		free_stid(&dp->dl_stid, deleg_slab);  		num_delegations--;  	}  } @@ -367,9 +381,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)  static void unhash_stid(struct nfs4_stid *s)  { -	struct idr *stateids = &s->sc_client->cl_stateids; - -	idr_remove(stateids, s->sc_stateid.si_opaque.so_id); +	s->sc_type = 0;  }  /* Called under the state lock. */ @@ -390,14 +402,6 @@ unhash_delegation(struct nfs4_delegation *dp)   * SETCLIENTID state    */ -/* client_lock protects the client lru list and session hash table */ -static DEFINE_SPINLOCK(client_lock); - -/* Hash tables for nfs4_clientid state */ -#define CLIENT_HASH_BITS                 4 -#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS) -#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1) -  static unsigned int clientid_hashval(u32 id)  {  	return id & CLIENT_HASH_MASK; @@ -409,31 +413,6 @@ static unsigned int clientstr_hashval(const char *name)  }  /* - * reclaim_str_hashtbl[] holds known client info from previous reset/reboot - * used in reboot/reset lease grace period processing - * - * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed - * setclientid_confirmed info.  - * - * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed  - * setclientid info. - * - * client_lru holds client queue ordered by nfs4_client.cl_time - * for lease renewal. - * - * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time - * for last close replay. 
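These file-global tables, LRU lists, and the client_lock being deleted here are not gone: the rest of this diff reaches their successors through net_generic(net, nfsd_net_id) as nn-> fields. A hedged, abbreviated reconstruction of the corresponding slice of struct nfsd_net follows; the authoritative definition lives in fs/nfsd/netns.h (outside this excerpt), and the member types below are inferred from the nn-> accesses visible in this diff:

	#include <linux/list.h>
	#include <linux/rbtree.h>
	#include <linux/spinlock.h>
	#include <linux/time.h>
	#include <linux/types.h>

	/* abbreviated: the in-tree struct carries additional members */
	struct nfsd_net {
		struct cld_net *cld_net;			/* nfsdcld upcall state */
		struct list_head *conf_id_hashtbl;		/* was conf_id_hashtbl[] */
		struct rb_root conf_name_tree;			/* replaces conf_str_hashtbl[] */
		struct list_head *unconf_id_hashtbl;		/* was unconf_id_hashtbl[] */
		struct rb_root unconf_name_tree;		/* replaces unconf_str_hashtbl[] */
		struct list_head *reclaim_str_hashtbl;		/* kmalloc'd in nfs4_legacy_state_init() */
		int reclaim_str_hashtbl_size;
		struct list_head *sessionid_hashtbl;		/* was sessionid_hashtbl[] */
		struct list_head client_lru;
		struct list_head close_lru;
		spinlock_t client_lock;				/* was the file-global client_lock */
		struct file *rec_file;				/* was the global rec_file */
		bool in_grace;
		struct nfsd4_client_tracking_ops *client_tracking_ops;
		time_t boot_time;
		time_t nfsd4_lease;				/* was the global nfsd4_lease */
		struct timeval nfssvc_boot;
		struct svc_serv *nfsd_serv;
	};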
- */ -static struct list_head	reclaim_str_hashtbl[CLIENT_HASH_SIZE]; -static int reclaim_str_hashtbl_size = 0; -static struct list_head	conf_id_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head	conf_str_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head	unconf_str_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head	unconf_id_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head client_lru; -static struct list_head close_lru; - -/*   * We store the NONE, READ, WRITE, and BOTH bits separately in the   * st_{access,deny}_bmap field of the stateid, in order to track not   * only what share bits are currently in force, but also what @@ -526,7 +505,8 @@ static int nfs4_access_to_omode(u32 access)  	case NFS4_SHARE_ACCESS_BOTH:  		return O_RDWR;  	} -	BUG(); +	WARN_ON_ONCE(1); +	return O_RDONLY;  }  /* release all access and file references for a given stateid */ @@ -558,7 +538,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)  static void free_generic_stateid(struct nfs4_ol_stateid *stp)  { -	kmem_cache_free(stateid_slab, stp); +	free_stid(&stp->st_stid, stateid_slab);  }  static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -652,9 +632,6 @@ static void release_openowner(struct nfs4_openowner *oo)  	nfs4_free_openowner(oo);  } -#define SESSION_HASH_SIZE	512 -static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; -  static inline int  hash_sessionid(struct nfs4_sessionid *sessionid)  { @@ -742,8 +719,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)  	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);  	spin_lock(&nfsd_drc_lock); -	avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, -			nfsd_drc_max_mem - nfsd_drc_mem_used); +	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, +		    nfsd_drc_max_mem - nfsd_drc_mem_used);  	num = min_t(int, num, avail / slotsize);  	nfsd_drc_mem_used += num * slotsize;  	spin_unlock(&nfsd_drc_lock); @@ -785,9 +762,12 @@ out_free:  	return NULL;  } -static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) +static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, +				   struct nfsd4_channel_attrs *req, +				   int numslots, int slotsize, +				   struct nfsd_net *nn)  { -	u32 maxrpc = nfsd_serv->sv_max_mesg; +	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;  	new->maxreqs = numslots;  	new->maxresp_cached = min_t(u32, req->maxresp_cached, @@ -906,21 +886,27 @@ static void __free_session(struct nfsd4_session *ses)  static void free_session(struct kref *kref)  {  	struct nfsd4_session *ses; +	struct nfsd_net *nn; -	lockdep_assert_held(&client_lock);  	ses = container_of(kref, struct nfsd4_session, se_ref); +	nn = net_generic(ses->se_client->net, nfsd_net_id); + +	lockdep_assert_held(&nn->client_lock);  	nfsd4_del_conns(ses);  	__free_session(ses);  }  void nfsd4_put_session(struct nfsd4_session *ses)  { -	spin_lock(&client_lock); +	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); + +	spin_lock(&nn->client_lock);  	nfsd4_put_session_locked(ses); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  } -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, +					   struct nfsd_net *nn)  {  	struct nfsd4_session *new;  	int numslots, slotsize; @@ -938,16 +924,17 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)  	new = __alloc_session(slotsize, numslots);  	if (!new) { -		
nfsd4_put_drc_mem(slotsize, fchan->maxreqs); +		nfsd4_put_drc_mem(slotsize, numslots);  		return NULL;  	} -	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); +	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);  	return new;  } -static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) +static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)  {  	int idx; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	new->se_client = clp;  	gen_sessionid(new); @@ -957,14 +944,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s  	new->se_cb_seq_nr = 1;  	new->se_flags = cses->flags;  	new->se_cb_prog = cses->callback_prog; +	new->se_cb_sec = cses->cb_sec;  	kref_init(&new->se_ref);  	idx = hash_sessionid(&new->se_sessionid); -	spin_lock(&client_lock); -	list_add(&new->se_hash, &sessionid_hashtbl[idx]); +	spin_lock(&nn->client_lock); +	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);  	spin_lock(&clp->cl_lock);  	list_add(&new->se_perclnt, &clp->cl_sessions);  	spin_unlock(&clp->cl_lock); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  	if (cses->flags & SESSION4_BACK_CHAN) {  		struct sockaddr *sa = svc_addr(rqstp); @@ -978,20 +966,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s  		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);  		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);  	} -	return new;  }  /* caller must hold client_lock */  static struct nfsd4_session * -find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)  {  	struct nfsd4_session *elem;  	int idx; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	dump_sessionid(__func__, sessionid);  	idx = hash_sessionid(sessionid);  	/* Search in the appropriate list */ -	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { +	list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {  		if (!memcmp(elem->se_sessionid.data, sessionid->data,  			    NFS4_MAX_SESSIONID_LEN)) {  			return elem; @@ -1016,6 +1004,8 @@ unhash_session(struct nfsd4_session *ses)  static inline void  renew_client_locked(struct nfs4_client *clp)  { +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +  	if (is_client_expired(clp)) {  		WARN_ON(1);  		printk("%s: client (clientid %08x/%08x) already expired\n", @@ -1028,16 +1018,18 @@ renew_client_locked(struct nfs4_client *clp)  	dprintk("renewing client (clientid %08x/%08x)\n",   			clp->cl_clientid.cl_boot,   			clp->cl_clientid.cl_id); -	list_move_tail(&clp->cl_lru, &client_lru); +	list_move_tail(&clp->cl_lru, &nn->client_lru);  	clp->cl_time = get_seconds();  }  static inline void  renew_client(struct nfs4_client *clp)  { -	spin_lock(&client_lock); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + +	spin_lock(&nn->client_lock);  	renew_client_locked(clp); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  }  /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ @@ -1075,7 +1067,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)  static inline void  free_client(struct nfs4_client *clp)  { -	lockdep_assert_held(&client_lock); +	struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); + +	
lockdep_assert_held(&nn->client_lock);  	while (!list_empty(&clp->cl_sessions)) {  		struct nfsd4_session *ses;  		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, @@ -1085,6 +1079,7 @@ free_client(struct nfs4_client *clp)  	}  	free_svc_cred(&clp->cl_cred);  	kfree(clp->cl_name.data); +	idr_destroy(&clp->cl_stateids);  	kfree(clp);  } @@ -1092,15 +1087,16 @@ void  release_session_client(struct nfsd4_session *session)  {  	struct nfs4_client *clp = session->se_client; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); -	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) +	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))  		return;  	if (is_client_expired(clp)) {  		free_client(clp);  		session->se_client = NULL;  	} else  		renew_client_locked(clp); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  }  /* must be called under the client_lock */ @@ -1123,6 +1119,7 @@ destroy_client(struct nfs4_client *clp)  	struct nfs4_openowner *oo;  	struct nfs4_delegation *dp;  	struct list_head reaplist; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	INIT_LIST_HEAD(&reaplist);  	spin_lock(&recall_lock); @@ -1144,12 +1141,15 @@ destroy_client(struct nfs4_client *clp)  	if (clp->cl_cb_conn.cb_xprt)  		svc_xprt_put(clp->cl_cb_conn.cb_xprt);  	list_del(&clp->cl_idhash); -	list_del(&clp->cl_strhash); -	spin_lock(&client_lock); +	if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) +		rb_erase(&clp->cl_namenode, &nn->conf_name_tree); +	else +		rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); +	spin_lock(&nn->client_lock);  	unhash_client_locked(clp);  	if (atomic_read(&clp->cl_refcount) == 0)  		free_client(clp); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  }  static void expire_client(struct nfs4_client *clp) @@ -1187,6 +1187,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)  	return 0;  } +static long long +compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) +{ +	long long res; + +	res = o1->len - o2->len; +	if (res) +		return res; +	return (long long)memcmp(o1->data, o2->data, o1->len); +} +  static int same_name(const char *n1, const char *n2)  {  	return 0 == memcmp(n1, n2, HEXDIR_LEN); @@ -1211,7 +1222,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)  	if (g1->ngroups != g2->ngroups)  		return false;  	for (i=0; i<g1->ngroups; i++) -		if (GROUP_AT(g1, i) != GROUP_AT(g2, i)) +		if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))  			return false;  	return true;  } @@ -1236,8 +1247,8 @@ static bool  same_creds(struct svc_cred *cr1, struct svc_cred *cr2)  {  	if ((is_gss_cred(cr1) != is_gss_cred(cr2)) -		|| (cr1->cr_uid != cr2->cr_uid) -		|| (cr1->cr_gid != cr2->cr_gid) +		|| (!uid_eq(cr1->cr_uid, cr2->cr_uid)) +		|| (!gid_eq(cr1->cr_gid, cr2->cr_gid))  		|| !groups_equal(cr1->cr_group_info, cr2->cr_group_info))  		return false;  	if (cr1->cr_principal == cr2->cr_principal) @@ -1247,10 +1258,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)  	return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);  } -static void gen_clid(struct nfs4_client *clp) +static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)  {  	static u32 current_clientid = 1; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);  	clp->cl_clientid.cl_boot = nn->boot_time;  	clp->cl_clientid.cl_id = current_clientid++;  @@ -1268,7 +1278,12 @@ static void gen_confirm(struct nfs4_client *clp)  static struct nfs4_stid *find_stateid(struct nfs4_client *cl, 
stateid_t *t)  { -	return idr_find(&cl->cl_stateids, t->si_opaque.so_id); +	struct nfs4_stid *ret; + +	ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); +	if (!ret || !ret->sc_type) +		return NULL; +	return ret;  }  static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) @@ -1283,12 +1298,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t  	return NULL;  } -static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, +static struct nfs4_client *create_client(struct xdr_netobj name,  		struct svc_rqst *rqstp, nfs4_verifier *verf)  {  	struct nfs4_client *clp;  	struct sockaddr *sa = svc_addr(rqstp);  	int ret; +	struct net *net = SVC_NET(rqstp); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	clp = alloc_client(name);  	if (clp == NULL) @@ -1297,23 +1314,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,  	INIT_LIST_HEAD(&clp->cl_sessions);  	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);  	if (ret) { -		spin_lock(&client_lock); +		spin_lock(&nn->client_lock);  		free_client(clp); -		spin_unlock(&client_lock); +		spin_unlock(&nn->client_lock);  		return NULL;  	}  	idr_init(&clp->cl_stateids); -	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);  	atomic_set(&clp->cl_refcount, 0);  	clp->cl_cb_state = NFSD4_CB_UNKNOWN;  	INIT_LIST_HEAD(&clp->cl_idhash); -	INIT_LIST_HEAD(&clp->cl_strhash);  	INIT_LIST_HEAD(&clp->cl_openowners);  	INIT_LIST_HEAD(&clp->cl_delegations);  	INIT_LIST_HEAD(&clp->cl_lru);  	INIT_LIST_HEAD(&clp->cl_callbacks);  	spin_lock_init(&clp->cl_lock); -	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); +	nfsd4_init_callback(&clp->cl_cb_null);  	clp->cl_time = get_seconds();  	clear_bit(0, &clp->cl_cb_slot_busy);  	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); @@ -1321,17 +1336,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,  	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);  	gen_confirm(clp);  	clp->cl_cb_session = NULL; +	clp->net = net;  	return clp;  }  static void -add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) +add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) +{ +	struct rb_node **new = &(root->rb_node), *parent = NULL; +	struct nfs4_client *clp; + +	while (*new) { +		clp = rb_entry(*new, struct nfs4_client, cl_namenode); +		parent = *new; + +		if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) +			new = &((*new)->rb_left); +		else +			new = &((*new)->rb_right); +	} + +	rb_link_node(&new_clp->cl_namenode, parent, new); +	rb_insert_color(&new_clp->cl_namenode, root); +} + +static struct nfs4_client * +find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) +{ +	long long cmp; +	struct rb_node *node = root->rb_node; +	struct nfs4_client *clp; + +	while (node) { +		clp = rb_entry(node, struct nfs4_client, cl_namenode); +		cmp = compare_blob(&clp->cl_name, name); +		if (cmp > 0) +			node = node->rb_left; +		else if (cmp < 0) +			node = node->rb_right; +		else +			return clp; +	} +	return NULL; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp)  {  	unsigned int idhashval; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); -	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); +	clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); +	add_clp_to_name_tree(clp, &nn->unconf_name_tree);  	idhashval = clientid_hashval(clp->cl_clientid.cl_id); -	list_add(&clp->cl_idhash, 
&unconf_id_hashtbl[idhashval]); +	list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);  	renew_client(clp);  } @@ -1339,22 +1397,23 @@ static void  move_to_confirmed(struct nfs4_client *clp)  {  	unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); -	unsigned int strhashval; +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);  	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); -	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); -	strhashval = clientstr_hashval(clp->cl_recdir); -	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); +	list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); +	rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); +	add_clp_to_name_tree(clp, &nn->conf_name_tree); +	set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);  	renew_client(clp);  }  static struct nfs4_client * -find_confirmed_client(clientid_t *clid, bool sessions) +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)  {  	struct nfs4_client *clp;  	unsigned int idhashval = clientid_hashval(clid->cl_id); -	list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { +	list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {  		if (same_clid(&clp->cl_clientid, clid)) {  			if ((bool)clp->cl_minorversion != sessions)  				return NULL; @@ -1366,12 +1425,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)  }  static struct nfs4_client * -find_unconfirmed_client(clientid_t *clid, bool sessions) +find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)  {  	struct nfs4_client *clp;  	unsigned int idhashval = clientid_hashval(clid->cl_id); -	list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { +	list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {  		if (same_clid(&clp->cl_clientid, clid)) {  			if ((bool)clp->cl_minorversion != sessions)  				return NULL; @@ -1387,27 +1446,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)  }   static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval) +find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)  { -	struct nfs4_client *clp; - -	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { -		if (same_name(clp->cl_recdir, dname)) -			return clp; -	} -	return NULL; +	return find_clp_in_name_tree(name, &nn->conf_name_tree);  }  static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)  { -	struct nfs4_client *clp; - -	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { -		if (same_name(clp->cl_recdir, dname)) -			return clp; -	} -	return NULL; +	return find_clp_in_name_tree(name, &nn->unconf_name_tree);  }  static void @@ -1428,7 +1475,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r  	else  		goto out_err; -	conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val, +	conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,  					    se->se_callback_addr_len,  					    (struct sockaddr *)&conn->cb_addr,  					    sizeof(conn->cb_addr)); @@ -1572,12 +1619,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,  {  	struct nfs4_client *unconf, *conf, *new;  	__be32 status; -	unsigned int		strhashval; -	char			dname[HEXDIR_LEN];  	char			addr_str[INET6_ADDRSTRLEN];  	nfs4_verifier		verf = exid->verifier;  	struct sockaddr		*sa = 
svc_addr(rqstp);  	bool	update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; +	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	rpc_ntop(sa, addr_str, sizeof(addr_str));  	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " @@ -1592,24 +1638,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,  	switch (exid->spa_how) {  	case SP4_NONE:  		break; +	default:				/* checked by xdr code */ +		WARN_ON_ONCE(1);  	case SP4_SSV: -		return nfserr_serverfault; -	default: -		BUG();				/* checked by xdr code */  	case SP4_MACH_CRED:  		return nfserr_serverfault;	/* no excuse :-/ */  	} -	status = nfs4_make_rec_clidname(dname, &exid->clname); - -	if (status) -		return status; - -	strhashval = clientstr_hashval(dname); -  	/* Cases below refer to rfc 5661 section 18.35.4: */  	nfs4_lock_state(); -	conf = find_confirmed_client_by_str(dname, strhashval); +	conf = find_confirmed_client_by_name(&exid->clname, nn);  	if (conf) {  		bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);  		bool verfs_match = same_verf(&verf, &conf->cl_verifier); @@ -1654,21 +1692,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,  		goto out;  	} -	unconf  = find_unconfirmed_client_by_str(dname, strhashval); +	unconf  = find_unconfirmed_client_by_name(&exid->clname, nn);  	if (unconf) /* case 4, possible retry or client restart */  		expire_client(unconf);  	/* case 1 (normal case) */  out_new: -	new = create_client(exid->clname, dname, rqstp, &verf); +	new = create_client(exid->clname, rqstp, &verf);  	if (new == NULL) {  		status = nfserr_jukebox;  		goto out;  	}  	new->cl_minorversion = 1; -	gen_clid(new); -	add_to_unconfirmed(new, strhashval); +	gen_clid(new, nn); +	add_to_unconfirmed(new);  out_copy:  	exid->clientid.cl_boot = new->cl_clientid.cl_boot;  	exid->clientid.cl_id = new->cl_clientid.cl_id; @@ -1761,12 +1799,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,  	struct nfsd4_conn *conn;  	struct nfsd4_clid_slot *cs_slot = NULL;  	__be32 status = 0; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)  		return nfserr_inval;  	if (check_forechannel_attrs(cr_ses->fore_channel))  		return nfserr_toosmall; -	new = alloc_session(&cr_ses->fore_channel); +	new = alloc_session(&cr_ses->fore_channel, nn);  	if (!new)  		return nfserr_jukebox;  	status = nfserr_jukebox; @@ -1775,8 +1814,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,  		goto out_free_session;  	nfs4_lock_state(); -	unconf = find_unconfirmed_client(&cr_ses->clientid, true); -	conf = find_confirmed_client(&cr_ses->clientid, true); +	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); +	conf = find_confirmed_client(&cr_ses->clientid, true, nn);  	if (conf) {  		cs_slot = &conf->cl_cs_slot; @@ -1789,7 +1828,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,  			goto out_free_conn;  		}  	} else if (unconf) { -		unsigned int hash;  		struct nfs4_client *old;  		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||  		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { @@ -1803,8 +1841,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,  			status = nfserr_seq_misordered;  			goto out_free_conn;  		} -		hash = clientstr_hashval(unconf->cl_recdir); -		old = find_confirmed_client_by_str(unconf->cl_recdir, hash); +		old = find_confirmed_client_by_name(&unconf->cl_name, nn);  		if (old)  			expire_client(old);  		move_to_confirmed(unconf); @@ -1832,25 +1869,18 @@ nfsd4_create_session(struct svc_rqst *rqstp,  	/* cache solo and embedded create sessions 
under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
-out:
 	nfs4_unlock_state();
+out:
 	dprintk("%s returns %d\n", __func__, ntohl(status));
 	return status;
 out_free_conn:
+	nfs4_unlock_state();
 	free_conn(conn);
 out_free_session:
 	__free_session(new);
 	goto out;
 }
 
-static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
-{
-	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
-
-	return argp->opcnt == resp->opcnt;
-}
-
 static __be32 nfsd4_map_bcts_dir(u32 *dir)
 {
 	switch (*dir) {
@@ -1865,24 +1895,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
 	return nfserr_inval;
 }
 
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
+{
+	struct nfsd4_session *session = cstate->session;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	session->se_cb_prog = bc->bc_cb_program;
+	session->se_cb_sec = bc->bc_cb_sec;
+	spin_unlock(&nn->client_lock);
+
+	nfsd4_probe_callback(session->se_client);
+
+	return nfs_ok;
+}
+
 __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
 		     struct nfsd4_bind_conn_to_session *bcts)
 {
 	__be32 status;
 	struct nfsd4_conn *conn;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	if (!nfsd4_last_compound_op(rqstp))
 		return nfserr_not_only_op;
-	spin_lock(&client_lock);
-	cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+	spin_lock(&nn->client_lock);
+	cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
 	/* Sorta weird: we only need the refcnt'ing because new_conn acquires
 	 * client_lock itself: */
 	if (cstate->session) {
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&cstate->session->se_client->cl_refcount);
 	}
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 	if (!cstate->session)
 		return nfserr_badsession;
@@ -1910,6 +1956,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
 {
 	struct nfsd4_session *ses;
 	__be32 status = nfserr_badsession;
+	struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
 
 	/* Notes:
 	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid
@@ -1923,24 +1970,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
 			return nfserr_not_only_op;
 	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&client_lock);
-	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	spin_lock(&nn->client_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
 	if (!ses) {
-		spin_unlock(&client_lock);
+		spin_unlock(&nn->client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 
 	nfs4_lock_state();
 	nfsd4_probe_callback_sync(ses->se_client);
 	nfs4_unlock_state();
 
-	spin_lock(&client_lock);
+	spin_lock(&nn->client_lock);
 	nfsd4_del_conns(ses);
 	nfsd4_put_session_locked(ses);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 	status = nfs_ok;
 out:
 	dprintk("%s returns %d\n", __func__, ntohl(status));
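Nearly every handler from here on derives its per-namespace state the same way: net_generic(SVC_NET(rqstp), nfsd_net_id) replaces what used to be file-scope globals (client_lock, the id and name tables, the LRUs). A minimal sketch of that pattern, with hypothetical demo_* names standing in for the nfsd specifics:

	#include <linux/sunrpc/svc_xprt.h>	/* SVC_NET() */
	#include <net/netns/generic.h>		/* net_generic() */

	static unsigned int demo_net_id;	/* slot index, assigned at registration */

	struct demo_net {
		spinlock_t		client_lock;	/* per-namespace, no longer global */
		struct list_head	client_lru;
	};

	static inline struct demo_net *demo_net(struct svc_rqst *rqstp)
	{
		/* SVC_NET() is the namespace the request's transport lives in */
		return net_generic(SVC_NET(rqstp), demo_net_id);
	}

net_generic() is an O(1) slot lookup, which is presumably why the handlers below simply re-derive nn on every call rather than caching it anywhere long-lived.

@@ -2006,6 +2053,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_slot *slot;
 	struct nfsd4_conn *conn;
 	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
@@ -2018,9 +2066,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (!conn)
 		return nfserr_jukebox;
-	spin_lock(&client_lock);
+	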
spin_lock(&nn->client_lock);  	status = nfserr_badsession; -	session = find_in_sessionid_hashtbl(&seq->sessionid); +	session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));  	if (!session)  		goto out; @@ -2094,7 +2142,7 @@ out:  		}  	}  	kfree(conn); -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  	dprintk("%s: return %d\n", __func__, ntohl(status));  	return status;  } @@ -2104,10 +2152,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta  {  	struct nfs4_client *conf, *unconf, *clp;  	__be32 status = 0; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	nfs4_lock_state(); -	unconf = find_unconfirmed_client(&dc->clientid, true); -	conf = find_confirmed_client(&dc->clientid, true); +	unconf = find_unconfirmed_client(&dc->clientid, true, nn); +	conf = find_confirmed_client(&dc->clientid, true, nn);  	if (conf) {  		clp = conf; @@ -2181,20 +2230,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  {  	struct xdr_netobj 	clname = setclid->se_name;  	nfs4_verifier		clverifier = setclid->se_verf; -	unsigned int 		strhashval;  	struct nfs4_client	*conf, *unconf, *new;  	__be32 			status; -	char                    dname[HEXDIR_LEN]; -	 -	status = nfs4_make_rec_clidname(dname, &clname); -	if (status) -		return status; - -	strhashval = clientstr_hashval(dname); +	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	/* Cases below refer to rfc 3530 section 14.2.33: */  	nfs4_lock_state(); -	conf = find_confirmed_client_by_str(dname, strhashval); +	conf = find_confirmed_client_by_name(&clname, nn);  	if (conf) {  		/* case 0: */  		status = nfserr_clid_inuse; @@ -2209,21 +2251,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  			goto out;  		}  	} -	unconf = find_unconfirmed_client_by_str(dname, strhashval); +	unconf = find_unconfirmed_client_by_name(&clname, nn);  	if (unconf)  		expire_client(unconf);  	status = nfserr_jukebox; -	new = create_client(clname, dname, rqstp, &clverifier); +	new = create_client(clname, rqstp, &clverifier);  	if (new == NULL)  		goto out;  	if (conf && same_verf(&conf->cl_verifier, &clverifier))  		/* case 1: probable callback update */  		copy_clid(new, conf);  	else /* case 4 (new client) or cases 2, 3 (client reboot): */ -		gen_clid(new); +		gen_clid(new, nn);  	new->cl_minorversion = 0;  	gen_callback(new, setclid, rqstp); -	add_to_unconfirmed(new, strhashval); +	add_to_unconfirmed(new);  	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;  	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;  	memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); @@ -2243,14 +2285,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,  	nfs4_verifier confirm = setclientid_confirm->sc_confirm;   	clientid_t * clid = &setclientid_confirm->sc_clientid;  	__be32 status; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net	*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	if (STALE_CLIENTID(clid, nn))  		return nfserr_stale_clientid;  	nfs4_lock_state(); -	conf = find_confirmed_client(clid, false); -	unconf = find_unconfirmed_client(clid, false); +	conf = find_confirmed_client(clid, false, nn); +	unconf = find_unconfirmed_client(clid, false, nn);  	/*  	 * We try hard to give out unique clientid's, so if we get an  	 * attempt to confirm the same clientid with a different cred, @@ -2276,9 +2318,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,  		
nfsd4_probe_callback(conf);  		expire_client(unconf);  	} else { /* case 3: normal case; new or rebooted client */ -		unsigned int hash = clientstr_hashval(unconf->cl_recdir); - -		conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); +		conf = find_confirmed_client_by_name(&unconf->cl_name, nn);  		if (conf)  			expire_client(conf);  		move_to_confirmed(unconf); @@ -2340,7 +2380,7 @@ nfsd4_init_slabs(void)  	if (openowner_slab == NULL)  		goto out_nomem;  	lockowner_slab = kmem_cache_create("nfsd4_lockowners", -			sizeof(struct nfs4_openowner), 0, 0, NULL); +			sizeof(struct nfs4_lockowner), 0, 0, NULL);  	if (lockowner_slab == NULL)  		goto out_nomem;  	file_slab = kmem_cache_create("nfsd4_files", @@ -2404,7 +2444,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj  static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)  { -	list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + +	list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);  	list_add(&oo->oo_perclient, &clp->cl_openowners);  } @@ -2427,9 +2469,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str  static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {  	struct nfs4_openowner *oo = open->op_openowner; -	struct nfs4_client *clp = oo->oo_owner.so_client; -	init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); +	stp->st_stid.sc_type = NFS4_OPEN_STID;  	INIT_LIST_HEAD(&stp->st_lockowners);  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);  	list_add(&stp->st_perfile, &fp->fi_stateids); @@ -2444,11 +2485,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,  }  static void -move_to_close_lru(struct nfs4_openowner *oo) +move_to_close_lru(struct nfs4_openowner *oo, struct net *net)  { +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +  	dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); -	list_move_tail(&oo->oo_close_lru, &close_lru); +	list_move_tail(&oo->oo_close_lru, &nn->close_lru);  	oo->oo_time = get_seconds();  } @@ -2462,13 +2505,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,  }  static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions) +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, +			bool sessions, struct nfsd_net *nn)  {  	struct nfs4_stateowner *so;  	struct nfs4_openowner *oo;  	struct nfs4_client *clp; -	list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { +	list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {  		if (!so->so_is_open_owner)  			continue;  		if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { @@ -2555,9 +2599,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)  	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;  	struct nfs4_delegation *dp; -	BUG_ON(!fp); -	/* We assume break_lease is only called once per lease: */ -	BUG_ON(fp->fi_had_conflict); +	if (!fp) { +		WARN(1, "(%p)->fl_owner NULL\n", fl); +		return; +	} +	if (fp->fi_had_conflict) { +		WARN(1, "duplicate break on %p\n", fp); +		return; +	}  	/*  	 * We don't want the locks code to timeout the lease for us;  	 * we'll remove it ourself if a delegation isn't returned @@ -2599,14 +2648,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct 
nfs4  __be32  nfsd4_process_open1(struct nfsd4_compound_state *cstate, -		    struct nfsd4_open *open) +		    struct nfsd4_open *open, struct nfsd_net *nn)  {  	clientid_t *clientid = &open->op_clientid;  	struct nfs4_client *clp = NULL;  	unsigned int strhashval;  	struct nfs4_openowner *oo = NULL;  	__be32 status; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);  	if (STALE_CLIENTID(&open->op_clientid, nn))  		return nfserr_stale_clientid; @@ -2619,10 +2667,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,  		return nfserr_jukebox;  	strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); -	oo = find_openstateowner_str(strhashval, open, cstate->minorversion); +	oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);  	open->op_openowner = oo;  	if (!oo) { -		clp = find_confirmed_client(clientid, cstate->minorversion); +		clp = find_confirmed_client(clientid, cstate->minorversion, +					    nn);  		if (clp == NULL)  			return nfserr_expired;  		goto new_owner; @@ -2891,7 +2940,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)  			open->op_why_no_deleg = WND4_CANCELLED;  			break;  		case NFS4_SHARE_WANT_NO_DELEG: -			BUG();	/* not supposed to get here */ +			WARN_ON_ONCE(1);  		}  	}  } @@ -2959,6 +3008,7 @@ out:  	}  	return;  out_free: +	unhash_stid(&dp->dl_stid);  	nfs4_put_delegation(dp);  out_no_deleg:  	flag = NFS4_OPEN_DELEGATE_NONE; @@ -3104,27 +3154,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)  		free_generic_stateid(open->op_stp);  } +static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp) +{ +	struct nfs4_client *found; + +	if (STALE_CLIENTID(clid, nn)) +		return nfserr_stale_clientid; +	found = find_confirmed_client(clid, session, nn); +	if (clp) +		*clp = found; +	return found ? nfs_ok : nfserr_expired; +} +  __be32  nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	    clientid_t *clid)  {  	struct nfs4_client *clp;  	__be32 status; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	nfs4_lock_state();  	dprintk("process_renew(%08x/%08x): starting\n",   			clid->cl_boot, clid->cl_id); -	status = nfserr_stale_clientid; -	if (STALE_CLIENTID(clid, nn)) -		goto out; -	clp = find_confirmed_client(clid, cstate->minorversion); -	status = nfserr_expired; -	if (clp == NULL) { -		/* We assume the client took too long to RENEW. 
*/ -		dprintk("nfsd4_renew: clientid not found!\n"); +	status = lookup_clientid(clid, cstate->minorversion, nn, &clp); +	if (status)  		goto out; -	}  	status = nfserr_cb_path_down;  	if (!list_empty(&clp->cl_delegations)  			&& clp->cl_cb_state != NFSD4_CB_UP) @@ -3136,44 +3191,42 @@ out:  }  static void -nfsd4_end_grace(struct net *net) +nfsd4_end_grace(struct nfsd_net *nn)  { -	struct nfsd_net *nn = net_generic(net, nfsd_net_id); -  	/* do nothing if grace period already ended */  	if (nn->grace_ended)  		return;  	dprintk("NFSD: end of grace period\n");  	nn->grace_ended = true; -	nfsd4_record_grace_done(net, nn->boot_time); +	nfsd4_record_grace_done(nn, nn->boot_time);  	locks_end_grace(&nn->nfsd4_manager);  	/*  	 * Now that every NFSv4 client has had the chance to recover and  	 * to see the (possibly new, possibly shorter) lease time, we  	 * can safely set the next grace time to the current lease time:  	 */ -	nfsd4_grace = nfsd4_lease; +	nn->nfsd4_grace = nn->nfsd4_lease;  }  static time_t -nfs4_laundromat(void) +nfs4_laundromat(struct nfsd_net *nn)  {  	struct nfs4_client *clp;  	struct nfs4_openowner *oo;  	struct nfs4_delegation *dp;  	struct list_head *pos, *next, reaplist; -	time_t cutoff = get_seconds() - nfsd4_lease; -	time_t t, clientid_val = nfsd4_lease; -	time_t u, test_val = nfsd4_lease; +	time_t cutoff = get_seconds() - nn->nfsd4_lease; +	time_t t, clientid_val = nn->nfsd4_lease; +	time_t u, test_val = nn->nfsd4_lease;  	nfs4_lock_state();  	dprintk("NFSD: laundromat service - starting\n"); -	nfsd4_end_grace(&init_net); +	nfsd4_end_grace(nn);  	INIT_LIST_HEAD(&reaplist); -	spin_lock(&client_lock); -	list_for_each_safe(pos, next, &client_lru) { +	spin_lock(&nn->client_lock); +	list_for_each_safe(pos, next, &nn->client_lru) {  		clp = list_entry(pos, struct nfs4_client, cl_lru);  		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {  			t = clp->cl_time - cutoff; @@ -3189,7 +3242,7 @@ nfs4_laundromat(void)  		unhash_client_locked(clp);  		list_add(&clp->cl_lru, &reaplist);  	} -	spin_unlock(&client_lock); +	spin_unlock(&nn->client_lock);  	list_for_each_safe(pos, next, &reaplist) {  		clp = list_entry(pos, struct nfs4_client, cl_lru);  		dprintk("NFSD: purging unused client (clientid %08x)\n", @@ -3199,6 +3252,8 @@ nfs4_laundromat(void)  	spin_lock(&recall_lock);  	list_for_each_safe(pos, next, &del_recall_lru) {  		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); +		if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) +			continue;  		if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {  			u = dp->dl_time - cutoff;  			if (test_val > u) @@ -3212,8 +3267,8 @@ nfs4_laundromat(void)  		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);  		unhash_delegation(dp);  	} -	test_val = nfsd4_lease; -	list_for_each_safe(pos, next, &close_lru) { +	test_val = nn->nfsd4_lease; +	list_for_each_safe(pos, next, &nn->close_lru) {  		oo = container_of(pos, struct nfs4_openowner, oo_close_lru);  		if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {  			u = oo->oo_time - cutoff; @@ -3231,16 +3286,19 @@ nfs4_laundromat(void)  static struct workqueue_struct *laundry_wq;  static void laundromat_main(struct work_struct *); -static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);  static void -laundromat_main(struct work_struct *not_used) +laundromat_main(struct work_struct *laundry)  {  	time_t t; +	struct delayed_work *dwork = container_of(laundry, struct delayed_work, +						  work); +	struct 
nfsd_net *nn = container_of(dwork, struct nfsd_net, +					   laundromat_work); -	t = nfs4_laundromat(); +	t = nfs4_laundromat(nn);  	dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); -	queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); +	queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);  }  static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) @@ -3385,16 +3443,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)  	return nfs_ok;  } -static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions) +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, +				   struct nfs4_stid **s, bool sessions, +				   struct nfsd_net *nn)  {  	struct nfs4_client *cl; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);  	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))  		return nfserr_bad_stateid;  	if (STALE_STATEID(stateid, nn))  		return nfserr_stale_stateid; -	cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions); +	cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);  	if (!cl)  		return nfserr_expired;  	*s = find_stateid_by_type(cl, stateid, typemask); @@ -3416,6 +3475,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,  	struct nfs4_delegation *dp = NULL;  	struct svc_fh *current_fh = &cstate->current_fh;  	struct inode *ino = current_fh->fh_dentry->d_inode; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	__be32 status;  	if (filpp) @@ -3427,7 +3487,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,  	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))  		return check_special_stateids(net, current_fh, stateid, flags); -	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion); +	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, +				      &s, cstate->minorversion, nn);  	if (status)  		return status;  	status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); @@ -3441,7 +3502,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,  			goto out;  		if (filpp) {  			*filpp = dp->dl_file->fi_deleg_file; -			BUG_ON(!*filpp); +			if (!*filpp) { +				WARN_ON_ONCE(1); +				status = nfserr_serverfault; +				goto out; +			}  		}  		break;  	case NFS4_OPEN_STID: @@ -3568,7 +3633,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_  static __be32  nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,  			 stateid_t *stateid, char typemask, -			 struct nfs4_ol_stateid **stpp) +			 struct nfs4_ol_stateid **stpp, +			 struct nfsd_net *nn)  {  	__be32 status;  	struct nfs4_stid *s; @@ -3577,7 +3643,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,  		seqid, STATEID_VAL(stateid));  	*stpp = NULL; -	status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion); +	status = nfsd4_lookup_stateid(stateid, typemask, &s, +				      cstate->minorversion, nn);  	if (status)  		return status;  	*stpp = openlockstateid(s); @@ -3586,13 +3653,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,  	return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);  } -static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, 
struct nfs4_ol_stateid **stpp) +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, +						 stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)  {  	__be32 status;  	struct nfs4_openowner *oo;  	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, -						NFS4_OPEN_STID, stpp); +						NFS4_OPEN_STID, stpp, nn);  	if (status)  		return status;  	oo = openowner((*stpp)->st_stateowner); @@ -3608,6 +3676,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	__be32 status;  	struct nfs4_openowner *oo;  	struct nfs4_ol_stateid *stp; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",  			(int)cstate->current_fh.fh_dentry->d_name.len, @@ -3621,7 +3690,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	status = nfs4_preprocess_seqid_op(cstate,  					oc->oc_seqid, &oc->oc_req_stateid, -					NFS4_OPEN_STID, &stp); +					NFS4_OPEN_STID, &stp, nn);  	if (status)  		goto out;  	oo = openowner(stp->st_stateowner); @@ -3664,7 +3733,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac  	case NFS4_SHARE_ACCESS_BOTH:  		break;  	default: -		BUG(); +		WARN_ON_ONCE(1);  	}  } @@ -3685,6 +3754,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,  {  	__be32 status;  	struct nfs4_ol_stateid *stp; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",   			(int)cstate->current_fh.fh_dentry->d_name.len, @@ -3697,7 +3767,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,  	nfs4_lock_state();  	status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, -					&od->od_stateid, &stp); +					&od->od_stateid, &stp, nn);  	if (status)  		goto out;   	status = nfserr_inval; @@ -3760,6 +3830,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	__be32 status;  	struct nfs4_openowner *oo;  	struct nfs4_ol_stateid *stp; +	struct net *net = SVC_NET(rqstp); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	dprintk("NFSD: nfsd4_close on file %.*s\n",   			(int)cstate->current_fh.fh_dentry->d_name.len, @@ -3769,7 +3841,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,  					&close->cl_stateid,  					NFS4_OPEN_STID|NFS4_CLOSED_STID, -					&stp); +					&stp, nn);  	if (status)  		goto out;   	oo = openowner(stp->st_stateowner); @@ -3791,7 +3863,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  			 * little while to handle CLOSE replay.  			 
*/  			if (list_empty(&oo->oo_owner.so_stateids)) -				move_to_close_lru(oo); +				move_to_close_lru(oo, SVC_NET(rqstp));  		}  	}  out: @@ -3807,15 +3879,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	struct nfs4_delegation *dp;  	stateid_t *stateid = &dr->dr_stateid;  	struct nfs4_stid *s; -	struct inode *inode;  	__be32 status; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))  		return status; -	inode = cstate->current_fh.fh_dentry->d_inode;  	nfs4_lock_state(); -	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion); +	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, +				      cstate->minorversion, nn);  	if (status)  		goto out;  	dp = delegstateid(s); @@ -3833,8 +3905,6 @@ out:  #define LOFF_OVERFLOW(start, len)      ((u64)(len) > ~(u64)(start)) -#define LOCKOWNER_INO_HASH_BITS 8 -#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)  #define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)  static inline u64 @@ -3852,7 +3922,7 @@ last_byte_offset(u64 start, u64 len)  {  	u64 end; -	BUG_ON(!len); +	WARN_ON_ONCE(!len);  	end = start + len;  	return end > start ? end - 1: NFS4_MAX_UINT64;  } @@ -3864,8 +3934,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct  		& LOCKOWNER_INO_HASH_MASK;  } -static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE]; -  /*   * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that   * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -3931,12 +3999,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c  static struct nfs4_lockowner *  find_lockowner_str(struct inode *inode, clientid_t *clid, -		struct xdr_netobj *owner) +		   struct xdr_netobj *owner, struct nfsd_net *nn)  {  	unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);  	struct nfs4_lockowner *lo; -	list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { +	list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {  		if (same_lockowner_ino(lo, inode, clid, owner))  			return lo;  	} @@ -3948,9 +4016,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s  	struct inode *inode = open_stp->st_file->fi_inode;  	unsigned int inohash = lockowner_ino_hashval(inode,  			clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); -	list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); -	list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); +	list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); +	list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);  	list_add(&lo->lo_perstateid, &open_stp->st_lockowners);  } @@ -3987,7 +4056,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct  	stp = nfs4_alloc_stateid(clp);  	if (stp == NULL)  		return NULL; -	init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); +	stp->st_stid.sc_type = NFS4_LOCK_STID;  	list_add(&stp->st_perfile, &fp->fi_stateids);  	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);  	stp->st_stateowner = &lo->lo_owner; @@ -4024,8 +4093,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s  	struct nfs4_client *cl = oo->oo_owner.so_client;  	struct nfs4_lockowner *lo;  	unsigned 
int strhashval; +	struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id); -	lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); +	lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, +				&lock->v.new.owner, nn);  	if (lo) {  		if (!cstate->minorversion)  			return nfserr_bad_seqid; @@ -4065,7 +4136,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	bool new_state = false;  	int lkflg;  	int err; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct net *net = SVC_NET(rqstp); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",  		(long long) lock->lk_offset, @@ -4099,7 +4171,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  		status = nfs4_preprocess_confirmed_seqid_op(cstate,  				        lock->lk_new_open_seqid,  		                        &lock->lk_new_open_stateid, -					&open_stp); +					&open_stp, nn);  		if (status)  			goto out;  		open_sop = openowner(open_stp->st_stateowner); @@ -4113,7 +4185,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  		status = nfs4_preprocess_seqid_op(cstate,  				       lock->lk_old_lock_seqid,  				       &lock->lk_old_lock_stateid, -				       NFS4_LOCK_STID, &lock_stp); +				       NFS4_LOCK_STID, &lock_stp, nn);  	if (status)  		goto out;  	lock_sop = lockowner(lock_stp->st_stateowner); @@ -4124,10 +4196,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  		goto out;  	status = nfserr_grace; -	if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) +	if (locks_in_grace(net) && !lock->lk_reclaim)  		goto out;  	status = nfserr_no_grace; -	if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) +	if (!locks_in_grace(net) && lock->lk_reclaim)  		goto out;  	file_lock = locks_alloc_lock(); @@ -4238,7 +4310,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	struct file_lock *file_lock = NULL;  	struct nfs4_lockowner *lo;  	__be32 status; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	if (locks_in_grace(SVC_NET(rqstp)))  		return nfserr_grace; @@ -4248,9 +4320,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	nfs4_lock_state(); -	status = nfserr_stale_clientid; -	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) -		goto out; +	if (!nfsd4_has_session(cstate)) { +		status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL); +		if (status) +			goto out; +	}  	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))  		goto out; @@ -4278,7 +4352,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  		goto out;  	} -	lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); +	lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);  	if (lo)  		file_lock->fl_owner = (fl_owner_t)lo;  	file_lock->fl_pid = current->tgid; @@ -4313,7 +4387,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	struct file_lock *file_lock = NULL;  	__be32 status;  	int err; -						         +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); +  	dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",  		(long long) locku->lu_offset,  		(long long) locku->lu_length); @@ -4324,7 +4399,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,  	nfs4_lock_state();  	status = 
nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, -					&locku->lu_stateid, NFS4_LOCK_STID, &stp); +					&locku->lu_stateid, NFS4_LOCK_STID, +					&stp, nn);  	if (status)  		goto out;  	filp = find_any_file(stp->st_file); @@ -4414,23 +4490,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,  	struct list_head matches;  	unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);  	__be32 status; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",  		clid->cl_boot, clid->cl_id); -	/* XXX check for lease expiration */ - -	status = nfserr_stale_clientid; -	if (STALE_CLIENTID(clid, nn)) -		return status; -  	nfs4_lock_state(); +	status = lookup_clientid(clid, cstate->minorversion, nn, NULL); +	if (status) +		goto out; +  	status = nfserr_locks_held;  	INIT_LIST_HEAD(&matches); -	list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { +	list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {  		if (sop->so_is_open_owner)  			continue;  		if (!same_owner_str(sop, owner, clid)) @@ -4466,73 +4540,74 @@ alloc_reclaim(void)  	return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);  } -int -nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) +bool +nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)  { -	unsigned int strhashval = clientstr_hashval(name); -	struct nfs4_client *clp; +	struct nfs4_client_reclaim *crp; -	clp = find_confirmed_client_by_str(name, strhashval); -	if (!clp) -		return 0; -	return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); +	crp = nfsd4_find_reclaim_client(name, nn); +	return (crp && crp->cr_clp);  }  /*   * failure => all reset bets are off, nfserr_no_grace...   
*/ -int -nfs4_client_to_reclaim(const char *name) +struct nfs4_client_reclaim * +nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)  {  	unsigned int strhashval; -	struct nfs4_client_reclaim *crp = NULL; +	struct nfs4_client_reclaim *crp;  	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);  	crp = alloc_reclaim(); -	if (!crp) -		return 0; -	strhashval = clientstr_hashval(name); -	INIT_LIST_HEAD(&crp->cr_strhash); -	list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); -	memcpy(crp->cr_recdir, name, HEXDIR_LEN); -	reclaim_str_hashtbl_size++; -	return 1; +	if (crp) { +		strhashval = clientstr_hashval(name); +		INIT_LIST_HEAD(&crp->cr_strhash); +		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); +		memcpy(crp->cr_recdir, name, HEXDIR_LEN); +		crp->cr_clp = NULL; +		nn->reclaim_str_hashtbl_size++; +	} +	return crp; +} + +void +nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) +{ +	list_del(&crp->cr_strhash); +	kfree(crp); +	nn->reclaim_str_hashtbl_size--;  }  void -nfs4_release_reclaim(void) +nfs4_release_reclaim(struct nfsd_net *nn)  {  	struct nfs4_client_reclaim *crp = NULL;  	int i;  	for (i = 0; i < CLIENT_HASH_SIZE; i++) { -		while (!list_empty(&reclaim_str_hashtbl[i])) { -			crp = list_entry(reclaim_str_hashtbl[i].next, +		while (!list_empty(&nn->reclaim_str_hashtbl[i])) { +			crp = list_entry(nn->reclaim_str_hashtbl[i].next,  			                struct nfs4_client_reclaim, cr_strhash); -			list_del(&crp->cr_strhash); -			kfree(crp); -			reclaim_str_hashtbl_size--; +			nfs4_remove_reclaim_record(crp, nn);  		}  	} -	BUG_ON(reclaim_str_hashtbl_size); +	WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);  }  /*   * called from OPEN, CLAIM_PREVIOUS with a new clientid. */  struct nfs4_client_reclaim * -nfsd4_find_reclaim_client(struct nfs4_client *clp) +nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)  {  	unsigned int strhashval;  	struct nfs4_client_reclaim *crp = NULL; -	dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", -		            clp->cl_name.len, clp->cl_name.data, -			    clp->cl_recdir); +	dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir); -	/* find clp->cl_name in reclaim_str_hashtbl */ -	strhashval = clientstr_hashval(clp->cl_recdir); -	list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { -		if (same_name(crp->cr_recdir, clp->cl_recdir)) { +	strhashval = clientstr_hashval(recdir); +	list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { +		if (same_name(crp->cr_recdir, recdir)) {  			return crp;  		}  	} @@ -4543,12 +4618,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)  * Called from OPEN. Look for clientid in reclaim list.  
*/  __be32 -nfs4_check_open_reclaim(clientid_t *clid, bool sessions) +nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)  {  	struct nfs4_client *clp;  	/* find clientid in conf_id_hashtbl */ -	clp = find_confirmed_client(clid, sessions); +	clp = find_confirmed_client(clid, sessions, nn);  	if (clp == NULL)  		return nfserr_reclaim_bad; @@ -4557,124 +4632,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)  #ifdef CONFIG_NFSD_FAULT_INJECTION -void nfsd_forget_clients(u64 num) +u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)  { -	struct nfs4_client *clp, *next; -	int count = 0; - -	nfs4_lock_state(); -	list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { -		expire_client(clp); -		if (++count == num) -			break; -	} -	nfs4_unlock_state(); - -	printk(KERN_INFO "NFSD: Forgot %d clients", count); +	expire_client(clp); +	return 1;  } -static void release_lockowner_sop(struct nfs4_stateowner *sop) +u64 nfsd_print_client(struct nfs4_client *clp, u64 num)  { -	release_lockowner(lockowner(sop)); +	char buf[INET6_ADDRSTRLEN]; +	rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); +	printk(KERN_INFO "NFS Client: %s\n", buf); +	return 1;  } -static void release_openowner_sop(struct nfs4_stateowner *sop) +static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, +			     const char *type)  { -	release_openowner(openowner(sop)); +	char buf[INET6_ADDRSTRLEN]; +	rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); +	printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);  } -static int nfsd_release_n_owners(u64 num, bool is_open_owner, -				void (*release_sop)(struct nfs4_stateowner *)) +static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))  { -	int i, count = 0; -	struct nfs4_stateowner *sop, *next; +	struct nfs4_openowner *oop; +	struct nfs4_lockowner *lop, *lo_next; +	struct nfs4_ol_stateid *stp, *st_next; +	u64 count = 0; -	for (i = 0; i < OWNER_HASH_SIZE; i++) { -		list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { -			if (sop->so_is_open_owner != is_open_owner) -				continue; -			release_sop(sop); -			if (++count == num) -				return count; +	list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { +		list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { +			list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { +				if (func) +					func(lop); +				if (++count == max) +					return count; +			}  		}  	} +  	return count;  } -void nfsd_forget_locks(u64 num) +u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)  { -	int count; - -	nfs4_lock_state(); -	count = nfsd_release_n_owners(num, false, release_lockowner_sop); -	nfs4_unlock_state(); +	return nfsd_foreach_client_lock(clp, max, release_lockowner); +} -	printk(KERN_INFO "NFSD: Forgot %d locks", count); +u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) +{ +	u64 count = nfsd_foreach_client_lock(clp, max, NULL); +	nfsd_print_count(clp, count, "locked files"); +	return count;  } -void nfsd_forget_openowners(u64 num) +static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))  { -	int count; +	struct nfs4_openowner *oop, *next; +	u64 count = 0; -	nfs4_lock_state(); -	count = nfsd_release_n_owners(num, true, release_openowner_sop); -	nfs4_unlock_state(); +	list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { +		if (func) +			func(oop); +	
	if (++count == max) +			break; +	} -	printk(KERN_INFO "NFSD: Forgot %d open owners", count); +	return count;  } -static int nfsd_process_n_delegations(u64 num, struct list_head *list) +u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)  { -	int i, count = 0; -	struct nfs4_file *fp, *fnext; -	struct nfs4_delegation *dp, *dnext; +	return nfsd_foreach_client_open(clp, max, release_openowner); +} -	for (i = 0; i < FILE_HASH_SIZE; i++) { -		list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { -			list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { -				list_move(&dp->dl_recall_lru, list); -				if (++count == num) -					return count; -			} -		} -	} +u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) +{ +	u64 count = nfsd_foreach_client_open(clp, max, NULL); +	nfsd_print_count(clp, count, "open files"); +	return count; +} +static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, +				     struct list_head *victims) +{ +	struct nfs4_delegation *dp, *next; +	u64 count = 0; + +	list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { +		if (victims) +			list_move(&dp->dl_recall_lru, victims); +		if (++count == max) +			break; +	}  	return count;  } -void nfsd_forget_delegations(u64 num) +u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)  { -	unsigned int count; +	struct nfs4_delegation *dp, *next;  	LIST_HEAD(victims); -	struct nfs4_delegation *dp, *dnext; +	u64 count;  	spin_lock(&recall_lock); -	count = nfsd_process_n_delegations(num, &victims); +	count = nfsd_find_all_delegations(clp, max, &victims);  	spin_unlock(&recall_lock); -	nfs4_lock_state(); -	list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) +	list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)  		unhash_delegation(dp); -	nfs4_unlock_state(); -	printk(KERN_INFO "NFSD: Forgot %d delegations", count); +	return count;  } -void nfsd_recall_delegations(u64 num) +u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)  { -	unsigned int count; +	struct nfs4_delegation *dp, *next;  	LIST_HEAD(victims); -	struct nfs4_delegation *dp, *dnext; +	u64 count;  	spin_lock(&recall_lock); -	count = nfsd_process_n_delegations(num, &victims); -	list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { -		list_del(&dp->dl_recall_lru); +	count = nfsd_find_all_delegations(clp, max, &victims); +	list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)  		nfsd_break_one_deleg(dp); -	}  	spin_unlock(&recall_lock); -	printk(KERN_INFO "NFSD: Recalled %d delegations", count); +	return count; +} + +u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) +{ +	u64 count = 0; + +	spin_lock(&recall_lock); +	count = nfsd_find_all_delegations(clp, max, NULL); +	spin_unlock(&recall_lock); + +	nfsd_print_count(clp, count, "delegations"); +	return count; +} + +u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) +{ +	struct nfs4_client *clp, *next; +	u64 count = 0; +	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + +	if (!nfsd_netns_ready(nn)) +		return 0; + +	list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { +		count += func(clp, max - count); +		if ((max != 0) && (count >= max)) +			break; +	} + +	return count; +} + +struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) +{ +	struct nfs4_client *clp; +	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + +	if (!nfsd_netns_ready(nn)) +		return NULL; + +	
list_for_each_entry(clp, &nn->client_lru, cl_lru) { +		if (memcmp(&clp->cl_addr, addr, addr_size) == 0) +			return clp; +	} +	return NULL;  }  #endif /* CONFIG_NFSD_FAULT_INJECTION */ @@ -4686,27 +4814,10 @@ nfs4_state_init(void)  {  	int i; -	for (i = 0; i < CLIENT_HASH_SIZE; i++) { -		INIT_LIST_HEAD(&conf_id_hashtbl[i]); -		INIT_LIST_HEAD(&conf_str_hashtbl[i]); -		INIT_LIST_HEAD(&unconf_str_hashtbl[i]); -		INIT_LIST_HEAD(&unconf_id_hashtbl[i]); -		INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); -	} -	for (i = 0; i < SESSION_HASH_SIZE; i++) -		INIT_LIST_HEAD(&sessionid_hashtbl[i]);  	for (i = 0; i < FILE_HASH_SIZE; i++) {  		INIT_LIST_HEAD(&file_hashtbl[i]);  	} -	for (i = 0; i < OWNER_HASH_SIZE; i++) { -		INIT_LIST_HEAD(&ownerstr_hashtbl[i]); -	} -	for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) -		INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); -	INIT_LIST_HEAD(&close_lru); -	INIT_LIST_HEAD(&client_lru);  	INIT_LIST_HEAD(&del_recall_lru); -	reclaim_str_hashtbl_size = 0;  }  /* @@ -4730,34 +4841,126 @@ set_max_delegations(void)  	max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);  } -/* initialization to perform when the nfsd service is started: */ +static int nfs4_state_create_net(struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	int i; + +	nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) * +			CLIENT_HASH_SIZE, GFP_KERNEL); +	if (!nn->conf_id_hashtbl) +		goto err; +	nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) * +			CLIENT_HASH_SIZE, GFP_KERNEL); +	if (!nn->unconf_id_hashtbl) +		goto err_unconf_id; +	nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * +			OWNER_HASH_SIZE, GFP_KERNEL); +	if (!nn->ownerstr_hashtbl) +		goto err_ownerstr; +	nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) * +			LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL); +	if (!nn->lockowner_ino_hashtbl) +		goto err_lockowner_ino; +	nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * +			SESSION_HASH_SIZE, GFP_KERNEL); +	if (!nn->sessionid_hashtbl) +		goto err_sessionid; + +	for (i = 0; i < CLIENT_HASH_SIZE; i++) { +		INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); +		INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); +	} +	for (i = 0; i < OWNER_HASH_SIZE; i++) +		INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); +	for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) +		INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]); +	for (i = 0; i < SESSION_HASH_SIZE; i++) +		INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); +	nn->conf_name_tree = RB_ROOT; +	nn->unconf_name_tree = RB_ROOT; +	INIT_LIST_HEAD(&nn->client_lru); +	INIT_LIST_HEAD(&nn->close_lru); +	spin_lock_init(&nn->client_lock); + +	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); +	get_net(net); + +	return 0; + +err_sessionid: +	kfree(nn->lockowner_ino_hashtbl); +err_lockowner_ino: +	kfree(nn->ownerstr_hashtbl); +err_ownerstr: +	kfree(nn->unconf_id_hashtbl); +err_unconf_id: +	kfree(nn->conf_id_hashtbl); +err: +	return -ENOMEM; +} + +static void +nfs4_state_destroy_net(struct net *net) +{ +	int i; +	struct nfs4_client *clp = NULL; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	struct rb_node *node, *tmp; + +	for (i = 0; i < CLIENT_HASH_SIZE; i++) { +		while (!list_empty(&nn->conf_id_hashtbl[i])) { +			clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); +			destroy_client(clp); +		} +	} + +	node = rb_first(&nn->unconf_name_tree); +	while (node != NULL) { +		tmp = node; +		node = rb_next(tmp); +		clp = rb_entry(tmp, struct nfs4_client, cl_namenode); +		rb_erase(tmp, &nn->unconf_name_tree); +		
destroy_client(clp);
+	}
+
+	kfree(nn->sessionid_hashtbl);
+	kfree(nn->lockowner_ino_hashtbl);
+	kfree(nn->ownerstr_hashtbl);
+	kfree(nn->unconf_id_hashtbl);
+	kfree(nn->conf_id_hashtbl);
+	put_net(net);
+}
 
 int
-nfs4_state_start(void)
+nfs4_state_start_net(struct net *net)
 {
-	struct net *net = &init_net;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int ret;
 
-	/*
-	 * FIXME: For now, we hang most of the pernet global stuff off of
-	 * init_net until nfsd is fully containerized. Eventually, we'll
-	 * need to pass a net pointer into this function, take a reference
-	 * to that instead and then do most of the rest of this on a per-net
-	 * basis.
-	 */
-	get_net(net);
+	ret = nfs4_state_create_net(net);
+	if (ret)
+		return ret;
 	nfsd4_client_tracking_init(net);
 	nn->boot_time = get_seconds();
 	locks_start_grace(net, &nn->nfsd4_manager);
 	nn->grace_ended = false;
-	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       nfsd4_grace);
+	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
+	       nn->nfsd4_grace, net);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+	return 0;
+}
+
+/* initialization to perform when the nfsd service is started: */
+
+int
+nfs4_state_start(void)
+{
+	int ret;
+
 	ret = set_callback_cred();
-	if (ret) {
-		ret = -ENOMEM;
-		goto out_recovery;
-	}
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL) {
 		ret = -ENOMEM;
@@ -4766,39 +4969,34 @@ nfs4_state_start(void)
 	ret = nfsd4_create_callback_queue();
 	if (ret)
 		goto out_free_laundry;
-	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
+
 	set_max_delegations();
+
 	return 0;
+
 out_free_laundry:
 	destroy_workqueue(laundry_wq);
 out_recovery:
-	nfsd4_client_tracking_exit(net);
-	put_net(net);
 	return ret;
 }
 
-static void
-__nfs4_state_shutdown(void)
+/* should be called with the state lock held */
+void
+nfs4_state_shutdown_net(struct net *net)
 {
-	int i;
-	struct nfs4_client *clp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	struct list_head *pos, *next, reaplist;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	cancel_delayed_work_sync(&nn->laundromat_work);
+	locks_end_grace(&nn->nfsd4_manager);
 
-	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
-		while (!list_empty(&conf_id_hashtbl[i])) {
-			clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
-			destroy_client(clp);
-		}
-		while (!list_empty(&unconf_str_hashtbl[i])) {
-			clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
-			destroy_client(clp);
-		}
-	}
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+		if (dp->dl_stid.sc_client->net != net)
+			continue;
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&recall_lock);
@@ -4807,22 +5005,14 @@ __nfs4_state_shutdown(void)
 		unhash_delegation(dp);
 	}
 
-	nfsd4_client_tracking_exit(&init_net);
-	put_net(&init_net);
+	nfsd4_client_tracking_exit(net);
+	nfs4_state_destroy_net(net);
 }
 
 void
 nfs4_state_shutdown(void)
 {
-	struct net *net = &init_net;
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
-	cancel_delayed_work_sync(&laundromat_work);
 	destroy_workqueue(laundry_wq);
-	locks_end_grace(&nn->nfsd4_manager);
-	nfs4_lock_state();
-	__nfs4_state_shutdown();
-	nfs4_unlock_state();
 	nfsd4_destroy_callback_queue();
 }
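nfs4_state_create_net()/nfs4_state_destroy_net() above allocate and free the per-namespace tables; the struct nfsd_net container itself comes from nfsd's pernet registration elsewhere in the tree, keyed by nfsd_net_id. For reference, the generic shape of such a registration, again with hypothetical demo_* names rather than this patch's code:

	#include <linux/spinlock.h>
	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	static unsigned int demo_net_id;

	struct demo_net {
		spinlock_t lock;
	};

	static int __net_init demo_net_init(struct net *net)
	{
		struct demo_net *dn = net_generic(net, demo_net_id);

		spin_lock_init(&dn->lock);	/* dn arrives zeroed, .size bytes */
		return 0;
	}

	static void __net_exit demo_net_exit(struct net *net)
	{
		/* undo demo_net_init() and release anything hung off the slot */
	}

	static struct pernet_operations demo_net_ops = {
		.init = demo_net_init,
		.exit = demo_net_exit,
		.id   = &demo_net_id,	/* kernel assigns the generic-data slot */
		.size = sizeof(struct demo_net),
	};

	/* typically from module init: register_pernet_subsys(&demo_net_ops); */

The .init/.exit pair runs once per network namespace as namespaces come and go, which is what lets the state above stop being global.

diff --git a/fs/nfsd/nfs4xdr.c 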
b/fs/nfsd/nfs4xdr.c
index fd548d15508..01168865dd3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
 #include "vfs.h"
 #include "state.h"
 #include "cache.h"
+#include "netns.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
@@ -65,17 +66,17 @@
 #define NFS4_REFERRAL_FSID_MINOR	0x8000000ULL
 
 static __be32
-check_filename(char *str, int len, __be32 err)
+check_filename(char *str, int len)
 {
 	int i;
 
 	if (len == 0)
 		return nfserr_inval;
 	if (isdotent(str, len))
-		return err;
+		return nfserr_badname;
 	for (i = 0; i < len; i++)
 		if (str[i] == '/')
-			return err;
+			return nfserr_badname;
 	return 0;
 }
 
@@ -292,13 +293,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
 			status = nfs_ok;
 			if (ace->whotype != NFS4_ACL_WHO_NAMED)
-				ace->who = 0;
+				;
 			else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
 				status = nfsd_map_name_to_gid(argp->rqstp,
-						buf, dummy32, &ace->who);
+						buf, dummy32, &ace->who_gid);
 			else
 				status = nfsd_map_name_to_uid(argp->rqstp,
-						buf, dummy32, &ace->who);
+						buf, dummy32, &ace->who_uid);
 			if (status)
 				return status;
 		}
@@ -422,6 +423,93 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+	DECODE_HEAD;
+	u32 dummy, uid, gid;
+	char *machine_name;
+	int i;
+	int nr_secflavs;
+
+	/* callback_sec_params4 */
+	READ_BUF(4);
+	READ32(nr_secflavs);
+	cbs->flavor = (u32)(-1);
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		READ32(dummy);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			if (cbs->flavor == (u32)(-1))
+				cbs->flavor = RPC_AUTH_NULL;
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			READ32(dummy);
+
+			/* machine name */
+			READ32(dummy);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			READ32(uid);
+			READ32(gid);
+
+			/* more gids */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy * 4);
+			if (cbs->flavor == (u32)(-1)) {
+				kuid_t kuid = make_kuid(&init_user_ns, uid);
+				kgid_t kgid = make_kgid(&init_user_ns, gid);
+				if (uid_valid(kuid) && gid_valid(kgid)) {
+					cbs->uid = kuid;
+					cbs->gid = kgid;
+					cbs->flavor = RPC_AUTH_UNIX;
+				} else {
+					dprintk("RPC_AUTH_UNIX with invalid "
+						"uid or gid, ignoring!\n");
+				}
+			}
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			READ32(dummy);
+			/* gcbp_handle_from_server */
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(bc->bc_cb_program);
+	nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+
+	DECODE_TAIL;
+}
+
 static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
 {
 	DECODE_HEAD;
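READ_BUF()/READ32()/SAVEMEM() are local macros in this file: READ_BUF(n) guarantees n more bytes of the request are addressable, refilling p/end from the compound's page list when the head buffer runs out and otherwise bailing to xdr_error (which becomes nfserr_bad_xdr), while READ32 consumes one big-endian word. A simplified stand-alone sketch of the idea, without the page-list refill and with illustrative demo_* names:

	struct demo_xdr {
		__be32 *p;	/* next word to decode */
		__be32 *end;	/* one past the last valid word */
	};

	static int demo_read_buf(struct demo_xdr *x, u32 nbytes)
	{
		/* XDR rounds every item up to whole 4-byte words */
		if (x->p + XDR_QUADLEN(nbytes) > x->end)
			return -EIO;	/* stand-in for the macro's goto xdr_error */
		return 0;
	}

	static u32 demo_read32(struct demo_xdr *x)
	{
		return ntohl(*x->p++);	/* only after demo_read_buf(x, 4) succeeded */
	}

@@ -490,7 +578,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 	READ32(create->cr_namelen);
 	READ_BUF(create->cr_namelen);
 	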
SAVEMEM(create->cr_name, create->cr_namelen); -	if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) +	if ((status = check_filename(create->cr_name, create->cr_namelen)))  		return status;  	status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, @@ -522,7 +610,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)  	READ32(link->li_namelen);  	READ_BUF(link->li_namelen);  	SAVEMEM(link->li_name, link->li_namelen); -	if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) +	if ((status = check_filename(link->li_name, link->li_namelen)))  		return status;  	DECODE_TAIL; @@ -616,7 +704,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup  	READ32(lookup->lo_len);  	READ_BUF(lookup->lo_len);  	SAVEMEM(lookup->lo_name, lookup->lo_len); -	if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) +	if ((status = check_filename(lookup->lo_name, lookup->lo_len)))  		return status;  	DECODE_TAIL; @@ -780,7 +868,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)  		READ32(open->op_fname.len);  		READ_BUF(open->op_fname.len);  		SAVEMEM(open->op_fname.data, open->op_fname.len); -		if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) +		if ((status = check_filename(open->op_fname.data, open->op_fname.len)))  			return status;  		break;  	case NFS4_OPEN_CLAIM_PREVIOUS: @@ -795,7 +883,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)  		READ32(open->op_fname.len);  		READ_BUF(open->op_fname.len);  		SAVEMEM(open->op_fname.data, open->op_fname.len); -		if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) +		if ((status = check_filename(open->op_fname.data, open->op_fname.len)))  			return status;  		break;  	case NFS4_OPEN_CLAIM_FH: @@ -907,7 +995,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove  	READ32(remove->rm_namelen);  	READ_BUF(remove->rm_namelen);  	SAVEMEM(remove->rm_name, remove->rm_namelen); -	if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) +	if ((status = check_filename(remove->rm_name, remove->rm_namelen)))  		return status;  	DECODE_TAIL; @@ -925,9 +1013,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename  	READ32(rename->rn_tnamelen);  	READ_BUF(rename->rn_tnamelen);  	SAVEMEM(rename->rn_tname, rename->rn_tnamelen); -	if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) +	if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))  		return status; -	if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) +	if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))  		return status;  	DECODE_TAIL; @@ -954,8 +1042,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,  	READ32(secinfo->si_namelen);  	READ_BUF(secinfo->si_namelen);  	SAVEMEM(secinfo->si_name, secinfo->si_namelen); -	status = check_filename(secinfo->si_name, secinfo->si_namelen, -								nfserr_noent); +	status = check_filename(secinfo->si_name, secinfo->si_namelen);  	if (status)  		return status;  	DECODE_TAIL; @@ -1026,31 +1113,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s  static __be32  nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)  { -#if 0 -	struct nfsd4_compoundargs save = { -		.p = 
argp->p, -		.end = argp->end, -		.rqstp = argp->rqstp, -	}; -	u32             ve_bmval[2]; -	struct iattr    ve_iattr;           /* request */ -	struct nfs4_acl *ve_acl;            /* request */ -#endif  	DECODE_HEAD;  	if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))  		goto out;  	/* For convenience's sake, we compare raw xdr'd attributes in -	 * nfsd4_proc_verify; however we still decode here just to return -	 * correct error in case of bad xdr. */ -#if 0 -	status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl); -	if (status == nfserr_inval) { -		status = nfserrno(status); -		goto out; -	} -#endif +	 * nfsd4_proc_verify */ +  	READ_BUF(4);  	READ32(verify->ve_attrlen);  	READ_BUF(verify->ve_attrlen); @@ -1063,7 +1133,6 @@ static __be32  nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)  {  	int avail; -	int v;  	int len;  	DECODE_HEAD; @@ -1087,27 +1156,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)  				__FILE__, __LINE__);  		goto xdr_error;  	} -	argp->rqstp->rq_vec[0].iov_base = p; -	argp->rqstp->rq_vec[0].iov_len = avail; -	v = 0; -	len = write->wr_buflen; -	while (len > argp->rqstp->rq_vec[v].iov_len) { -		len -= argp->rqstp->rq_vec[v].iov_len; -		v++; -		argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); -		argp->pagelist++; -		if (argp->pagelen >= PAGE_SIZE) { -			argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; -			argp->pagelen -= PAGE_SIZE; -		} else { -			argp->rqstp->rq_vec[v].iov_len = argp->pagelen; -			argp->pagelen -= len; -		} +	write->wr_head.iov_base = p; +	write->wr_head.iov_len = avail; +	WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); +	write->wr_pagelist = argp->pagelist; + +	len = XDR_QUADLEN(write->wr_buflen) << 2; +	if (len >= avail) { +		int pages; + +		len -= avail; + +		pages = len >> PAGE_SHIFT; +		argp->pagelist += pages; +		argp->pagelen -= pages * PAGE_SIZE; +		len -= pages * PAGE_SIZE; + +		argp->p = (__be32 *)page_address(argp->pagelist[0]); +		argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);  	} -	argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); -	argp->p = (__be32*)  (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); -	argp->rqstp->rq_vec[v].iov_len = len; -	write->wr_vlen = v+1; +	argp->p += XDR_QUADLEN(len);  	DECODE_TAIL;  } @@ -1237,11 +1305,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,  			    struct nfsd4_create_session *sess)  {  	DECODE_HEAD; -  	u32 dummy; -	char *machine_name; -	int i; -	int nr_secflavs;  	READ_BUF(16);  	COPYMEM(&sess->clientid, 8); @@ -1282,58 +1346,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,  		goto xdr_error;  	} -	READ_BUF(8); +	READ_BUF(4);  	READ32(sess->callback_prog); - -	/* callback_sec_params4 */ -	READ32(nr_secflavs); -	for (i = 0; i < nr_secflavs; ++i) { -		READ_BUF(4); -		READ32(dummy); -		switch (dummy) { -		case RPC_AUTH_NULL: -			/* Nothing to read */ -			break; -		case RPC_AUTH_UNIX: -			READ_BUF(8); -			/* stamp */ -			READ32(dummy); - -			/* machine name */ -			READ32(dummy); -			READ_BUF(dummy); -			SAVEMEM(machine_name, dummy); - -			/* uid, gid */ -			READ_BUF(8); -			READ32(sess->uid); -			READ32(sess->gid); - -			/* more gids */ -			READ_BUF(4); -			READ32(dummy); -			READ_BUF(dummy * 4); -			break; -		case RPC_AUTH_GSS: -			dprintk("RPC_AUTH_GSS callback secflavor " -				"not supported!\n"); -			READ_BUF(8); -			/* gcbp_service */ -			READ32(dummy); -			/* gcbp_handle_from_server */ -			READ32(dummy); -			READ_BUF(dummy); 
-			p += XDR_QUADLEN(dummy); -			/* gcbp_handle_from_client */ -			READ_BUF(4); -			READ32(dummy); -			READ_BUF(dummy); -			break; -		default: -			dprintk("Illegal callback secflavor\n"); -			return nfserr_inval; -		} -	} +	nfsd4_decode_cb_sec(argp, &sess->cb_sec);  	DECODE_TAIL;  } @@ -1528,7 +1543,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {  	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_notsupp,  	/* new operations for NFSv4.1 */ -	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_notsupp, +	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_backchannel_ctl,  	[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,  	[OP_EXCHANGE_ID]	= (nfsd4_dec)nfsd4_decode_exchange_id,  	[OP_CREATE_SESSION]	= (nfsd4_dec)nfsd4_decode_create_session, @@ -1568,12 +1583,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)  	bool cachethis = false;  	int i; -	/* -	 * XXX: According to spec, we should check the tag -	 * for UTF-8 compliance.  I'm postponing this for -	 * now because it seems that some clients do use -	 * binary tags. -	 */  	READ_BUF(4);  	READ32(argp->taglen);  	READ_BUF(argp->taglen + 8); @@ -1603,38 +1612,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)  		op = &argp->ops[i];  		op->replay = NULL; -		/* -		 * We can't use READ_BUF() here because we need to handle -		 * a missing opcode as an OP_WRITE + 1. So we need to check -		 * to see if we're truly at the end of our buffer or if there -		 * is another page we need to flip to. -		 */ - -		if (argp->p == argp->end) { -			if (argp->pagelen < 4) { -				/* There isn't an opcode still on the wire */ -				op->opnum = OP_WRITE + 1; -				op->status = nfserr_bad_xdr; -				argp->opcnt = i+1; -				break; -			} - -			/* -			 * False alarm. We just hit a page boundary, but there -			 * is still data available.  Move pointer across page -			 * boundary.  
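[Aside, not part of the patch: READ_BUF() itself performs this page-boundary handling, which is why the hand-rolled copy above can go and the opcode read collapses to READ_BUF(4)/READ32() below. For the word math these decoders lean on, the sunrpc header defines

	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)

so XDR_QUADLEN(len) << 2 rounds len up to the next 4-byte XDR word; a 5-byte opaque, for example, occupies 2 words, 8 bytes, on the wire. The new nfsd4_decode_write() arithmetic and its WARN_ON() rely on exactly this rounding.]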
*snip from READ_BUF* -			 */ -			argp->p = page_address(argp->pagelist[0]); -			argp->pagelist++; -			if (argp->pagelen < PAGE_SIZE) { -				argp->end = argp->p + (argp->pagelen>>2); -				argp->pagelen = 0; -			} else { -				argp->end = argp->p + (PAGE_SIZE>>2); -				argp->pagelen -= PAGE_SIZE; -			} -		} -		op->opnum = ntohl(*argp->p++); +		READ_BUF(4); +		READ32(op->opnum);  		if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)  			op->status = ops->decoders[op->opnum](argp, &op->u); @@ -1954,7 +1933,7 @@ static u32 nfs4_file_type(umode_t mode)  }  static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,  			__be32 **p, int *buflen)  {  	int status; @@ -1963,10 +1942,10 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,  		return nfserr_resource;  	if (whotype != NFS4_ACL_WHO_NAMED)  		status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); -	else if (group) -		status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); +	else if (gid_valid(gid)) +		status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));  	else -		status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); +		status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));  	if (status < 0)  		return nfserrno(status);  	*p = xdr_encode_opaque(*p, NULL, status); @@ -1976,22 +1955,33 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,  }  static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen) +nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)  { -	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); +	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, +				 p, buflen);  }  static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen) +nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)  { -	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); +	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, +				 p, buflen);  }  static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,  		__be32 **p, int *buflen)  { -	return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); +	kuid_t uid = INVALID_UID; +	kgid_t gid = INVALID_GID; + +	if (ace->whotype == NFS4_ACL_WHO_NAMED) { +		if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) +			gid = ace->who_gid; +		else +			uid = ace->who_uid; +	} +	return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);  }  #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2014,16 +2004,31 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)  	return 0;  } + +static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +{ +	struct path path = exp->ex_path; +	int err; + +	path_get(&path); +	while (follow_up(&path)) { +		if (path.dentry != path.mnt->mnt_root) +			break; +	} +	err = vfs_getattr(&path, stat); +	path_put(&path); +	return err; +} +  /*   * Note: @fhp can be NULL; in this case, we might have to compose the filehandle   * ourselves.   * - * @countp is the buffer size in _words_; upon successful return this becomes - * replaced with the number of words written. 
+ * count is the buffer size in _words_   */  __be32  nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, -		struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, +		struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,  		struct svc_rqst *rqstp, int ignore_crossmnt)  {  	u32 bmval0 = bmval[0]; @@ -2032,12 +2037,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,  	struct kstat stat;  	struct svc_fh tempfh;  	struct kstatfs statfs; -	int buflen = *countp << 2; +	int buflen = count << 2;  	__be32 *attrlenp;  	u32 dummy;  	u64 dummy64;  	u32 rdattr_err = 0; -	__be32 *p = buffer; +	__be32 *p = *buffer;  	__be32 status;  	int err;  	int aclsupport = 0; @@ -2048,6 +2053,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,  		.mnt	= exp->ex_path.mnt,  		.dentry	= dentry,  	}; +	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);  	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);  	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); @@ -2061,7 +2067,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,  			goto out;  	} -	err = vfs_getattr(exp->ex_path.mnt, dentry, &stat); +	err = vfs_getattr(&path, &stat);  	if (err)  		goto out_nfserr;  	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | @@ -2208,7 +2214,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,  	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {  		if ((buflen -= 4) < 0)  			goto out_resource; -		WRITE32(nfsd4_lease); +		WRITE32(nn->nfsd4_lease);  	}  	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {  		if ((buflen -= 4) < 0) @@ -2235,9 +2241,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,  			WRITE32(ace->type);  			WRITE32(ace->flag);  			WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); -			status = nfsd4_encode_aclname(rqstp, ace->whotype, -				ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, -				&p, &buflen); +			status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);  			if (status == nfserr_resource)  				goto out_resource;  			if (status) @@ -2430,18 +2434,8 @@ out_acl:  		 * and this is the root of a cross-mounted filesystem.  		 
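 *
 * (The new get_parent_attributes() helper above is what handles this
 *  case now: it takes a reference on exp->ex_path, walks follow_up()
 *  until the path is no longer a mount root, i.e. past any stacked
 *  mounts, and stats that parent, so the mountpoint reports the
 *  covering filesystem's inode number rather than the mounted root's.)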
*/  		if (ignore_crossmnt == 0 && -		    dentry == exp->ex_path.mnt->mnt_root) { -			struct path path = exp->ex_path; -			path_get(&path); -			while (follow_up(&path)) { -				if (path.dentry != path.mnt->mnt_root) -					break; -			} -			err = vfs_getattr(path.mnt, path.dentry, &stat); -			path_put(&path); -			if (err) -				goto out_nfserr; -		} +		    dentry == exp->ex_path.mnt->mnt_root) +			get_parent_attributes(exp, &stat);  		WRITE64(stat.ino);  	}  	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { @@ -2452,7 +2446,7 @@ out_acl:  	}  	*attrlenp = htonl((char *)p - (char *)attrlenp - 4); -	*countp = p - buffer; +	*buffer = p;  	status = nfs_ok;  out: @@ -2464,7 +2458,6 @@ out_nfserr:  	status = nfserrno(err);  	goto out;  out_resource: -	*countp = 0;  	status = nfserr_resource;  	goto out;  out_serverfault: @@ -2483,7 +2476,7 @@ static inline int attributes_need_mount(u32 *bmval)  static __be32  nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, -		const char *name, int namlen, __be32 *p, int *buflen) +		const char *name, int namlen, __be32 **p, int buflen)  {  	struct svc_export *exp = cd->rd_fhp->fh_export;  	struct dentry *dentry; @@ -2589,10 +2582,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,  	p = xdr_encode_hyper(p, NFS_OFFSET_MAX);    /* offset of next entry */  	p = xdr_encode_array(p, name, namlen);      /* name length & name */ -	nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); +	nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);  	switch (nfserr) {  	case nfs_ok: -		p += buflen;  		break;  	case nfserr_resource:  		nfserr = nfserr_toosmall; @@ -2719,10 +2711,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4  	buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);  	nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, -				    resp->p, &buflen, getattr->ga_bmval, +				    &resp->p, buflen, getattr->ga_bmval,  				    resp->rqstp, 0); -	if (!nfserr) -		resp->p += buflen;  	return nfserr;  } @@ -2927,7 +2917,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,  		  struct nfsd4_read *read)  {  	u32 eof; -	int v, pn; +	int v; +	struct page *page;  	unsigned long maxcount;   	long len;  	__be32 *p; @@ -2946,11 +2937,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,  	len = maxcount;  	v = 0;  	while (len > 0) { -		pn = resp->rqstp->rq_resused++; -		resp->rqstp->rq_vec[v].iov_base = -			page_address(resp->rqstp->rq_respages[pn]); +		page = *(resp->rqstp->rq_next_page); +		if (!page) { /* ran out of pages */ +			maxcount -= len; +			break; +		} +		resp->rqstp->rq_vec[v].iov_base = page_address(page);  		resp->rqstp->rq_vec[v].iov_len =  			len < PAGE_SIZE ? 
len : PAGE_SIZE; +		resp->rqstp->rq_next_page++;  		v++;  		len -= PAGE_SIZE;  	} @@ -2996,8 +2991,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd  		return nfserr;  	if (resp->xbuf->page_len)  		return nfserr_resource; +	if (!*resp->rqstp->rq_next_page) +		return nfserr_resource; -	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); +	page = page_address(*(resp->rqstp->rq_next_page++));  	maxcount = PAGE_SIZE;  	RESERVE_SPACE(4); @@ -3045,6 +3042,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4  		return nfserr;  	if (resp->xbuf->page_len)  		return nfserr_resource; +	if (!*resp->rqstp->rq_next_page) +		return nfserr_resource;  	RESERVE_SPACE(NFS4_VERIFIER_SIZE);  	savep = p; @@ -3071,7 +3070,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4  		goto err_no_verf;  	} -	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); +	page = page_address(*(resp->rqstp->rq_next_page++));  	readdir->common.err = 0;  	readdir->buflen = maxcount;  	readdir->buffer = page; @@ -3094,8 +3093,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4  	p = readdir->buffer;  	*p++ = 0;	/* no more entries */  	*p++ = htonl(readdir->common.err == nfserr_eof); -	resp->xbuf->page_len = ((char*)p) - (char*)page_address( -		resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); +	resp->xbuf->page_len = ((char*)p) - +		(char*)page_address(*(resp->rqstp->rq_next_page-1));  	/* Use rest of head for padding and remaining ops: */  	resp->xbuf->tail[0].iov_base = tailbase; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 2cbac34a55d..62c1ee128ae 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,22 +9,22 @@   */  #include <linux/slab.h> +#include <linux/sunrpc/addr.h> +#include <linux/highmem.h> +#include <net/checksum.h>  #include "nfsd.h"  #include "cache.h" -/* Size of reply cache. Common values are: - * 4.3BSD:	128 - * 4.4BSD:	256 - * Solaris2:	1024 - * DEC Unix:	512-4096 - */ -#define CACHESIZE		1024 +#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE +  #define HASHSIZE		64  static struct hlist_head *	cache_hash;  static struct list_head 	lru_head; -static int			cache_disabled = 1; +static struct kmem_cache	*drc_slab; +static unsigned int		num_drc_entries; +static unsigned int		max_drc_entries;  /*   * Calculate the hash index from an XID. @@ -37,6 +37,14 @@ static inline u32 request_hash(u32 xid)  }  static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); +static void	cache_cleaner_func(struct work_struct *unused); +static int 	nfsd_reply_cache_shrink(struct shrinker *shrink, +					struct shrink_control *sc); + +struct shrinker nfsd_reply_cache_shrinker = { +	.shrink	= nfsd_reply_cache_shrink, +	.seeks	= 1, +};  /*   * locking for the reply cache:   * Otherwise, when accessing _prev or _next, the lock must be held.   */  static DEFINE_SPINLOCK(cache_lock); +static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); -int nfsd_reply_cache_init(void) +/* + * Put a cap on the size of the DRC based on the amount of available + * low memory in the machine. + * + *  64MB:    8192 + * 128MB:   11585 + * 256MB:   16384 + * 512MB:   23170 + *   1GB:   32768 + *   2GB:   46340 + *   4GB:   65536 + *   8GB:   92681 + *  16GB:  131072 + * + * ...with a hard cap of 256k entries. 
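[Worked example of the table above, not from the patch: with 4K pages (PAGE_SHIFT == 12), 1GB of low memory is 262144 pages; int_sqrt(262144) = 512, so limit = (16 * 512) << (12 - 10) = 8192 << 2 = 32768, matching the 1GB row.]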
In the worst case, each entry will be + * ~1k, so the numbers above should give a rough maximum for the amount of + * memory used, in kilobytes. + */ +static unsigned int +nfsd_cache_size_limit(void) +{ +	unsigned int limit; +	unsigned long low_pages = totalram_pages - totalhigh_pages; + +	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); +	return min_t(unsigned int, limit, 256*1024); +} + +static struct svc_cacherep * +nfsd_reply_cache_alloc(void)  {  	struct svc_cacherep	*rp; -	int			i; -	INIT_LIST_HEAD(&lru_head); -	i = CACHESIZE; -	while (i) { -		rp = kmalloc(sizeof(*rp), GFP_KERNEL); -		if (!rp) -			goto out_nomem; -		list_add(&rp->c_lru, &lru_head); +	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); +	if (rp) {  		rp->c_state = RC_UNUSED;  		rp->c_type = RC_NOCACHE; +		INIT_LIST_HEAD(&rp->c_lru);  		INIT_HLIST_NODE(&rp->c_hash); -		i--;  	} +	return rp; +} + +static void +nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +{ +	if (rp->c_type == RC_REPLBUFF) +		kfree(rp->c_replvec.iov_base); +	hlist_del(&rp->c_hash); +	list_del(&rp->c_lru); +	--num_drc_entries; +	kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free(struct svc_cacherep *rp) +{ +	spin_lock(&cache_lock); +	nfsd_reply_cache_free_locked(rp); +	spin_unlock(&cache_lock); +} + +int nfsd_reply_cache_init(void) +{ +	register_shrinker(&nfsd_reply_cache_shrinker); +	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), +					0, 0, NULL); +	if (!drc_slab) +		goto out_nomem; -	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); +	cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);  	if (!cache_hash)  		goto out_nomem; -	cache_disabled = 0; +	INIT_LIST_HEAD(&lru_head); +	max_drc_entries = nfsd_cache_size_limit(); +	num_drc_entries = 0; +  	return 0;  out_nomem:  	printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); @@ -79,27 +143,33 @@ void nfsd_reply_cache_shutdown(void)  {  	struct svc_cacherep	*rp; +	unregister_shrinker(&nfsd_reply_cache_shrinker); +	cancel_delayed_work_sync(&cache_cleaner); +  	while (!list_empty(&lru_head)) {  		rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); -		if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) -			kfree(rp->c_replvec.iov_base); -		list_del(&rp->c_lru); -		kfree(rp); +		nfsd_reply_cache_free_locked(rp);  	} -	cache_disabled = 1; -  	kfree (cache_hash);  	cache_hash = NULL; + +	if (drc_slab) { +		kmem_cache_destroy(drc_slab); +		drc_slab = NULL; +	}  }  /* - * Move cache entry to end of LRU list + * Move cache entry to end of LRU list, and queue the cleaner to run if it's + * not already scheduled.   */  static void  lru_put_end(struct svc_cacherep *rp)  { +	rp->c_timestamp = jiffies;  	list_move_tail(&rp->c_lru, &lru_head); +	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);  }  /* @@ -112,83 +182,214 @@ hash_refile(struct svc_cacherep *rp)  	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));  } +static inline bool +nfsd_cache_entry_expired(struct svc_cacherep *rp) +{ +	return rp->c_state != RC_INPROG && +	       time_after(jiffies, rp->c_timestamp + RC_EXPIRE); +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. 
+ */ +static void +prune_cache_entries(void) +{ +	struct svc_cacherep *rp, *tmp; + +	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { +		if (!nfsd_cache_entry_expired(rp) && +		    num_drc_entries <= max_drc_entries) +			break; +		nfsd_reply_cache_free_locked(rp); +	} + +	/* +	 * Conditionally rearm the job. If we cleaned out the list, then +	 * cancel any pending run (since there won't be any work to do). +	 * Otherwise, we rearm the job or modify the existing one to run in +	 * RC_EXPIRE since we just ran the pruner. +	 */ +	if (list_empty(&lru_head)) +		cancel_delayed_work(&cache_cleaner); +	else +		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); +} + +static void +cache_cleaner_func(struct work_struct *unused) +{ +	spin_lock(&cache_lock); +	prune_cache_entries(); +	spin_unlock(&cache_lock); +} + +static int +nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ +	unsigned int num; + +	spin_lock(&cache_lock); +	if (sc->nr_to_scan) +		prune_cache_entries(); +	num = num_drc_entries; +	spin_unlock(&cache_lock); + +	return num; +} + +/* + * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + */ +static __wsum +nfsd_cache_csum(struct svc_rqst *rqstp) +{ +	int idx; +	unsigned int base; +	__wsum csum; +	struct xdr_buf *buf = &rqstp->rq_arg; +	const unsigned char *p = buf->head[0].iov_base; +	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, +				RC_CSUMLEN); +	size_t len = min(buf->head[0].iov_len, csum_len); + +	/* rq_arg.head first */ +	csum = csum_partial(p, len, 0); +	csum_len -= len; + +	/* Continue into page array */ +	idx = buf->page_base / PAGE_SIZE; +	base = buf->page_base & ~PAGE_MASK; +	while (csum_len) { +		p = page_address(buf->pages[idx]) + base; +		len = min_t(size_t, PAGE_SIZE - base, csum_len); +		csum = csum_partial(p, len, csum); +		csum_len -= len; +		base = 0; +		++idx; +	} +	return csum; +} + +/* + * Search the request hash for an entry that matches the given rqstp. + * Must be called with cache_lock held. Returns the found entry or + * NULL on failure. + */ +static struct svc_cacherep * +nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) +{ +	struct svc_cacherep	*rp; +	struct hlist_head 	*rh; +	__be32			xid = rqstp->rq_xid; +	u32			proto =  rqstp->rq_prot, +				vers = rqstp->rq_vers, +				proc = rqstp->rq_proc; + +	rh = &cache_hash[request_hash(xid)]; +	hlist_for_each_entry(rp, rh, c_hash) { +		if (xid == rp->c_xid && proc == rp->c_proc && +		    proto == rp->c_prot && vers == rp->c_vers && +		    rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum && +		    rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) && +		    rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr)) +			return rp; +	} +	return NULL; +} +  /*   * Try to find an entry matching the current call in the cache. When none - * is found, we grab the oldest unlocked entry off the LRU list. - * Note that no operation within the loop may sleep. + * is found, we try to grab the oldest expired entry off the LRU list. If + * a suitable one isn't there, then drop the cache_lock and allocate a + * new one, then search again in case one got inserted while this thread + * didn't hold the lock.   
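 *
 * (Editor's recap, from the code below: the checksum is computed before
 *  taking cache_lock, since nfsd_cache_csum() may touch several pages;
 *  on a miss the LRU head is recycled when it has expired or the cache
 *  is full, otherwise the entry is allocated outside the lock and the
 *  search is repeated before inserting, freeing the new entry if a
 *  racing thread already inserted a match.)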
*/  int  nfsd_cache_lookup(struct svc_rqst *rqstp)  { -	struct hlist_node	*hn; -	struct hlist_head 	*rh; -	struct svc_cacherep	*rp; +	struct svc_cacherep	*rp, *found;  	__be32			xid = rqstp->rq_xid;  	u32			proto =  rqstp->rq_prot,  				vers = rqstp->rq_vers,  				proc = rqstp->rq_proc; +	__wsum			csum;  	unsigned long		age;  	int type = rqstp->rq_cachetype;  	int rtn;  	rqstp->rq_cacherep = NULL; -	if (cache_disabled || type == RC_NOCACHE) { +	if (type == RC_NOCACHE) {  		nfsdstats.rcnocache++;  		return RC_DOIT;  	} +	csum = nfsd_cache_csum(rqstp); +  	spin_lock(&cache_lock);  	rtn = RC_DOIT; -	rh = &cache_hash[request_hash(xid)]; -	hlist_for_each_entry(rp, hn, rh, c_hash) { -		if (rp->c_state != RC_UNUSED && -		    xid == rp->c_xid && proc == rp->c_proc && -		    proto == rp->c_prot && vers == rp->c_vers && -		    time_before(jiffies, rp->c_timestamp + 120*HZ) && -		    memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) { -			nfsdstats.rchits++; -			goto found_entry; -		} -	} -	nfsdstats.rcmisses++; +	rp = nfsd_cache_search(rqstp, csum); +	if (rp) +		goto found_entry; -	/* This loop shouldn't take more than a few iterations normally */ -	{ -	int	safe = 0; -	list_for_each_entry(rp, &lru_head, c_lru) { -		if (rp->c_state != RC_INPROG) -			break; -		if (safe++ > CACHESIZE) { -			printk("nfsd: loop in repcache LRU list\n"); -			cache_disabled = 1; -			goto out; +	/* Try to use the first entry on the LRU */ +	if (!list_empty(&lru_head)) { +		rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); +		if (nfsd_cache_entry_expired(rp) || +		    num_drc_entries >= max_drc_entries) { +			lru_put_end(rp); +			prune_cache_entries(); +			goto setup_entry;  		}  	} -	} -	/* All entries on the LRU are in-progress. This should not happen */ -	if (&rp->c_lru == &lru_head) { -		static int	complaints; +	/* Drop the lock and allocate a new entry */ +	spin_unlock(&cache_lock); +	rp = nfsd_reply_cache_alloc(); +	if (!rp) { +		dprintk("nfsd: unable to allocate DRC entry!\n"); +		return RC_DOIT; +	} +	spin_lock(&cache_lock); +	++num_drc_entries; -		printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); -		if (++complaints > 5) { -			printk(KERN_WARNING "nfsd: disabling repcache.\n"); -			cache_disabled = 1; -		} -		goto out; +	/* +	 * Must search again just in case someone inserted one +	 * after we dropped the lock above. +	 */ +	found = nfsd_cache_search(rqstp, csum); +	if (found) { +		nfsd_reply_cache_free_locked(rp); +		rp = found; +		goto found_entry;  	} +	/* +	 * We're keeping the one we just allocated. Are we now over the +	 * limit? Prune one off the tip of the LRU in trade for the one we +	 * just allocated if so. 
+	 */ +	if (num_drc_entries >= max_drc_entries) +		nfsd_reply_cache_free_locked(list_first_entry(&lru_head, +						struct svc_cacherep, c_lru)); + +setup_entry: +	nfsdstats.rcmisses++;  	rqstp->rq_cacherep = rp;  	rp->c_state = RC_INPROG;  	rp->c_xid = xid;  	rp->c_proc = proc; -	memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); +	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); +	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));  	rp->c_prot = proto;  	rp->c_vers = vers; -	rp->c_timestamp = jiffies; +	rp->c_len = rqstp->rq_arg.len; +	rp->c_csum = csum;  	hash_refile(rp); +	lru_put_end(rp);  	/* release any buffer */  	if (rp->c_type == RC_REPLBUFF) { @@ -201,9 +402,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)  	return rtn;  found_entry: +	nfsdstats.rchits++;  	/* We found a matching entry which is either in progress or done. */  	age = jiffies - rp->c_timestamp; -	rp->c_timestamp = jiffies;  	lru_put_end(rp);  	rtn = RC_DROPIT; @@ -232,7 +433,7 @@ found_entry:  		break;  	default:  		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); -		rp->c_state = RC_UNUSED; +		nfsd_reply_cache_free_locked(rp);  	}  	goto out; @@ -257,11 +458,11 @@ found_entry:  void  nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)  { -	struct svc_cacherep *rp; +	struct svc_cacherep *rp = rqstp->rq_cacherep;  	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;  	int		len; -	if (!(rp = rqstp->rq_cacherep) || cache_disabled) +	if (!rp)  		return;  	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); @@ -269,7 +470,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)  	/* Don't cache excessive amounts of data and XDR failures */  	if (!statp || len > (256 >> 2)) { -		rp->c_state = RC_UNUSED; +		nfsd_reply_cache_free(rp);  		return;  	} @@ -283,21 +484,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)  		cachv = &rp->c_replvec;  		cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);  		if (!cachv->iov_base) { -			spin_lock(&cache_lock); -			rp->c_state = RC_UNUSED; -			spin_unlock(&cache_lock); +			nfsd_reply_cache_free(rp);  			return;  		}  		cachv->iov_len = len << 2;  		memcpy(cachv->iov_base, statp, len << 2);  		break; +	case RC_NOCACHE: +		nfsd_reply_cache_free(rp); +		return;  	}  	spin_lock(&cache_lock);  	lru_put_end(rp);  	rp->c_secure = rqstp->rq_secure;  	rp->c_type = cachetype;  	rp->c_state = RC_DONE; -	rp->c_timestamp = jiffies;  	spin_unlock(&cache_lock);  	return;  } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index dab350dfc37..13a21c8fca4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -10,7 +10,7 @@  #include <linux/sunrpc/svcsock.h>  #include <linux/lockd/lockd.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/gss_api.h>  #include <linux/sunrpc/gss_krb5_enctypes.h>  #include <linux/sunrpc/rpc_pipe_fs.h> @@ -19,7 +19,7 @@  #include "idmap.h"  #include "nfsd.h"  #include "cache.h" -#include "fault_inject.h" +#include "state.h"  #include "netns.h"  /* @@ -85,7 +85,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {  static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)  { -	ino_t ino =  file->f_path.dentry->d_inode->i_ino; +	ino_t ino =  file_inode(file)->i_ino;  	char *data;  	ssize_t rv; @@ -125,11 +125,11 @@ static const struct file_operations transaction_ops = {  	.llseek		= default_llseek,  }; -static int exports_open(struct 
inode *inode, struct file *file) +static int exports_net_open(struct net *net, struct file *file)  {  	int err;  	struct seq_file *seq; -	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	err = seq_open(file, &nfs_exports_op);  	if (err) @@ -140,8 +140,26 @@ static int exports_open(struct inode *inode, struct file *file)  	return 0;  } -static const struct file_operations exports_operations = { -	.open		= exports_open, +static int exports_proc_open(struct inode *inode, struct file *file) +{ +	return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct file_operations exports_proc_operations = { +	.open		= exports_proc_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +	.owner		= THIS_MODULE, +}; + +static int exports_nfsd_open(struct inode *inode, struct file *file) +{ +	return exports_net_open(inode->i_sb->s_fs_info, file); +} + +static const struct file_operations exports_nfsd_operations = { +	.open		= exports_nfsd_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release, @@ -186,9 +204,6 @@ static struct file_operations supported_enctypes_ops = {  };  #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ -extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); -extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); -  static const struct file_operations pool_stats_operations = {  	.open		= nfsd_pool_stats_open,  	.read		= seq_read, @@ -223,6 +238,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)  	struct sockaddr *sap = (struct sockaddr *)&address;  	size_t salen = sizeof(address);  	char *fo_path; +	struct net *net = file->f_dentry->d_sb->s_fs_info;  	/* sanity check */  	if (size == 0) @@ -235,7 +251,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)  	if (qword_get(&buf, fo_path, size) < 0)  		return -EINVAL; -	if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0) +	if (rpc_pton(net, fo_path, size, sap, salen) == 0)  		return -EINVAL;  	return nlmsvc_unlock_all_by_ip(sap); @@ -320,6 +336,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)  	int len;  	struct auth_domain *dom;  	struct knfsd_fh fh; +	struct net *net = file->f_dentry->d_sb->s_fs_info;  	if (size == 0)  		return -EINVAL; @@ -355,7 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)  	if (!dom)  		return -ENOMEM; -	len = exp_rootfh(&init_net, dom, path, &fh,  maxsize); +	len = exp_rootfh(net, dom, path, &fh,  maxsize);  	auth_domain_put(dom);  	if (len)  		return len; @@ -399,6 +416,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)  {  	char *mesg = buf;  	int rv; +	struct net *net = file->f_dentry->d_sb->s_fs_info; +  	if (size > 0) {  		int newthreads;  		rv = get_int(&mesg, &newthreads); @@ -406,11 +425,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)  			return rv;  		if (newthreads < 0)  			return -EINVAL; -		rv = nfsd_svc(newthreads); +		rv = nfsd_svc(newthreads, net);  		if (rv < 0)  			return rv;  	} else -		rv = nfsd_nrthreads(); +		rv = nfsd_nrthreads(net);  	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);  } @@ -448,9 +467,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)  	int len;  	int npools;  	int *nthreads; +	struct net *net = file->f_dentry->d_sb->s_fs_info;  	mutex_lock(&nfsd_mutex); -	npools = 
nfsd_nrpools(); +	npools = nfsd_nrpools(net);  	if (npools == 0) {  		/*  		 * NFS is shut down.  The admin can start it by @@ -478,12 +498,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)  			if (nthreads[i] < 0)  				goto out_free;  		} -		rv = nfsd_set_nrthreads(i, nthreads); +		rv = nfsd_set_nrthreads(i, nthreads, net);  		if (rv)  			goto out_free;  	} -	rv = nfsd_get_nrthreads(npools, nthreads); +	rv = nfsd_get_nrthreads(npools, nthreads, net);  	if (rv)  		goto out_free; @@ -510,11 +530,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)  	unsigned minor;  	ssize_t tlen = 0;  	char *sep; +	struct net *net = file->f_dentry->d_sb->s_fs_info; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	if (size>0) { -		if (nfsd_serv) +		if (nn->nfsd_serv)  			/* Cannot change versions without updating -			 * nfsd_serv->sv_xdrsize, and reallocing +			 * nn->nfsd_serv->sv_xdrsize, and reallocing  			 * rq_argp and rq_resp  			 */  			return -EBUSY; @@ -532,7 +554,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)  			else  				num = simple_strtol(vers, &minorp, 0);  			if (*minorp == '.') { -				if (num < 4) +				if (num != 4)  					return -EINVAL;  				minor = simple_strtoul(minorp+1, NULL, 0);  				if (minor == 0) @@ -645,11 +667,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)   * Zero-length write.  Return a list of NFSD's current listener   * transports.   */ -static ssize_t __write_ports_names(char *buf) +static ssize_t __write_ports_names(char *buf, struct net *net)  { -	if (nfsd_serv == NULL) +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	if (nn->nfsd_serv == NULL)  		return 0; -	return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); +	return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);  }  /* @@ -657,28 +681,28 @@ static ssize_t __write_ports_names(char *buf)   * a socket of a supported family/protocol, and we use it as an   * nfsd listener.   */ -static ssize_t __write_ports_addfd(char *buf) +static ssize_t __write_ports_addfd(char *buf, struct net *net)  {  	char *mesg = buf;  	int fd, err; -	struct net *net = &init_net; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	err = get_int(&mesg, &fd);  	if (err != 0 || fd < 0)  		return -EINVAL; -	err = nfsd_create_serv(); +	err = nfsd_create_serv(net);  	if (err != 0)  		return err; -	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); +	err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);  	if (err < 0) {  		nfsd_destroy(net);  		return err;  	}  	/* Decrease the count, but don't shut down the service */ -	nfsd_serv->sv_nrthreads--; +	nn->nfsd_serv->sv_nrthreads--;  	return err;  } @@ -686,12 +710,12 @@ static ssize_t __write_ports_addfd(char *buf)   * A transport listener is added by writing its transport name and   * a port number.   
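 *
 * (Usage sketch, an assumption based on the nfsd control file this
 *  handler backs, portlist in mainline:
 *
 *	echo "tcp 2049" > /proc/fs/nfsd/portlist
 *
 *  creates both an IPv4 and an IPv6 listener on that port; the paired
 *  svc_create_xprt() calls below treat -EAFNOSUPPORT from the IPv6
 *  attempt as non-fatal.)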
*/ -static ssize_t __write_ports_addxprt(char *buf) +static ssize_t __write_ports_addxprt(char *buf, struct net *net)  {  	char transport[16];  	struct svc_xprt *xprt;  	int port, err; -	struct net *net = &init_net; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	if (sscanf(buf, "%15s %5u", transport, &port) != 2)  		return -EINVAL; @@ -699,25 +723,25 @@ static ssize_t __write_ports_addxprt(char *buf)  	if (port < 1 || port > USHRT_MAX)  		return -EINVAL; -	err = nfsd_create_serv(); +	err = nfsd_create_serv(net);  	if (err != 0)  		return err; -	err = svc_create_xprt(nfsd_serv, transport, net, +	err = svc_create_xprt(nn->nfsd_serv, transport, net,  				PF_INET, port, SVC_SOCK_ANONYMOUS);  	if (err < 0)  		goto out_err; -	err = svc_create_xprt(nfsd_serv, transport, net, +	err = svc_create_xprt(nn->nfsd_serv, transport, net,  				PF_INET6, port, SVC_SOCK_ANONYMOUS);  	if (err < 0 && err != -EAFNOSUPPORT)  		goto out_close;  	/* Decrease the count, but don't shut down the service */ -	nfsd_serv->sv_nrthreads--; +	nn->nfsd_serv->sv_nrthreads--;  	return 0;  out_close: -	xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port); +	xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);  	if (xprt != NULL) {  		svc_close_xprt(xprt);  		svc_xprt_put(xprt); @@ -727,16 +751,17 @@ out_err:  	return err;  } -static ssize_t __write_ports(struct file *file, char *buf, size_t size) +static ssize_t __write_ports(struct file *file, char *buf, size_t size, +			     struct net *net)  {  	if (size == 0) -		return __write_ports_names(buf); +		return __write_ports_names(buf, net);  	if (isdigit(buf[0])) -		return __write_ports_addfd(buf); +		return __write_ports_addfd(buf, net);  	if (isalpha(buf[0])) -		return __write_ports_addxprt(buf); +		return __write_ports_addxprt(buf, net);  	return -EINVAL;  } @@ -787,9 +812,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)  static ssize_t write_ports(struct file *file, char *buf, size_t size)  {  	ssize_t rv; +	struct net *net = file->f_dentry->d_sb->s_fs_info;  	mutex_lock(&nfsd_mutex); -	rv = __write_ports(file, buf, size); +	rv = __write_ports(file, buf, size, net);  	mutex_unlock(&nfsd_mutex);  	return rv;  } @@ -821,6 +847,9 @@ int nfsd_max_blksize;  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)  {  	char *mesg = buf; +	struct net *net = file->f_dentry->d_sb->s_fs_info; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +  	if (size > 0) {  		int bsize;  		int rv = get_int(&mesg, &bsize); @@ -835,7 +864,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)  			bsize = NFSSVC_MAXBLKSIZE;  		bsize &= ~(1024-1);  		mutex_lock(&nfsd_mutex); -		if (nfsd_serv) { +		if (nn->nfsd_serv) {  			mutex_unlock(&nfsd_mutex);  			return -EBUSY;  		} @@ -848,13 +877,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)  }  #ifdef CONFIG_NFSD_V4 -static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, +				  time_t *time, struct nfsd_net *nn)  {  	char *mesg = buf;  	int rv, i;  	if (size > 0) { -		if (nfsd_serv) +		if (nn->nfsd_serv)  			return -EBUSY;  		rv = get_int(&mesg, &i);  		if (rv) @@ -879,12 +909,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim  	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);  } -static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t 
size, time_t *time) +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, +				time_t *time, struct nfsd_net *nn)  {  	ssize_t rv;  	mutex_lock(&nfsd_mutex); -	rv = __nfsd4_write_time(file, buf, size, time); +	rv = __nfsd4_write_time(file, buf, size, time, nn);  	mutex_unlock(&nfsd_mutex);  	return rv;  } @@ -912,7 +943,9 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_   */  static ssize_t write_leasetime(struct file *file, char *buf, size_t size)  { -	return nfsd4_write_time(file, buf, size, &nfsd4_lease); +	struct net *net = file->f_dentry->d_sb->s_fs_info; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);  }  /** @@ -927,17 +960,20 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)   */  static ssize_t write_gracetime(struct file *file, char *buf, size_t size)  { -	return nfsd4_write_time(file, buf, size, &nfsd4_grace); +	struct net *net = file->f_dentry->d_sb->s_fs_info; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);  } -static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, +				   struct nfsd_net *nn)  {  	char *mesg = buf;  	char *recdir;  	int len, status;  	if (size > 0) { -		if (nfsd_serv) +		if (nn->nfsd_serv)  			return -EBUSY;  		if (size > PATH_MAX || buf[size-1] != '\n')  			return -EINVAL; @@ -981,9 +1017,11 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)  static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)  {  	ssize_t rv; +	struct net *net = file->f_dentry->d_sb->s_fs_info; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	mutex_lock(&nfsd_mutex); -	rv = __write_recoverydir(file, buf, size); +	rv = __write_recoverydir(file, buf, size, nn);  	mutex_unlock(&nfsd_mutex);  	return rv;  } @@ -998,7 +1036,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)  static int nfsd_fill_super(struct super_block * sb, void * data, int silent)  {  	static struct tree_descr nfsd_files[] = { -		[NFSD_List] = {"exports", &exports_operations, S_IRUGO}, +		[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},  		[NFSD_Export_features] = {"export_features",  					&export_features_operations, S_IRUGO},  		[NFSD_FO_UnlockIP] = {"unlock_ip", @@ -1022,20 +1060,35 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)  #endif  		/* last one */ {""}  	}; -	return simple_fill_super(sb, 0x6e667364, nfsd_files); +	struct net *net = data; +	int ret; + +	ret = simple_fill_super(sb, 0x6e667364, nfsd_files); +	if (ret) +		return ret; +	sb->s_fs_info = get_net(net); +	return 0;  }  static struct dentry *nfsd_mount(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *data)  { -	return mount_single(fs_type, flags, data, nfsd_fill_super); +	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); +} + +static void nfsd_umount(struct super_block *sb) +{ +	struct net *net = sb->s_fs_info; + +	kill_litter_super(sb); +	put_net(net);  }  static struct file_system_type nfsd_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "nfsd",  	.mount		= nfsd_mount, -	.kill_sb	= kill_litter_super, +	.kill_sb	= nfsd_umount,  };  #ifdef CONFIG_PROC_FS @@ -1046,7 +1099,8 @@ static int create_proc_exports_entry(void)  	entry = 
proc_mkdir("fs/nfs", NULL);  	if (!entry)  		return -ENOMEM; -	entry = proc_create("exports", 0, entry, &exports_operations); +	entry = proc_create("exports", 0, entry, +				 &exports_proc_operations);  	if (!entry)  		return -ENOMEM;  	return 0; @@ -1063,6 +1117,7 @@ int nfsd_net_id;  static __net_init int nfsd_init_net(struct net *net)  {  	int retval; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	retval = nfsd_export_init(net);  	if (retval) @@ -1070,6 +1125,8 @@ static __net_init int nfsd_init_net(struct net *net)  	retval = nfsd_idmap_init(net);  	if (retval)  		goto out_idmap_error; +	nn->nfsd4_lease = 90;	/* default lease time */ +	nn->nfsd4_grace = 90;  	return 0;  out_idmap_error: diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 80d5ce40aad..07a473fd49b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -55,36 +55,26 @@ extern struct svc_version	nfsd_version2, nfsd_version3,  				nfsd_version4;  extern u32			nfsd_supported_minorversion;  extern struct mutex		nfsd_mutex; -extern struct svc_serv		*nfsd_serv;  extern spinlock_t		nfsd_drc_lock; -extern unsigned int		nfsd_drc_max_mem; -extern unsigned int		nfsd_drc_mem_used; +extern unsigned long		nfsd_drc_max_mem; +extern unsigned long		nfsd_drc_mem_used;  extern const struct seq_operations nfs_exports_op;  /*   * Function prototypes.   */ -int		nfsd_svc(int nrservs); +int		nfsd_svc(int nrservs, struct net *net);  int		nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); -int		nfsd_nrthreads(void); -int		nfsd_nrpools(void); -int		nfsd_get_nrthreads(int n, int *); -int		nfsd_set_nrthreads(int n, int *); +int		nfsd_nrthreads(struct net *); +int		nfsd_nrpools(struct net *); +int		nfsd_get_nrthreads(int n, int *, struct net *); +int		nfsd_set_nrthreads(int n, int *, struct net *);  int		nfsd_pool_stats_open(struct inode *, struct file *);  int		nfsd_pool_stats_release(struct inode *, struct file *); -static inline void nfsd_destroy(struct net *net) -{ -	int destroy = (nfsd_serv->sv_nrthreads == 1); - -	if (destroy) -		svc_shutdown_net(nfsd_serv, net); -	svc_destroy(nfsd_serv); -	if (destroy) -		nfsd_serv = NULL; -} +void		nfsd_destroy(struct net *net);  #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)  #ifdef CONFIG_NFSD_V2_ACL @@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };  int nfsd_vers(int vers, enum vers_op change);  int nfsd_minorversion(u32 minorversion, enum vers_op change);  void nfsd_reset_versions(void); -int nfsd_create_serv(void); +int nfsd_create_serv(struct net *net);  extern int nfsd_max_blksize; @@ -116,12 +106,14 @@ static inline int nfsd_v4client(struct svc_rqst *rq)   * NFSv4 State   */  #ifdef CONFIG_NFSD_V4 -extern unsigned int max_delegations; +extern unsigned long max_delegations;  void nfs4_state_init(void);  int nfsd4_init_slabs(void);  void nfsd4_free_slabs(void);  int nfs4_state_start(void); +int nfs4_state_start_net(struct net *net);  void nfs4_state_shutdown(void); +void nfs4_state_shutdown_net(struct net *net);  void nfs4_reset_lease(time_t leasetime);  int nfs4_reset_recoverydir(char *recdir);  char * nfs4_recoverydir(void); @@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }  static inline int nfsd4_init_slabs(void) { return 0; }  static inline void nfsd4_free_slabs(void) { }  static inline int nfs4_state_start(void) { return 0; } +static inline int nfs4_state_start_net(struct net *net) { return 0; }  static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_state_shutdown_net(struct net *net) { }  static inline void 
nfs4_reset_lease(time_t leasetime) { }  static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }  static inline char * nfs4_recoverydir(void) {return NULL; } @@ -265,16 +259,8 @@ void		nfsd_lockd_shutdown(void);  /* Check for dir entries '.' and '..' */  #define isdotent(n, l)	(l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) -/* - * Time of server startup - */ -extern struct timeval	nfssvc_boot; -  #ifdef CONFIG_NFSD_V4 -extern time_t nfsd4_lease; -extern time_t nfsd4_grace; -  /* before processing a COMPOUND operation, we have to check that there   * is enough space in the buffer for XDR encode to succeed.  otherwise,   * we might process an operation with side effects, and be unable to diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 032af381b3a..814afaa4458 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,  		if (inode)  			_fh_update(fhp, exp, dentry); -		if (fhp->fh_handle.fh_fileid_type == 255) { +		if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {  			fh_put(fhp);  			return nfserr_opnotsupp;  		} @@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)  			goto out;  		_fh_update(fhp, fhp->fh_export, dentry); -		if (fhp->fh_handle.fh_fileid_type == 255) +		if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)  			return nfserr_opnotsupp;  	}  out: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index aad6d457b9e..54c6b3d3cc7 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -26,17 +26,13 @@ static __be32  nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)  {  	if (err) return err; -	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, -				    resp->fh.fh_dentry, -				    &resp->stat)); +	return fh_getattr(&resp->fh, &resp->stat);  }  static __be32  nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)  {  	if (err) return err; -	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, -				    resp->fh.fh_dentry, -				    &resp->stat)); +	return fh_getattr(&resp->fh, &resp->stat);  }  /*   * Get a file's attributes @@ -150,9 +146,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,  				  &resp->count);  	if (nfserr) return nfserr; -	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, -				    resp->fh.fh_dentry, -				    &resp->stat)); +	return fh_getattr(&resp->fh, &resp->stat);  }  /* diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2013aa001da..262df5ccbf5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -11,7 +11,6 @@  #include <linux/module.h>  #include <linux/fs_struct.h>  #include <linux/swap.h> -#include <linux/nsproxy.h>  #include <linux/sunrpc/stats.h>  #include <linux/sunrpc/svcsock.h> @@ -22,19 +21,19 @@  #include "nfsd.h"  #include "cache.h"  #include "vfs.h" +#include "netns.h"  #define NFSDDBG_FACILITY	NFSDDBG_SVC  extern struct svc_program	nfsd_program;  static int			nfsd(void *vrqstp); -struct timeval			nfssvc_boot;  /* - * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members   * of the svc_serv struct. In particular, ->sv_nrthreads but also to some   * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt   * - * If (outside the lock) nfsd_serv is non-NULL, then it must point to a + * If (outside the lock) nn->nfsd_serv is non-NULL, then it must point to a   * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. 
That number  of nfsd threads must exist and each must be listed in ->sp_all_threads in each  * entry of ->sv_pools[]. @@ -52,7 +51,6 @@ struct timeval			nfssvc_boot;   *	nfsd_versions   */  DEFINE_MUTEX(nfsd_mutex); -struct svc_serv 		*nfsd_serv;  /*   * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. @@ -61,8 +59,8 @@ struct svc_serv 		*nfsd_serv;   * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.   */  spinlock_t	nfsd_drc_lock; -unsigned int	nfsd_drc_max_mem; -unsigned int	nfsd_drc_mem_used; +unsigned long	nfsd_drc_max_mem; +unsigned long	nfsd_drc_mem_used;  #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)  static struct svc_stat	nfsd_acl_svcstats; @@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)   */  #define	NFSD_MAXSERVS		8192 -int nfsd_nrthreads(void) +int nfsd_nrthreads(struct net *net)  {  	int rv = 0; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +  	mutex_lock(&nfsd_mutex); -	if (nfsd_serv) -		rv = nfsd_serv->sv_nrthreads; +	if (nn->nfsd_serv) +		rv = nn->nfsd_serv->sv_nrthreads;  	mutex_unlock(&nfsd_mutex);  	return rv;  } -static int nfsd_init_socks(void) +static int nfsd_init_socks(struct net *net)  {  	int error; -	if (!list_empty(&nfsd_serv->sv_permsocks)) +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	if (!list_empty(&nn->nfsd_serv->sv_permsocks))  		return 0; -	error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT, +	error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,  					SVC_SOCK_DEFAULTS);  	if (error < 0)  		return error; -	error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT, +	error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,  					SVC_SOCK_DEFAULTS);  	if (error < 0)  		return error; @@ -202,14 +204,15 @@ static int nfsd_init_socks(void)  	return 0;  } -static bool nfsd_up = false; +static int nfsd_users = 0; -static int nfsd_startup(int nrservs) +static int nfsd_startup_generic(int nrservs)  {  	int ret; -	if (nfsd_up) +	if (nfsd_users++)  		return 0; +  	/*  	 * Readahead param cache - will no-op if it already exists.  	 
* (Note therefore results will be suboptimal if number of @@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)  	ret = nfsd_racache_init(2*nrservs);  	if (ret)  		return ret; -	ret = nfsd_init_socks(); +	ret = nfs4_state_start();  	if (ret)  		goto out_racache; -	ret = lockd_up(&init_net); +	return 0; + +out_racache: +	nfsd_racache_shutdown(); +	return ret; +} + +static void nfsd_shutdown_generic(void) +{ +	if (--nfsd_users) +		return; + +	nfs4_state_shutdown(); +	nfsd_racache_shutdown(); +} + +static int nfsd_startup_net(int nrservs, struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	int ret; + +	if (nn->nfsd_net_up) +		return 0; + +	ret = nfsd_startup_generic(nrservs);  	if (ret) -		goto out_racache; -	ret = nfs4_state_start(); +		return ret; +	ret = nfsd_init_socks(net); +	if (ret) +		goto out_socks; +	ret = lockd_up(net); +	if (ret) +		goto out_socks; +	ret = nfs4_state_start_net(net);  	if (ret)  		goto out_lockd; -	nfsd_up = true; + +	nn->nfsd_net_up = true;  	return 0; +  out_lockd: -	lockd_down(&init_net); -out_racache: -	nfsd_racache_shutdown(); +	lockd_down(net); +out_socks: +	nfsd_shutdown_generic();  	return ret;  } -static void nfsd_shutdown(void) +static void nfsd_shutdown_net(struct net *net)  { +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	nfs4_state_shutdown_net(net); +	lockd_down(net); +	nn->nfsd_net_up = false; +	nfsd_shutdown_generic(); +} + +static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +  	/*  	 * write_ports can create the server without actually starting  	 * any threads--if we get shut down before any threads are  	 * started, then nfsd_last_thread will be run before any of this  	 * other initialization has been done.  	 
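 *
 * (Editor's note on ordering: nfsd_shutdown_net() above unwinds in the
 *  reverse of nfsd_startup_net(), NFSv4 state first, then lockd, then
 *  the refcounted generic pieces, with nn->nfsd_net_up marking the
 *  whole per-net bundle as one unit.)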
*/ -	if (!nfsd_up) +	if (!nn->nfsd_net_up)  		return; -	nfs4_state_shutdown(); -	lockd_down(&init_net); -	nfsd_racache_shutdown(); -	nfsd_up = false; -} - -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) -{ -	nfsd_shutdown(); +	nfsd_shutdown_net(net);  	svc_rpcb_cleanup(serv, net); @@ -303,7 +342,7 @@ static void set_max_drc(void)  					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;  	nfsd_drc_mem_used = 0;  	spin_lock_init(&nfsd_drc_lock); -	dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); +	dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);  }  static int nfsd_get_default_max_blksize(void) @@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)  	return ret;  } -int nfsd_create_serv(void) +int nfsd_create_serv(struct net *net)  {  	int error; -	struct net *net = current->nsproxy->net_ns; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	WARN_ON(!mutex_is_locked(&nfsd_mutex)); -	if (nfsd_serv) { -		svc_get(nfsd_serv); +	if (nn->nfsd_serv) { +		svc_get(nn->nfsd_serv);  		return 0;  	}  	if (nfsd_max_blksize == 0)  		nfsd_max_blksize = nfsd_get_default_max_blksize();  	nfsd_reset_versions(); -	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, +	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,  				      nfsd_last_thread, nfsd, THIS_MODULE); -	if (nfsd_serv == NULL) +	if (nn->nfsd_serv == NULL)  		return -ENOMEM; -	error = svc_bind(nfsd_serv, net); +	error = svc_bind(nn->nfsd_serv, net);  	if (error < 0) { -		svc_destroy(nfsd_serv); +		svc_destroy(nn->nfsd_serv);  		return error;  	}  	set_max_drc(); -	do_gettimeofday(&nfssvc_boot);		/* record boot time */ +	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */  	return 0;  } -int nfsd_nrpools(void) +int nfsd_nrpools(struct net *net)  { -	if (nfsd_serv == NULL) +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); + +	if (nn->nfsd_serv == NULL)  		return 0;  	else -		return nfsd_serv->sv_nrpools; +		return nn->nfsd_serv->sv_nrpools;  } -int nfsd_get_nrthreads(int n, int *nthreads) +int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)  {  	int i = 0; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); -	if (nfsd_serv != NULL) { -		for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) -			nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; +	if (nn->nfsd_serv != NULL) { +		for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) +			nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;  	}  	return 0;  } -int nfsd_set_nrthreads(int n, int *nthreads) +void nfsd_destroy(struct net *net) +{ +	struct nfsd_net *nn = net_generic(net, nfsd_net_id); +	int destroy = (nn->nfsd_serv->sv_nrthreads == 1); + +	if (destroy) +		svc_shutdown_net(nn->nfsd_serv, net); +	svc_destroy(nn->nfsd_serv); +	if (destroy) +		nn->nfsd_serv = NULL; +} + +int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)  {  	int i = 0;  	int tot = 0;  	int err = 0; -	struct net *net = &init_net; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	WARN_ON(!mutex_is_locked(&nfsd_mutex)); -	if (nfsd_serv == NULL || n <= 0) +	if (nn->nfsd_serv == NULL || n <= 0)  		return 0; -	if (n > nfsd_serv->sv_nrpools) -		n = nfsd_serv->sv_nrpools; +	if (n > nn->nfsd_serv->sv_nrpools) +		n = nn->nfsd_serv->sv_nrpools;  	/* enforce a global maximum number of threads */  	tot = 0; @@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)  		nthreads[0] = 1;  	/* apply the new numbers */ -	svc_get(nfsd_serv); +	svc_get(nn->nfsd_serv);  	for (i = 0; i < n; i++) { -		err = 
svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], +		err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],  				    	  nthreads[i]);  		if (err)  			break; @@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)   * this is the first time nrservs is nonzero.   */  int -nfsd_svc(int nrservs) +nfsd_svc(int nrservs, struct net *net)  {  	int	error;  	bool	nfsd_up_before; -	struct net *net = &init_net; +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);  	mutex_lock(&nfsd_mutex);  	dprintk("nfsd: creating service\n"); @@ -449,29 +503,29 @@ nfsd_svc(int nrservs)  	if (nrservs > NFSD_MAXSERVS)  		nrservs = NFSD_MAXSERVS;  	error = 0; -	if (nrservs == 0 && nfsd_serv == NULL) +	if (nrservs == 0 && nn->nfsd_serv == NULL)  		goto out; -	error = nfsd_create_serv(); +	error = nfsd_create_serv(net);  	if (error)  		goto out; -	nfsd_up_before = nfsd_up; +	nfsd_up_before = nn->nfsd_net_up; -	error = nfsd_startup(nrservs); +	error = nfsd_startup_net(nrservs, net);  	if (error)  		goto out_destroy; -	error = svc_set_num_threads(nfsd_serv, NULL, nrservs); +	error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);  	if (error)  		goto out_shutdown; -	/* We are holding a reference to nfsd_serv which +	/* We are holding a reference to nn->nfsd_serv which  	 * we don't want to count in the return value,  	 * so subtract 1  	 */ -	error = nfsd_serv->sv_nrthreads - 1; +	error = nn->nfsd_serv->sv_nrthreads - 1;  out_shutdown:  	if (error < 0 && !nfsd_up_before) -		nfsd_shutdown(); +		nfsd_shutdown_net(net);  out_destroy:  	nfsd_destroy(net);		/* Release server */  out: @@ -487,6 +541,8 @@ static int  nfsd(void *vrqstp)  {  	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; +	struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); +	struct net *net = perm_sock->xpt_net;  	int err;  	/* Lock module and set up kernel thread */ @@ -551,7 +607,7 @@ out:  	/* Release the thread */  	svc_exit_thread(rqstp); -	nfsd_destroy(&init_net); +	nfsd_destroy(net);  	/* Release module */  	mutex_unlock(&nfsd_mutex); @@ -596,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)  	/* Check whether we have this call in the cache. */  	switch (nfsd_cache_lookup(rqstp)) { -	case RC_INTR:  	case RC_DROPIT:  		return 0;  	case RC_REPLY: @@ -640,21 +695,23 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)  	}  	/* Store reply in cache. 
*/ -	nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); +	nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);  	return 1;  }  int nfsd_pool_stats_open(struct inode *inode, struct file *file)  {  	int ret; +	struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); +  	mutex_lock(&nfsd_mutex); -	if (nfsd_serv == NULL) { +	if (nn->nfsd_serv == NULL) {  		mutex_unlock(&nfsd_mutex);  		return -ENODEV;  	}  	/* bump up the psudo refcount while traversing */ -	svc_get(nfsd_serv); -	ret = svc_pool_stats_open(nfsd_serv, file); +	svc_get(nn->nfsd_serv); +	ret = svc_pool_stats_open(nn->nfsd_serv, file);  	mutex_unlock(&nfsd_mutex);  	return ret;  } @@ -662,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)  int nfsd_pool_stats_release(struct inode *inode, struct file *file)  {  	int ret = seq_release(inode, file); -	struct net *net = &init_net; +	struct net *net = inode->i_sb->s_fs_info;  	mutex_lock(&nfsd_mutex);  	/* this function really, really should have been called svc_put() */ diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 65ec595e222..9c769a47ac5 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -4,6 +4,7 @@   * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>   */ +#include "vfs.h"  #include "xdr.h"  #include "auth.h" @@ -100,12 +101,14 @@ decode_sattr(__be32 *p, struct iattr *iap)  		iap->ia_mode = tmp;  	}  	if ((tmp = ntohl(*p++)) != (u32)-1) { -		iap->ia_valid |= ATTR_UID; -		iap->ia_uid = tmp; +		iap->ia_uid = make_kuid(&init_user_ns, tmp); +		if (uid_valid(iap->ia_uid)) +			iap->ia_valid |= ATTR_UID;  	}  	if ((tmp = ntohl(*p++)) != (u32)-1) { -		iap->ia_valid |= ATTR_GID; -		iap->ia_gid = tmp; +		iap->ia_gid = make_kgid(&init_user_ns, tmp); +		if (gid_valid(iap->ia_gid)) +			iap->ia_valid |= ATTR_GID;  	}  	if ((tmp = ntohl(*p++)) != (u32)-1) {  		iap->ia_valid |= ATTR_SIZE; @@ -151,8 +154,8 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,  	*p++ = htonl(nfs_ftypes[type >> 12]);  	*p++ = htonl((u32) stat->mode);  	*p++ = htonl((u32) stat->nlink); -	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); -	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); +	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); +	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));  	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {  		*p++ = htonl(NFS_MAXPATHLEN); @@ -194,11 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,  }  /* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)  { -	struct kstat stat; -	vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat); -	return encode_fattr(rqstp, p, fhp, &stat); +	return encode_fattr(rqstp, p, fhp, stat);  }  /* @@ -246,7 +247,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,  					struct nfsd_readargs *args)  {  	unsigned int len; -	int v,pn; +	int v;  	if (!(p = decode_fh(p, &args->fh)))  		return 0; @@ -262,8 +263,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,  	 */  	v=0;  	while (len > 0) { -		pn = rqstp->rq_resused++; -		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); +		struct page *p = *(rqstp->rq_next_page++); + +		rqstp->rq_vec[v].iov_base = page_address(p);  		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;  		len -= rqstp->rq_vec[v].iov_len;  		v++; @@ -355,7 +357,7 @@ 
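/*
 * [Editor's sketch -- not part of the patch.]  The decode_sattr() hunk
 * above is the standard user-namespace conversion: map the on-the-wire
 * id to a kuid_t/kgid_t and accept the attribute only when the mapping
 * is valid.  Assuming &init_user_ns, as nfsd does:
 */
#include <linux/fs.h>
#include <linux/uidgid.h>

static void example_decode_wire_uid(struct iattr *iap, u32 wire_uid)
{
	iap->ia_uid = make_kuid(&init_user_ns, wire_uid);
	if (uid_valid(iap->ia_uid))	/* unmappable ids are dropped */
		iap->ia_valid |= ATTR_UID;
}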
nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli  {  	if (!(p = decode_fh(p, &args->fh)))  		return 0; -	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); +	args->buffer = page_address(*(rqstp->rq_next_page++));  	return xdr_argsize_check(rqstp, p);  } @@ -396,7 +398,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,  	if (args->count > PAGE_SIZE)  		args->count = PAGE_SIZE; -	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); +	args->buffer = page_address(*(rqstp->rq_next_page++));  	return xdr_argsize_check(rqstp, p);  } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e036894bce5..1a8c7391f7a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {  	u32		rdma_attrs;  }; +struct nfsd4_cb_sec { +	u32	flavor; /* (u32)(-1) used to mean "no valid flavor" */ +	kuid_t	uid; +	kgid_t	gid; +}; +  struct nfsd4_create_session {  	clientid_t			clientid;  	struct nfs4_sessionid		sessionid; @@ -158,8 +164,12 @@ struct nfsd4_create_session {  	struct nfsd4_channel_attrs	fore_channel;  	struct nfsd4_channel_attrs	back_channel;  	u32				callback_prog; -	u32				uid; -	u32				gid; +	struct nfsd4_cb_sec		cb_sec; +}; + +struct nfsd4_backchannel_ctl { +	u32	bc_cb_program; +	struct nfsd4_cb_sec		bc_cb_sec;  };  struct nfsd4_bind_conn_to_session { @@ -192,6 +202,7 @@ struct nfsd4_session {  	struct nfs4_sessionid	se_sessionid;  	struct nfsd4_channel_attrs se_fchannel;  	struct nfsd4_channel_attrs se_bchannel; +	struct nfsd4_cb_sec	se_cb_sec;  	struct list_head	se_conns;  	u32			se_cb_prog;  	u32			se_cb_seq_nr; @@ -221,13 +232,12 @@ struct nfsd4_sessionid {   */  struct nfs4_client {  	struct list_head	cl_idhash; 	/* hash by cl_clientid.id */ -	struct list_head	cl_strhash; 	/* hash by cl_name */ +	struct rb_node		cl_namenode;	/* link into by-name trees */  	struct list_head	cl_openowners;  	struct idr		cl_stateids;	/* stateid lookup */  	struct list_head	cl_delegations;  	struct list_head        cl_lru;         /* tail queue */  	struct xdr_netobj	cl_name; 	/* id generated by client */ -	char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */  	nfs4_verifier		cl_verifier; 	/* generated by client */  	time_t                  cl_time;        /* time of last lease renewal */  	struct sockaddr_storage	cl_addr; 	/* client ipaddress */ @@ -242,9 +252,11 @@ struct nfs4_client {  #define NFSD4_CLIENT_CB_KILL		(1)  #define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */  #define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */ +#define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */  #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \  					 1 << NFSD4_CLIENT_CB_KILL)  	unsigned long		cl_flags; +	struct rpc_cred		*cl_cb_cred;  	struct rpc_clnt		*cl_cb_client;  	u32			cl_cb_ident;  #define NFSD4_CB_UP		0 @@ -271,6 +283,7 @@ struct nfs4_client {  	unsigned long		cl_cb_slot_busy;  	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */  						/* wait here for slots */ +	struct net		*net;  };  static inline void @@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)   */  struct nfs4_client_reclaim {  	struct list_head	cr_strhash;	/* hash by cr_name */ +	struct nfs4_client	*cr_clp;	/* pointer to associated clp */  	char			cr_recdir[HEXDIR_LEN]; /* recover dir */  }; @@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,  		stateid_t *stateid, int flags, struct file **filp);  extern void 
nfs4_lock_state(void);  extern void nfs4_unlock_state(void); -extern int nfs4_in_grace(void); -extern void nfs4_release_reclaim(void); -extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); +void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); +extern void nfs4_release_reclaim(struct nfsd_net *); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, +							struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);  extern void nfs4_free_openowner(struct nfs4_openowner *);  extern void nfs4_free_lockowner(struct nfs4_lockowner *);  extern int set_callback_cred(void); +extern void nfsd4_init_callback(struct nfsd4_callback *);  extern void nfsd4_probe_callback(struct nfs4_client *clp);  extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);  extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); -extern void nfsd4_do_callback_rpc(struct work_struct *);  extern void nfsd4_cb_recall(struct nfs4_delegation *dp);  extern int nfsd4_create_callback_queue(void);  extern void nfsd4_destroy_callback_queue(void);  extern void nfsd4_shutdown_callback(struct nfs4_client *);  extern void nfs4_put_delegation(struct nfs4_delegation *dp); -extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern int nfs4_client_to_reclaim(const char *name); -extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, +							struct nfsd_net *nn); +extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);  extern void release_session_client(struct nfsd4_session *);  extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); @@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);  extern void nfsd4_client_record_create(struct nfs4_client *clp);  extern void nfsd4_client_record_remove(struct nfs4_client *clp);  extern int nfsd4_client_record_check(struct nfs4_client *clp); -extern void nfsd4_record_grace_done(struct net *net, time_t boot_time); +extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); + +/* nfs fault injection functions */ +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); +u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); +struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t); + +u64 nfsd_forget_client(struct nfs4_client *, u64); +u64 nfsd_forget_client_locks(struct nfs4_client*, u64); +u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); +u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); +u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); + +u64 nfsd_print_client(struct nfs4_client *, u64); +u64 nfsd_print_client_locks(struct nfs4_client *, u64); +u64 nfsd_print_client_openowners(struct nfs4_client *, u64); +u64 nfsd_print_client_delegations(struct nfs4_client *, u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ +  #endif   /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c120b48ec30..2a7eb536de0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -401,8 +401,8 @@ 
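/*
 * [Editor's sketch of the idiom in the nfsd_setattr() hunk just below.]
 * With i_uid/ia_uid now kuid_t, direct == comparisons no longer
 * type-check; uid_eq()/gid_eq() are the replacements:
 */
#include <linux/fs.h>
#include <linux/uidgid.h>

static bool example_chown_changes_owner(const struct iattr *iap,
					const struct inode *inode)
{
	return ((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) ||
	       ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid));
}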
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,  	/* Revoke setuid/setgid on chown */  	if (!S_ISDIR(inode->i_mode) && -	    (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || -	     ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { +	    (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || +	     ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {  		iap->ia_valid |= ATTR_KILL_PRIV;  		if (iap->ia_valid & ATTR_MODE) {  			/* we're setting mode too, just clear the s*id bits */ @@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,  		  struct splice_desc *sd)  {  	struct svc_rqst *rqstp = sd->u.data; -	struct page **pp = rqstp->rq_respages + rqstp->rq_resused; +	struct page **pp = rqstp->rq_next_page;  	struct page *page = buf->page;  	size_t size; @@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,  	if (rqstp->rq_res.page_len == 0) {  		get_page(page); -		put_page(*pp); -		*pp = page; -		rqstp->rq_resused++; +		put_page(*rqstp->rq_next_page); +		*(rqstp->rq_next_page++) = page;  		rqstp->rq_res.page_base = buf->offset;  		rqstp->rq_res.page_len = size;  	} else if (page != pp[-1]) {  		get_page(page); -		if (*pp) -			put_page(*pp); -		*pp = page; -		rqstp->rq_resused++; +		if (*rqstp->rq_next_page) +			put_page(*rqstp->rq_next_page); +		*(rqstp->rq_next_page++) = page;  		rqstp->rq_res.page_len += size;  	} else  		rqstp->rq_res.page_len += size; @@ -936,7 +934,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,  			.u.data		= rqstp,  		}; -		rqstp->rq_resused = 1; +		rqstp->rq_next_page = rqstp->rq_respages + 1;  		host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);  	} else {  		oldfs = get_fs(); @@ -981,7 +979,7 @@ static void kill_suid(struct dentry *dentry)   */  static int wait_for_concurrent_writes(struct file *file)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	static ino_t last_ino;  	static dev_t last_dev;  	int err = 0; @@ -1020,28 +1018,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,  	inode = dentry->d_inode;  	exp   = fhp->fh_export; -	/* -	 * Request sync writes if -	 *  -	the sync export option has been set, or -	 *  -	the client requested O_SYNC behavior (NFSv3 feature). -	 *  -   The file system doesn't support fsync(). -	 * When NFSv2 gathered writes have been configured for this volume, -	 * flushing the data to disk is handled separately below. -	 */  	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); -	if (!file->f_op->fsync) {/* COMMIT3 cannot work */ -	       stable = 2; -	       *stablep = 2; /* FILE_SYNC */ -	} -  	if (!EX_ISSYNC(exp))  		stable = 0; -	if (stable && !use_wgather) { -		spin_lock(&file->f_lock); -		file->f_flags |= O_SYNC; -		spin_unlock(&file->f_lock); -	}  	/* Write the data. 
*/  	oldfs = get_fs(); set_fs(KERNEL_DS); @@ -1057,8 +1037,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,  	if (inode->i_mode & (S_ISUID | S_ISGID))  		kill_suid(dentry); -	if (stable && use_wgather) -		host_err = wait_for_concurrent_writes(file); +	if (stable) { +		if (use_wgather) +			host_err = wait_for_concurrent_writes(file); +		else +			host_err = vfs_fsync_range(file, offset, offset+*cnt, 0); +	}  out_nfserr:  	dprintk("nfsd: write complete host_err=%d\n", host_err); @@ -1086,7 +1070,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,  	if (err)  		return err; -	inode = file->f_path.dentry->d_inode; +	inode = file_inode(file);  	/* Get readahead parameters */  	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); @@ -1221,7 +1205,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,  	 * send along the gid on create when it tries to implement  	 * setgid directories via NFS:  	 */ -	if (current_fsuid() != 0) +	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))  		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);  	if (iap->ia_valid)  		return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); @@ -1485,13 +1469,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  		case NFS3_CREATE_EXCLUSIVE:  			if (   dchild->d_inode->i_mtime.tv_sec == v_mtime  			    && dchild->d_inode->i_atime.tv_sec == v_atime -			    && dchild->d_inode->i_size  == 0 ) +			    && dchild->d_inode->i_size  == 0 ) { +				if (created) +					*created = 1;  				break; +			}  		case NFS4_CREATE_EXCLUSIVE4_1:  			if (   dchild->d_inode->i_mtime.tv_sec == v_mtime  			    && dchild->d_inode->i_atime.tv_sec == v_atime -			    && dchild->d_inode->i_size  == 0 ) +			    && dchild->d_inode->i_size  == 0 ) { +				if (created) +					*created = 1;  				goto set_attr; +			}  			 /* fallthru */  		case NFS3_CREATE_GUARDED:  			err = nfserr_exist; @@ -1967,7 +1957,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,  	offset = *offsetp;  	while (1) { -		struct inode *dir_inode = file->f_path.dentry->d_inode; +		struct inode *dir_inode = file_inode(file);  		unsigned int reclen;  		cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -2160,7 +2150,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,  	 * with NFSv3.  	 
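/*
 * [Editor's sketch of the stable-write change in nfsd_vfs_write() above,
 * not part of the patch.]  Rather than forcing O_SYNC on the struct
 * file, the write path now syncs exactly the byte range it wrote:
 */
#include <linux/fs.h>

static int example_commit_stable_write(struct file *file, loff_t offset,
				       unsigned long cnt)
{
	/* datasync = 0: flush both data and metadata for the range */
	return vfs_fsync_range(file, offset, offset + cnt, 0);
}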
*/  	if ((acc & NFSD_MAY_OWNER_OVERRIDE) && -	    inode->i_uid == current_fsuid()) +	    uid_eq(inode->i_uid, current_fsuid()))  		return 0;  	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 359594c393d..5b5894159f2 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -6,6 +6,7 @@  #define LINUX_NFSD_VFS_H  #include "nfsfh.h" +#include "nfsd.h"  /*   * Flags for nfsd_permission @@ -125,4 +126,11 @@ static inline void fh_drop_write(struct svc_fh *fh)  	}  } +static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +{ +	struct path p = {.mnt = fh->fh_export->ex_path.mnt, +			 .dentry = fh->fh_dentry}; +	return nfserrno(vfs_getattr(&p, stat)); +} +  #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 53b1863dd8f..4f0481d6380 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -167,7 +167,7 @@ int nfssvc_encode_entry(void *, const char *name,  int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);  /* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);  __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);  #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 7df980eb056..b6d5542a4ac 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -136,6 +136,7 @@ struct nfsd3_accessres {  	__be32			status;  	struct svc_fh		fh;  	__u32			access; +	struct kstat		stat;  };  struct nfsd3_readlinkres { @@ -225,6 +226,7 @@ struct nfsd3_getaclres {  	int			mask;  	struct posix_acl	*acl_access;  	struct posix_acl	*acl_default; +	struct kstat		stat;  };  /* dummy type for release */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index acd127d4ee8..546f8983ecf 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -385,7 +385,8 @@ struct nfsd4_write {  	u64		wr_offset;          /* request */  	u32		wr_stable_how;      /* request */  	u32		wr_buflen;          /* request */ -	int		wr_vlen; +	struct kvec	wr_head; +	struct page **	wr_pagelist;        /* request */  	u32		wr_bytes_written;   /* response */  	u32		wr_how_written;     /* response */ @@ -462,6 +463,7 @@ struct nfsd4_op {  		/* NFSv4.1 */  		struct nfsd4_exchange_id	exchange_id; +		struct nfsd4_backchannel_ctl	backchannel_ctl;  		struct nfsd4_bind_conn_to_session bind_conn_to_session;  		struct nfsd4_create_session	create_session;  		struct nfsd4_destroy_session	destroy_session; @@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)  		|| nfsd4_is_solo_sequence(resp);  } +static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ +	struct nfsd4_compoundres *resp = rqstp->rq_resp; +	struct nfsd4_compoundargs *argp = rqstp->rq_argp; + +	return argp->opcnt == resp->opcnt; +} +  #define NFS4_SVC_XDRSIZE		sizeof(struct nfsd4_compoundargs)  static inline void @@ -553,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);  void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);  void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);  __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, -		       struct dentry *dentry, __be32 *buffer, int *countp, +		       struct dentry *dentry, __be32 **buffer, int countp,  		       u32 *bmval, struct svc_rqst *, int ignore_crossmnt);  extern __be32 nfsd4_setclientid(struct 
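/*
 * [Editor's sketch, hypothetical helper.]  nfsd4_encode_fattr() above
 * now takes __be32 **buffer so the encoder can advance its caller's
 * write cursor in place instead of returning a byte count:
 */
#include <asm/byteorder.h>

static void encode_be32(__be32 **pp, u32 val)
{
	*(*pp)++ = cpu_to_be32(val);	/* store, then advance shared cursor */
}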
svc_rqst *rqstp,  		struct nfsd4_compound_state *, @@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,  		struct nfsd4_sequence *seq);  extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,  		struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);  extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);  extern __be32 nfsd4_create_session(struct svc_rqst *,  		struct nfsd4_compound_state *, @@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,  extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);  __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);  extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, -		struct nfsd4_open *open); +		struct nfsd4_open *open, struct nfsd_net *nn);  extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,  		struct svc_fh *current_fh, struct nfsd4_open *open);  extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 251da07b2a1..80da8eb2739 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,5 @@  config NILFS2_FS -	tristate "NILFS2 file system support (EXPERIMENTAL)" -	depends on EXPERIMENTAL +	tristate "NILFS2 file system support"  	select CRC32  	help  	  NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index df1a7fb238d..f30b017740a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -259,7 +259,7 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)  static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	loff_t pos = filp->f_pos; -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	unsigned int offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 16f35f7423c..08fdb77852a 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -67,7 +67,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = vma->vm_file->f_dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct nilfs_transaction_info ti;  	int ret = 0; @@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	nilfs_transaction_commit(inode->i_sb);   mapped: -	wait_on_page_writeback(page); +	wait_for_stable_page(page);   out:  	sb_end_pagefault(inode->i_sb);  	return block_page_mkwrite_return(ret); @@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {  };  const struct inode_operations nilfs_file_inode_operations = { -	.truncate	= nilfs_truncate,  	.setattr	= nilfs_setattr,  	.permission     = nilfs_permission,  	.fiemap		= nilfs_fiemap, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 4d31d2cca7f..6b49f14eac8 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)  	return ret;  } +void nilfs_write_failed(struct 
address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		nilfs_truncate(inode); +	} +} +  static int nilfs_write_begin(struct file *file, struct address_space *mapping,  			     loff_t pos, unsigned len, unsigned flags,  			     struct page **pagep, void **fsdata) @@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,  	err = block_write_begin(mapping, pos, len, flags, pagep,  				nilfs_get_block);  	if (unlikely(err)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); - +		nilfs_write_failed(mapping, pos + len);  		nilfs_transaction_abort(inode->i_sb);  	}  	return err; @@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,  		loff_t offset, unsigned long nr_segs)  {  	struct file *file = iocb->ki_filp; +	struct address_space *mapping = file->f_mapping;  	struct inode *inode = file->f_mapping->host;  	ssize_t size; @@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,  		loff_t end = offset + iov_length(iov, nr_segs);  		if (end > isize) -			vmtruncate(inode, isize); +			nilfs_write_failed(mapping, end);  	}  	return size; @@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)  	if ((iattr->ia_valid & ATTR_SIZE) &&  	    iattr->ia_size != i_size_read(inode)) {  		inode_dio_wait(inode); - -		err = vmtruncate(inode, iattr->ia_size); -		if (unlikely(err)) -			goto out_err; +		truncate_setsize(inode, iattr->ia_size); +		nilfs_truncate(inode);  	}  	setattr_copy(inode, iattr); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fdb18076948..b44bdb291b8 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,  	if (ret < 0)  		printk(KERN_ERR "NILFS: GC failed during preparation: "  			"cannot read source blocks: err=%d\n", ret); -	else +	else { +		if (nilfs_sb_need_update(nilfs)) +			set_nilfs_discontinued(nilfs);  		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); +	}  	nilfs_remove_all_gcinodes(nilfs);  	clear_nilfs_gc_running(nilfs); @@ -793,7 +796,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,  long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	void __user *argp = (void __user *)arg;  	switch (cmd) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1d0c0b84c5a..9de78f08989 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -517,11 +517,11 @@ static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,  	if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {  		*lenp = NILFS_FID_SIZE_CONNECTABLE; -		return 255; +		return FILEID_INVALID;  	}  	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {  		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE; -		return 255; +		return FILEID_INVALID;  	}  	fid->cno = root->cno; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 74cece80e9a..9bc72dec3fa 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);  extern void nilfs_truncate(struct inode *);  extern void nilfs_evict_inode(struct inode *);  extern int nilfs_setattr(struct dentry *, struct iattr *); +extern void nilfs_write_failed(struct address_space 
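/*
 * [Editor's sketch.]  The vmtruncate() removal uses the same recipe in
 * every filesystem touched here: on a failed or short write, trim the
 * page cache back to i_size, then run the fs-specific block truncation
 * (nilfs_truncate() above; example_truncate_blocks() is a hypothetical
 * stand-in):
 */
#include <linux/mm.h>

static void example_truncate_blocks(struct inode *inode);

static void example_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		/* drop pages instantiated beyond i_size by the failed write */
		truncate_pagecache(inode, to, inode->i_size);
		example_truncate_blocks(inode);
	}
}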
*mapping, loff_t to);  int nilfs_permission(struct inode *inode, int mask);  int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);  extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index f1626f5011c..ff00a0b7acb 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,  		if (unlikely(err)) {  			loff_t isize = inode->i_size;  			if (pos + blocksize > isize) -				vmtruncate(inode, isize); +				nilfs_write_failed(inode->i_mapping, +							pos + blocksize);  			goto failed_inode;  		} diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 3344bdd5506..2bfe6dc413a 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -174,7 +174,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)  	struct dnotify_struct **prev;  	struct inode *inode; -	inode = filp->f_path.dentry->d_inode; +	inode = file_inode(filp);  	if (!S_ISDIR(inode->i_mode))  		return; @@ -201,7 +201,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)  	/* nothing else could have found us thanks to the dnotify_mark_mutex */  	if (dn_mark->dn == NULL) -		fsnotify_destroy_mark(fsn_mark); +		fsnotify_destroy_mark(fsn_mark, dnotify_group);  	mutex_unlock(&dnotify_mark_mutex); @@ -296,7 +296,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)  	}  	/* dnotify only works on directories */ -	inode = filp->f_path.dentry->d_inode; +	inode = file_inode(filp);  	if (!S_ISDIR(inode->i_mode)) {  		error = -ENOTDIR;  		goto out_err; @@ -385,7 +385,7 @@ out:  	spin_unlock(&fsn_mark->lock);  	if (destroy) -		fsnotify_destroy_mark(fsn_mark); +		fsnotify_destroy_mark(fsn_mark, dnotify_group);  	mutex_unlock(&dnotify_mark_mutex);  	fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a5063602536..0c2f9122b26 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -18,6 +18,12 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)  	    old->tgid == new->tgid) {  		switch (old->data_type) {  		case (FSNOTIFY_EVENT_PATH): +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +			/* dont merge two permission events */ +			if ((old->mask & FAN_ALL_PERM_EVENTS) && +			    (new->mask & FAN_ALL_PERM_EVENTS)) +				return false; +#endif  			if ((old->path.mnt == new->path.mnt) &&  			    (old->path.dentry == new->path.dentry))  				return true; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a5cd9bba022..5d8444268a1 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -397,8 +397,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)  	wake_up(&group->fanotify_data.access_waitq);  #endif + +	if (file->f_flags & FASYNC) +		fsnotify_fasync(-1, file, 0); +  	/* matches the fanotify_init->fsnotify_alloc_group */ -	fsnotify_put_group(group); +	fsnotify_destroy_group(group);  	return 0;  } @@ -462,7 +466,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,  		ret = -ENOTDIR;  		if ((flags & FAN_MARK_ONLYDIR) && -		    !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) { +		    !(S_ISDIR(file_inode(f.file)->i_mode))) {  			fdput(f);  			goto out;  		} @@ -493,7 +497,8 @@ out:  static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,  					    __u32 mask, -					    unsigned int flags) +					    
unsigned int flags, +					    int *destroy)  {  	__u32 oldmask; @@ -507,8 +512,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,  	}  	spin_unlock(&fsn_mark->lock); -	if (!(oldmask & ~mask)) -		fsnotify_destroy_mark(fsn_mark); +	*destroy = !(oldmask & ~mask);  	return mask & oldmask;  } @@ -519,12 +523,17 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,  {  	struct fsnotify_mark *fsn_mark = NULL;  	__u32 removed; +	int destroy_mark;  	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);  	if (!fsn_mark)  		return -ENOENT; -	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); +	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, +						 &destroy_mark); +	if (destroy_mark) +		fsnotify_destroy_mark(fsn_mark, group); +  	fsnotify_put_mark(fsn_mark);  	if (removed & real_mount(mnt)->mnt_fsnotify_mask)  		fsnotify_recalc_vfsmount_mask(mnt); @@ -538,12 +547,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,  {  	struct fsnotify_mark *fsn_mark = NULL;  	__u32 removed; +	int destroy_mark;  	fsn_mark = fsnotify_find_inode_mark(group, inode);  	if (!fsn_mark)  		return -ENOENT; -	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); +	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, +						 &destroy_mark); +	if (destroy_mark) +		fsnotify_destroy_mark(fsn_mark, group);  	/* matches the fsnotify_find_inode_mark() */  	fsnotify_put_mark(fsn_mark);  	if (removed & inode->i_fsnotify_mask) @@ -710,13 +723,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)  		break;  	default:  		fd = -EINVAL; -		goto out_put_group; +		goto out_destroy_group;  	}  	if (flags & FAN_UNLIMITED_QUEUE) {  		fd = -EPERM;  		if (!capable(CAP_SYS_ADMIN)) -			goto out_put_group; +			goto out_destroy_group;  		group->max_events = UINT_MAX;  	} else {  		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; @@ -725,7 +738,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)  	if (flags & FAN_UNLIMITED_MARKS) {  		fd = -EPERM;  		if (!capable(CAP_SYS_ADMIN)) -			goto out_put_group; +			goto out_destroy_group;  		group->fanotify_data.max_marks = UINT_MAX;  	} else {  		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; @@ -733,12 +746,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)  	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);  	if (fd < 0) -		goto out_put_group; +		goto out_destroy_group;  	return fd; -out_put_group: -	fsnotify_put_group(group); +out_destroy_group: +	fsnotify_destroy_group(group);  	return fd;  } diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 514c4b81483..238a5930cb3 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -27,13 +27,13 @@ static int show_fdinfo(struct seq_file *m, struct file *f,  	struct fsnotify_mark *mark;  	int ret = 0; -	spin_lock(&group->mark_lock); +	mutex_lock(&group->mark_mutex);  	list_for_each_entry(mark, &group->marks_list, g_list) {  		ret = show(m, mark);  		if (ret)  			break;  	} -	spin_unlock(&group->mark_lock); +	mutex_unlock(&group->mark_mutex);  	return ret;  } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6baadb5a843..4bb21d67d9b 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,6 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)  void __fsnotify_update_child_dentry_flags(struct inode *inode)  {  	struct dentry *alias; -	struct hlist_node *p;  	int watched; 
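/*
 * [Editor's sketch of the pattern introduced in
 * fanotify_mark_remove_from_mask() above, not part of the patch.]  The
 * mask is updated under the mark spinlock, but only a destroy decision
 * is returned; fsnotify_destroy_mark(), which now takes the group's
 * mark_mutex, runs after the spinlock is dropped:
 */
#include <linux/fsnotify_backend.h>

static void example_remove_mark(struct fsnotify_group *group,
				struct fsnotify_mark *mark, __u32 mask)
{
	int destroy;

	spin_lock(&mark->lock);
	mark->mask &= ~mask;
	destroy = !mark->mask;		/* last event bit cleared? */
	spin_unlock(&mark->lock);

	if (destroy)
		fsnotify_destroy_mark(mark, group);
}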
 	if (!S_ISDIR(inode->i_mode)) @@ -64,7 +63,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)  	spin_lock(&inode->i_lock);  	/* run all of the dentries associated with this inode.  Since this is a  	 * directory, there damn well better only be one item on this list */ -	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {  		struct dentry *child;  		/* run all of the children of the original inode and fix their diff --git a/fs/notify/group.c b/fs/notify/group.c index 63fc294a469..bd2625bd88b 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -33,9 +33,6 @@   */  void fsnotify_final_destroy_group(struct fsnotify_group *group)  { -	/* clear the notification queue of all events */ -	fsnotify_flush_notify(group); -  	if (group->ops->free_group_priv)  		group->ops->free_group_priv(group); @@ -43,23 +40,30 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)  }  /* - * Trying to get rid of a group.  We need to first get rid of any outstanding - * allocations and then free the group.  Remember that fsnotify_clear_marks_by_group - * could miss marks that are being freed by inode and those marks could still - * hold a reference to this group (via group->num_marks)  If we get into that - * situtation, the fsnotify_final_destroy_group will get called when that final - * mark is freed. + * Trying to get rid of a group. Remove all marks, flush all events and release + * the group reference. + * Note that another thread calling fsnotify_clear_marks_by_group() may still + * hold a ref to the group.   */ -static void fsnotify_destroy_group(struct fsnotify_group *group) +void fsnotify_destroy_group(struct fsnotify_group *group)  {  	/* clear all inode marks for this group */  	fsnotify_clear_marks_by_group(group);  	synchronize_srcu(&fsnotify_mark_srcu); -	/* past the point of no return, matches the initial value of 1 */ -	if (atomic_dec_and_test(&group->num_marks)) -		fsnotify_final_destroy_group(group); +	/* clear the notification queue of all events */ +	fsnotify_flush_notify(group); + +	fsnotify_put_group(group); +} + +/* + * Get reference to a group. + */ +void fsnotify_get_group(struct fsnotify_group *group) +{ +	atomic_inc(&group->refcnt);  }  /* @@ -68,7 +72,7 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)  void fsnotify_put_group(struct fsnotify_group *group)  {  	if (atomic_dec_and_test(&group->refcnt)) -		fsnotify_destroy_group(group); +		fsnotify_final_destroy_group(group);  }  /* @@ -84,21 +88,24 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)  	/* set to 0 when there a no external references to this group */  	atomic_set(&group->refcnt, 1); -	/* -	 * hits 0 when there are no external references AND no marks for -	 * this group -	 */ -	atomic_set(&group->num_marks, 1); +	atomic_set(&group->num_marks, 0);  	mutex_init(&group->notification_mutex);  	INIT_LIST_HEAD(&group->notification_list);  	init_waitqueue_head(&group->notification_waitq);  	group->max_events = UINT_MAX; -	spin_lock_init(&group->mark_lock); +	mutex_init(&group->mark_mutex);  	INIT_LIST_HEAD(&group->marks_list);  	group->ops = ops;  	return group;  } + +int fsnotify_fasync(int fd, struct file *file, int on) +{ +	struct fsnotify_group *group = file->private_data; + +	return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 
0 : -EIO; +} diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index f3035691f52..74825be65b7 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -36,12 +36,11 @@  static void fsnotify_recalc_inode_mask_locked(struct inode *inode)  {  	struct fsnotify_mark *mark; -	struct hlist_node *pos;  	__u32 new_mask = 0;  	assert_spin_locked(&inode->i_lock); -	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) +	hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list)  		new_mask |= mark->mask;  	inode->i_fsnotify_mask = new_mask;  } @@ -63,8 +62,8 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)  {  	struct inode *inode = mark->i.inode; +	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));  	assert_spin_locked(&mark->lock); -	assert_spin_locked(&mark->group->mark_lock);  	spin_lock(&inode->i_lock); @@ -87,11 +86,11 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)  void fsnotify_clear_marks_by_inode(struct inode *inode)  {  	struct fsnotify_mark *mark, *lmark; -	struct hlist_node *pos, *n; +	struct hlist_node *n;  	LIST_HEAD(free_list);  	spin_lock(&inode->i_lock); -	hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) { +	hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) {  		list_add(&mark->i.free_i_list, &free_list);  		hlist_del_init_rcu(&mark->i.i_list);  		fsnotify_get_mark(mark); @@ -99,8 +98,16 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)  	spin_unlock(&inode->i_lock);  	list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { -		fsnotify_destroy_mark(mark); +		struct fsnotify_group *group; + +		spin_lock(&mark->lock); +		fsnotify_get_group(mark->group); +		group = mark->group; +		spin_unlock(&mark->lock); + +		fsnotify_destroy_mark(mark, group);  		fsnotify_put_mark(mark); +		fsnotify_put_group(group);  	}  } @@ -121,11 +128,10 @@ static struct fsnotify_mark *fsnotify_find_inode_mark_locked(  		struct inode *inode)  {  	struct fsnotify_mark *mark; -	struct hlist_node *pos;  	assert_spin_locked(&inode->i_lock); -	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) { +	hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) {  		if (mark->group == group) {  			fsnotify_get_mark(mark);  			return mark; @@ -186,14 +192,13 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,  			    struct fsnotify_group *group, struct inode *inode,  			    int allow_dups)  { -	struct fsnotify_mark *lmark; -	struct hlist_node *node, *last = NULL; +	struct fsnotify_mark *lmark, *last = NULL;  	int ret = 0;  	mark->flags |= FSNOTIFY_MARK_FLAG_INODE; +	BUG_ON(!mutex_is_locked(&group->mark_mutex));  	assert_spin_locked(&mark->lock); -	assert_spin_locked(&group->mark_lock);  	spin_lock(&inode->i_lock); @@ -206,8 +211,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,  	}  	/* should mark be in the middle of the current list? */ -	hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) { -		last = node; +	hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { +		last = lmark;  		if ((lmark->group == group) && !allow_dups) {  			ret = -EEXIST; @@ -227,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,  	BUG_ON(last == NULL);  	/* mark should be the last entry.  
last is the current last entry */ -	hlist_add_after_rcu(last, &mark->i.i_list); +	hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list);  out:  	fsnotify_recalc_inode_mask_locked(inode);  	spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index e3cbd746f64..4216308b81b 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -118,6 +118,7 @@ static int inotify_handle_event(struct fsnotify_group *group,  	fsn_event_priv = &event_priv->fsnotify_event_priv_data; +	fsnotify_get_group(group);  	fsn_event_priv->group = group;  	event_priv->wd = wd; @@ -131,7 +132,7 @@ static int inotify_handle_event(struct fsnotify_group *group,  	}  	if (inode_mark->mask & IN_ONESHOT) -		fsnotify_destroy_mark(inode_mark); +		fsnotify_destroy_mark(inode_mark, group);  	return ret;  } @@ -196,7 +197,6 @@ static void inotify_free_group_priv(struct fsnotify_group *group)  {  	/* ideally the idr is empty and we won't hit the BUG in the callback */  	idr_for_each(&group->inotify_data.idr, idr_callback, group); -	idr_remove_all(&group->inotify_data.idr);  	idr_destroy(&group->inotify_data.idr);  	atomic_dec(&group->inotify_data.user->inotify_devs);  	free_uid(group->inotify_data.user); @@ -210,6 +210,7 @@ void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)  	event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,  				  fsnotify_event_priv_data); +	fsnotify_put_group(fsn_event_priv->group);  	kmem_cache_free(event_priv_cachep, event_priv);  } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 36cb013c7c1..e0f7c1241a6 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -265,7 +265,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  		ret = -EAGAIN;  		if (file->f_flags & O_NONBLOCK)  			break; -		ret = -EINTR; +		ret = -ERESTARTSYS;  		if (signal_pending(current))  			break; @@ -281,23 +281,17 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  	return ret;  } -static int inotify_fasync(int fd, struct file *file, int on) -{ -	struct fsnotify_group *group = file->private_data; - -	return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 
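/*
 * [Editor's sketch.]  The inode_mark.c hunks above also pick up the
 * tree-wide hlist_for_each_entry() change: the iterator lost its
 * separate struct hlist_node cursor and now yields the entry directly:
 */
#include <linux/fsnotify_backend.h>

static __u32 example_recalc_mask(struct hlist_head *head)
{
	struct fsnotify_mark *mark;
	__u32 mask = 0;

	hlist_for_each_entry(mark, head, i.i_list)	/* no "pos" argument */
		mask |= mark->mask;
	return mask;
}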
0 : -EIO; -} -  static int inotify_release(struct inode *ignored, struct file *file)  {  	struct fsnotify_group *group = file->private_data;  	pr_debug("%s: group=%p\n", __func__, group); -	fsnotify_clear_marks_by_group(group); +	if (file->f_flags & FASYNC) +		fsnotify_fasync(-1, file, 0);  	/* free this group, matching get was inotify_init->fsnotify_obtain_group */ -	fsnotify_put_group(group); +	fsnotify_destroy_group(group);  	return 0;  } @@ -339,7 +333,7 @@ static const struct file_operations inotify_fops = {  	.show_fdinfo	= inotify_show_fdinfo,  	.poll		= inotify_poll,  	.read		= inotify_read, -	.fasync		= inotify_fasync, +	.fasync		= fsnotify_fasync,  	.release	= inotify_release,  	.unlocked_ioctl	= inotify_ioctl,  	.compat_ioctl	= inotify_ioctl, @@ -370,22 +364,20 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,  {  	int ret; -	do { -		if (unlikely(!idr_pre_get(idr, GFP_KERNEL))) -			return -ENOMEM; +	idr_preload(GFP_KERNEL); +	spin_lock(idr_lock); -		spin_lock(idr_lock); -		ret = idr_get_new_above(idr, i_mark, *last_wd + 1, -					&i_mark->wd); +	ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); +	if (ret >= 0) {  		/* we added the mark to the idr, take a reference */ -		if (!ret) { -			*last_wd = i_mark->wd; -			fsnotify_get_mark(&i_mark->fsn_mark); -		} -		spin_unlock(idr_lock); -	} while (ret == -EAGAIN); +		i_mark->wd = ret; +		*last_wd = i_mark->wd; +		fsnotify_get_mark(&i_mark->fsn_mark); +	} -	return ret; +	spin_unlock(idr_lock); +	idr_preload_end(); +	return ret < 0 ? ret : 0;  }  static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, @@ -521,13 +513,13 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,  	struct fsnotify_event_private_data *fsn_event_priv;  	int ret; +	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); +  	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,  					      FSNOTIFY_EVENT_NONE, NULL, 0,  					      GFP_NOFS);  	if (!ignored_event) -		return; - -	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); +		goto skip_send_ignore;  	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);  	if (unlikely(!event_priv)) @@ -535,6 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,  	fsn_event_priv = &event_priv->fsnotify_event_priv_data; +	fsnotify_get_group(group);  	fsn_event_priv->group = group;  	event_priv->wd = i_mark->wd; @@ -548,9 +541,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,  	}  skip_send_ignore: -  	/* matches the reference taken when the event was created */ -	fsnotify_put_event(ignored_event); +	if (ignored_event) +		fsnotify_put_event(ignored_event);  	/* remove this mark from the idr */  	inotify_remove_from_idr(group, i_mark); @@ -581,8 +574,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,  	/* don't allow invalid bits: we don't want flags set */  	mask = inotify_arg_to_mask(arg); -	if (unlikely(!(mask & IN_ALL_EVENTS))) -		return -EINVAL;  	fsn_mark = fsnotify_find_inode_mark(group, inode);  	if (!fsn_mark) @@ -634,8 +625,6 @@ static int inotify_new_watch(struct fsnotify_group *group,  	/* don't allow invalid bits: we don't want flags set */  	mask = inotify_arg_to_mask(arg); -	if (unlikely(!(mask & IN_ALL_EVENTS))) -		return -EINVAL;  	tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);  	if (unlikely(!tmp_i_mark)) @@ -709,12 +698,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int 
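/*
 * [Editor's sketch of the inotify_add_to_idr() conversion above.]  The
 * old idr_pre_get()/idr_get_new_above() retry loop collapses into one
 * idr_alloc() call bracketed by idr_preload():
 */
#include <linux/idr.h>

static int example_alloc_wd(struct idr *idr, spinlock_t *lock,
			    void *watch, int last_wd)
{
	int wd;

	idr_preload(GFP_KERNEL);	/* sleepable node preallocation */
	spin_lock(lock);
	wd = idr_alloc(idr, watch, last_wd + 1, 0, GFP_NOWAIT);
	spin_unlock(lock);
	idr_preload_end();

	return wd;	/* new watch descriptor, or -ENOMEM/-ENOSPC */
}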
max_events)  	spin_lock_init(&group->inotify_data.idr_lock);  	idr_init(&group->inotify_data.idr);  	group->inotify_data.last_wd = 0; -	group->inotify_data.fa = NULL;  	group->inotify_data.user = get_current_user();  	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >  	    inotify_max_user_instances) { -		fsnotify_put_group(group); +		fsnotify_destroy_group(group);  		return ERR_PTR(-EMFILE);  	} @@ -743,7 +731,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)  	ret = anon_inode_getfd("inotify", &inotify_fops, group,  				  O_RDONLY | flags);  	if (ret < 0) -		fsnotify_put_group(group); +		fsnotify_destroy_group(group);  	return ret;  } @@ -819,7 +807,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)  	ret = 0; -	fsnotify_destroy_mark(&i_mark->fsn_mark); +	fsnotify_destroy_mark(&i_mark->fsn_mark, group);  	/* match ref taken by inotify_idr_find */  	fsnotify_put_mark(&i_mark->fsn_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f104d565b68..fc6b49bf736 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -109,8 +109,11 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)  void fsnotify_put_mark(struct fsnotify_mark *mark)  { -	if (atomic_dec_and_test(&mark->refcnt)) +	if (atomic_dec_and_test(&mark->refcnt)) { +		if (mark->group) +			fsnotify_put_group(mark->group);  		mark->free_mark(mark); +	}  }  /* @@ -118,14 +121,14 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)   * The caller had better be holding a reference to this mark so we don't actually   * do the final put under the mark->lock   */ -void fsnotify_destroy_mark(struct fsnotify_mark *mark) +void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, +				  struct fsnotify_group *group)  { -	struct fsnotify_group *group;  	struct inode *inode = NULL; -	spin_lock(&mark->lock); +	BUG_ON(!mutex_is_locked(&group->mark_mutex)); -	group = mark->group; +	spin_lock(&mark->lock);  	/* something else already called this function on this mark */  	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { @@ -135,8 +138,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)  	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; -	spin_lock(&group->mark_lock); -  	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {  		inode = mark->i.inode;  		fsnotify_destroy_inode_mark(mark); @@ -147,13 +148,22 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)  	list_del_init(&mark->g_list); -	spin_unlock(&group->mark_lock);  	spin_unlock(&mark->lock); +	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) +		iput(inode); +	/* release lock temporarily */ +	mutex_unlock(&group->mark_mutex); +  	spin_lock(&destroy_lock);  	list_add(&mark->destroy_list, &destroy_list);  	spin_unlock(&destroy_lock);  	wake_up(&destroy_waitq); +	/* +	 * We don't necessarily have a ref on mark from caller so the above destroy +	 * may have actually freed it, unless this group provides a 'freeing_mark' +	 * function which must be holding a reference. +	 */  	/*  	 * Some groups like to know that marks are being freed.  This is a @@ -175,21 +185,17 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)  	 * is just a lazy update (and could be a perf win...)  	 */ -	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) -		iput(inode); +	atomic_dec(&group->num_marks); -	/* -	 * We don't necessarily have a ref on mark from caller so the above iput -	 * may have already destroyed it.  Don't touch from now on. 
-	 */ +	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); +} -	/* -	 * it's possible that this group tried to destroy itself, but this -	 * this mark was simultaneously being freed by inode.  If that's the -	 * case, we finish freeing the group here. -	 */ -	if (unlikely(atomic_dec_and_test(&group->num_marks))) -		fsnotify_final_destroy_group(group); +void fsnotify_destroy_mark(struct fsnotify_mark *mark, +			   struct fsnotify_group *group) +{ +	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); +	fsnotify_destroy_mark_locked(mark, group); +	mutex_unlock(&group->mark_mutex);  }  void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) @@ -214,26 +220,26 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas   * These marks may be used for the fsnotify backend to determine which   * event types should be delivered to which group.   */ -int fsnotify_add_mark(struct fsnotify_mark *mark, -		      struct fsnotify_group *group, struct inode *inode, -		      struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, +			     struct fsnotify_group *group, struct inode *inode, +			     struct vfsmount *mnt, int allow_dups)  {  	int ret = 0;  	BUG_ON(inode && mnt);  	BUG_ON(!inode && !mnt); +	BUG_ON(!mutex_is_locked(&group->mark_mutex));  	/*  	 * LOCKING ORDER!!!! +	 * group->mark_mutex  	 * mark->lock -	 * group->mark_lock  	 * inode->i_lock  	 */  	spin_lock(&mark->lock); -	spin_lock(&group->mark_lock); -  	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; +	fsnotify_get_group(group);  	mark->group = group;  	list_add(&mark->g_list, &group->marks_list);  	atomic_inc(&group->num_marks); @@ -251,11 +257,8 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,  		BUG();  	} -	spin_unlock(&group->mark_lock); -  	/* this will pin the object if appropriate */  	fsnotify_set_mark_mask_locked(mark, mark->mask); -  	spin_unlock(&mark->lock);  	if (inode) @@ -265,10 +268,10 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,  err:  	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;  	list_del_init(&mark->g_list); +	fsnotify_put_group(group);  	mark->group = NULL;  	atomic_dec(&group->num_marks); -	spin_unlock(&group->mark_lock);  	spin_unlock(&mark->lock);  	spin_lock(&destroy_lock); @@ -279,6 +282,16 @@ err:  	return ret;  } +int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, +		      struct inode *inode, struct vfsmount *mnt, int allow_dups) +{ +	int ret; +	mutex_lock(&group->mark_mutex); +	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); +	mutex_unlock(&group->mark_mutex); +	return ret; +} +  /*   * clear any marks in a group in which mark->flags & flags is true   */ @@ -286,22 +299,16 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,  					 unsigned int flags)  {  	struct fsnotify_mark *lmark, *mark; -	LIST_HEAD(free_list); -	spin_lock(&group->mark_lock); +	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);  	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {  		if (mark->flags & flags) { -			list_add(&mark->free_g_list, &free_list); -			list_del_init(&mark->g_list);  			fsnotify_get_mark(mark); +			fsnotify_destroy_mark_locked(mark, group); +			fsnotify_put_mark(mark);  		}  	} -	spin_unlock(&group->mark_lock); - -	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { -		fsnotify_destroy_mark(mark); -		fsnotify_put_mark(mark); -	} +	mutex_unlock(&group->mark_mutex);  }  /* @@ -317,6 +324,8 @@ void 
fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *ol  	assert_spin_locked(&old->lock);  	new->i.inode = old->i.inode;  	new->m.mnt = old->m.mnt; +	if (old->group) +		fsnotify_get_group(old->group);  	new->group = old->group;  	new->mask = old->mask;  	new->free_mark = old->free_mark; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 48cb994e492..7b51b05f160 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -225,6 +225,7 @@ alloc_holder:  	mutex_unlock(&group->notification_mutex);  	wake_up(&group->notification_waitq); +	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);  	return return_event;  } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index b7b4b0e8554..68ca5a8704b 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -33,12 +33,12 @@  void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)  {  	struct fsnotify_mark *mark, *lmark; -	struct hlist_node *pos, *n; +	struct hlist_node *n;  	struct mount *m = real_mount(mnt);  	LIST_HEAD(free_list);  	spin_lock(&mnt->mnt_root->d_lock); -	hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) { +	hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) {  		list_add(&mark->m.free_m_list, &free_list);  		hlist_del_init_rcu(&mark->m.m_list);  		fsnotify_get_mark(mark); @@ -46,8 +46,16 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)  	spin_unlock(&mnt->mnt_root->d_lock);  	list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { -		fsnotify_destroy_mark(mark); +		struct fsnotify_group *group; + +		spin_lock(&mark->lock); +		fsnotify_get_group(mark->group); +		group = mark->group; +		spin_unlock(&mark->lock); + +		fsnotify_destroy_mark(mark, group);  		fsnotify_put_mark(mark); +		fsnotify_put_group(group);  	}  } @@ -63,12 +71,11 @@ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)  {  	struct mount *m = real_mount(mnt);  	struct fsnotify_mark *mark; -	struct hlist_node *pos;  	__u32 new_mask = 0;  	assert_spin_locked(&mnt->mnt_root->d_lock); -	hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) +	hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list)  		new_mask |= mark->mask;  	m->mnt_fsnotify_mask = new_mask;  } @@ -88,8 +95,8 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)  {  	struct vfsmount *mnt = mark->m.mnt; +	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));  	assert_spin_locked(&mark->lock); -	assert_spin_locked(&mark->group->mark_lock);  	spin_lock(&mnt->mnt_root->d_lock); @@ -106,11 +113,10 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_  {  	struct mount *m = real_mount(mnt);  	struct fsnotify_mark *mark; -	struct hlist_node *pos;  	assert_spin_locked(&mnt->mnt_root->d_lock); -	hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) { +	hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) {  		if (mark->group == group) {  			fsnotify_get_mark(mark);  			return mark; @@ -145,14 +151,13 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,  			       int allow_dups)  {  	struct mount *m = real_mount(mnt); -	struct fsnotify_mark *lmark; -	struct hlist_node *node, *last = NULL; +	struct fsnotify_mark *lmark, *last = NULL;  	int ret = 0;  	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; +	BUG_ON(!mutex_is_locked(&group->mark_mutex));  	assert_spin_locked(&mark->lock); -	assert_spin_locked(&group->mark_lock);  	spin_lock(&mnt->mnt_root->d_lock); @@ -165,8 +170,8 @@ 
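/*
 * [Editor's sketch of the group-pinning pattern used above when marks
 * are torn down outside the group's mark_mutex, e.g. in
 * fsnotify_clear_marks_by_mount().]
 */
#include <linux/fsnotify_backend.h>

static void example_teardown_mark(struct fsnotify_mark *mark)
{
	struct fsnotify_group *group;

	spin_lock(&mark->lock);
	fsnotify_get_group(mark->group);	/* pin group before unlocking */
	group = mark->group;
	spin_unlock(&mark->lock);

	fsnotify_destroy_mark(mark, group);	/* takes group->mark_mutex */
	fsnotify_put_mark(mark);
	fsnotify_put_group(group);
}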
int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,  	}  	/* should mark be in the middle of the current list? */ -	hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) { -		last = node; +	hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { +		last = lmark;  		if ((lmark->group == group) && !allow_dups) {  			ret = -EEXIST; @@ -186,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,  	BUG_ON(last == NULL);  	/* mark should be the last entry.  last is the current last entry */ -	hlist_add_after_rcu(last, &mark->m.m_list); +	hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list);  out:  	fsnotify_recalc_vfsmount_mask_locked(mnt);  	spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 99e36107ff6..aa411c3f20e 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1101,7 +1101,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;  	loff_t fpos, i_size; -	struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode; +	struct inode *bmp_vi, *vdir = file_inode(filp);  	struct super_block *sb = vdir->i_sb;  	ntfs_inode *ndir = NTFS_I(vdir);  	ntfs_volume *vol = NTFS_SB(sb); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1ecf46448f8..5b2d4f0853a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1762,6 +1762,16 @@ err_out:  	return err;  } +static void ntfs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		ntfs_truncate_vfs(inode); +	} +} +  /**   * ntfs_file_buffered_write -   * @@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,  				 * allocated space, which is not a disaster.  				 */  				i_size = i_size_read(vi); -				if (pos + bytes > i_size) -					vmtruncate(vi, i_size); +				if (pos + bytes > i_size) { +					ntfs_write_failed(mapping, pos + bytes); +				}  				break;  			}  		} @@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {  const struct inode_operations ntfs_file_inode_ops = {  #ifdef NTFS_RW -	.truncate	= ntfs_truncate_vfs,  	.setattr	= ntfs_setattr,  #endif /* NTFS_RW */  }; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1d27331e6fc..d3e118cc6ff 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2866,9 +2866,11 @@ conv_err_out:   *   * See ntfs_truncate() description above for details.   */ +#ifdef NTFS_RW  void ntfs_truncate_vfs(struct inode *vi) {  	ntfs_truncate(vi);  } +#endif  /**   * ntfs_setattr - called from notify_change() when an attribute is being changed @@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)  						NInoCompressed(ni) ?  						
"compressed" : "encrypted");  				err = -EOPNOTSUPP; -			} else -				err = vmtruncate(vi, attr->ia_size); +			} else { +				truncate_setsize(vi, attr->ia_size); +				ntfs_truncate_vfs(vi); +			}  			if (err || ia_valid == ATTR_SIZE)  				goto out;  		} else { diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index db29695f845..76b6cfb579d 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)  	return;  } +#else + +static inline void ntfs_truncate_vfs(struct inode *vi) {} +  #endif /* NTFS_RW */  #endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 260b16281fc..8a404576fb2 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -65,7 +65,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)  		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);  		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); -		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id); +		switch(acl->a_entries[n].e_tag) { +		case ACL_USER: +			acl->a_entries[n].e_uid = +				make_kuid(&init_user_ns, +					  le32_to_cpu(entry->e_id)); +			break; +		case ACL_GROUP: +			acl->a_entries[n].e_gid = +				make_kgid(&init_user_ns, +					  le32_to_cpu(entry->e_id)); +			break; +		default: +			break; +		}  		value += sizeof(struct posix_acl_entry);  	} @@ -91,7 +104,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)  	for (n = 0; n < acl->a_count; n++, entry++) {  		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);  		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); -		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id); +		switch(acl->a_entries[n].e_tag) { +		case ACL_USER: +			entry->e_id = cpu_to_le32( +				from_kuid(&init_user_ns, +					  acl->a_entries[n].e_uid)); +			break; +		case ACL_GROUP: +			entry->e_id = cpu_to_le32( +				from_kgid(&init_user_ns, +					  acl->a_entries[n].e_gid)); +			break; +		default: +			entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); +			break; +		}  	}  	return ocfs2_acl;  } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 31b9463fba1..b8a9d87231b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,  		mlog_errno(ret);  out: -	if (pages) -		kfree(pages); +	kfree(pages);  	return ret;  } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 657743254eb..20dfec72e90 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -569,7 +569,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,  			     int ret,  			     bool is_async)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	int level;  	wait_queue_head_t *wq = ocfs2_ioend_wq(inode); @@ -593,9 +593,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,  	level = ocfs2_iocb_rw_locked_level(iocb);  	ocfs2_rw_unlock(inode, level); +	inode_dio_done(inode);  	if (is_async)  		aio_complete(iocb, ret, 0); -	inode_dio_done(inode);  }  /* @@ -626,7 +626,7 @@ static ssize_t ocfs2_direct_IO(int rw,  			       unsigned long nr_segs)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; +	struct inode *inode = file_inode(file)->i_mapping->host;  	/*  	 * Fallback to buffered I/O if we see an inode without @@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,  				goto out;  			}  		} +		wait_for_stable_page(wc->w_pages[i]);  		if (index == 
target_index)  			wc->w_target_page = wc->w_pages[i]; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f7c648d7d6b..42252bf64b5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item)  	mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); -	if (reg->hr_tmp_block) -		kfree(reg->hr_tmp_block); +	kfree(reg->hr_tmp_block);  	if (reg->hr_slot_data) {  		for (i = 0; i < reg->hr_num_pages; i++) { @@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item)  	if (reg->hr_bdev)  		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); -	if (reg->hr_slots) -		kfree(reg->hr_slots); +	kfree(reg->hr_slots);  	kfree(reg->hr_db_regnum);  	kfree(reg->hr_db_livenodes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1bfe8802cc1..aa88bd8bced 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -304,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)  static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)  { -	int ret = 0; - -	do { -		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { -			ret = -EAGAIN; -			break; -		} -		spin_lock(&nn->nn_lock); -		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); -		if (ret == 0) -			list_add_tail(&nsw->ns_node_item, -				      &nn->nn_status_list); -		spin_unlock(&nn->nn_lock); -	} while (ret == -EAGAIN); +	int ret; -	if (ret == 0)  { -		init_waitqueue_head(&nsw->ns_wq); -		nsw->ns_sys_status = O2NET_ERR_NONE; -		nsw->ns_status = 0; +	spin_lock(&nn->nn_lock); +	ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC); +	if (ret >= 0) { +		nsw->ns_id = ret; +		list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);  	} +	spin_unlock(&nn->nn_lock); +	if (ret < 0) +		return ret; -	return ret; +	init_waitqueue_head(&nsw->ns_wq); +	nsw->ns_sys_status = O2NET_ERR_NONE; +	nsw->ns_status = 0; +	return 0;  }  static void o2net_complete_nsw_locked(struct o2net_node *nn, @@ -870,7 +864,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,  		/* we've had some trouble with handlers seemingly vanishing. 
*/  		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,  							  &parent) == NULL, -			        "couldn't find handler we *just* registerd " +			        "couldn't find handler we *just* registered "  				"for type %u key %08x\n", msg_type, key);  	}  	write_unlock(&o2net_handler_lock); @@ -1165,10 +1159,8 @@ out:  	o2net_debug_del_nst(&nst); /* must be before dropping sc and node */  	if (sc)  		sc_put(sc); -	if (vec) -		kfree(vec); -	if (msg) -		kfree(msg); +	kfree(vec); +	kfree(msg);  	o2net_complete_nsw(nn, &nsw, 0, 0, 0);  	return ret;  } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 8db4b58b2e4..ef999729e27 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -169,11 +169,10 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,  				      u64 parent_blkno,  				      int skip_unhashed)  { -	struct hlist_node *p;  	struct dentry *dentry;  	spin_lock(&inode->i_lock); -	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { +	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {  		spin_lock(&dentry->d_lock);  		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {  			trace_ocfs2_find_local_alias(dentry->d_name.len, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8fe4e2892ab..f1e1aed8f63 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -67,7 +67,6 @@  #define NAMEI_RA_CHUNKS  2  #define NAMEI_RA_BLOCKS  4  #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))  static unsigned char ocfs2_filetype_table[] = {  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK @@ -2015,12 +2014,12 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,  int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)  {  	int error = 0; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int lock_level = 0;  	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); -	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); +	error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);  	if (lock_level && error >= 0) {  		/* We release EX lock which used to update atime  		 * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9e89d70df33..dbb17c07656 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)  	if (dlm->master_hash)  		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); -	if (dlm->name) -		kfree(dlm->name); - +	kfree(dlm->name);  	kfree(dlm);  } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 005261c333b..33ecbe0e673 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2020,7 +2020,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,  			       int ignore_higher, u8 request_from, u32 flags)  {  	struct dlm_work_item *item; -	item = kzalloc(sizeof(*item), GFP_NOFS); +	item = kzalloc(sizeof(*item), GFP_ATOMIC);  	if (!item)  		return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 01ebfd0bdad..eeac97bb3bf 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2083,7 +2083,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,  					      u8 dead_node, u8 new_master)  {  	int i; -	struct hlist_node *hash_iter;  	struct hlist_head *bucket;  	struct dlm_lock_resource *res, 
*next; @@ -2114,7 +2113,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,  	 * if necessary */  	for (i = 0; i < DLM_HASH_BUCKETS; i++) {  		bucket = dlm_lockres_hash(dlm, i); -		hlist_for_each_entry(res, hash_iter, bucket, hash_node) { +		hlist_for_each_entry(res, bucket, hash_node) {  			if (!(res->state & DLM_LOCK_RES_RECOVERING))  				continue; @@ -2273,7 +2272,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,  static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)  { -	struct hlist_node *iter;  	struct dlm_lock_resource *res;  	int i;  	struct hlist_head *bucket; @@ -2299,7 +2297,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)  	 */  	for (i = 0; i < DLM_HASH_BUCKETS; i++) {  		bucket = dlm_lockres_hash(dlm, i); -		hlist_for_each_entry(res, iter, bucket, hash_node) { +		hlist_for_each_entry(res, bucket, hash_node) {   			/* always prune any $RECOVERY entries for dead nodes,   			 * otherwise hangs can occur during later recovery */  			if (dlm_is_recovery_lock(res->lockname.name, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 16b712d260d..4c5fc8d77dc 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -224,7 +224,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)  static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)  {  	int event = 0; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct dlmfs_inode_private *ip = DLMFS_I(inode);  	poll_wait(file, &ip->ip_lockres.l_event, wait); @@ -245,7 +245,7 @@ static ssize_t dlmfs_file_read(struct file *filp,  	int bytes_left;  	ssize_t readlen, got;  	char *lvb_buf; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",  		inode->i_ino, count, *ppos); @@ -293,7 +293,7 @@ static ssize_t dlmfs_file_write(struct file *filp,  	int bytes_left;  	ssize_t writelen;  	char *lvb_buf; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",  		inode->i_ino, count, *ppos); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4f7795fb5fc..12ae194ac94 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2045,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)  	lvb->lvb_version   = OCFS2_LVB_VERSION;  	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));  	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); -	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid); -	lvb->lvb_igid      = cpu_to_be32(inode->i_gid); +	lvb->lvb_iuid      = cpu_to_be32(i_uid_read(inode)); +	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));  	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);  	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);  	lvb->lvb_iatime_packed  = @@ -2095,8 +2095,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)  	else  		inode->i_blocks = ocfs2_inode_sector_count(inode); -	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid); -	inode->i_gid     = be32_to_cpu(lvb->lvb_igid); +	i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid)); +	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));  	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);  	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));  	ocfs2_unpack_timespec(&inode->i_atime, @@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,  	 * everything is up to the caller :) */  	
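/*
 * Note on the hunk below: ocfs2_super_lock() takes the cluster lock
 * before checking whether the lock resource needs a refresh.  Its
 * callers treat a non-zero return as "superblock lock not held", so
 * the two ocfs2_cluster_unlock() calls added on the refresh-failure
 * paths drop the lock that was previously leaked when bailing out
 * with an error.
 */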
status = ocfs2_should_refresh_lock_res(lockres);  	if (status < 0) { +		ocfs2_cluster_unlock(osb, lockres, level);  		mlog_errno(status);  		goto bail;  	} @@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb,  		ocfs2_complete_lock_res_refresh(lockres, status); -		if (status < 0) +		if (status < 0) { +			ocfs2_cluster_unlock(osb, lockres, level);  			mlog_errno(status); +		}  		ocfs2_track_lock_refresh(lockres);  	}  bail: diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 322216a5f0d..29651167190 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -195,11 +195,11 @@ static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,  	if (parent && (len < 6)) {  		*max_len = 6; -		type = 255; +		type = FILEID_INVALID;  		goto bail;  	} else if (len < 3) {  		*max_len = 3; -		type = 255; +		type = FILEID_INVALID;  		goto bail;  	} diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f487aa34344..1c39efb71ba 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -282,8 +282,7 @@ search:  	spin_unlock(&oi->ip_lock);  out: -	if (new_emi) -		kfree(new_emi); +	kfree(new_emi);  }  static int ocfs2_last_eb_is_empty(struct inode *inode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fe492e1a3cf..6474cb44004 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1116,7 +1116,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)  			    (unsigned long long)OCFS2_I(inode)->ip_blkno,  			    dentry->d_name.len, dentry->d_name.name,  			    attr->ia_valid, attr->ia_mode, -			    attr->ia_uid, attr->ia_gid); +			    from_kuid(&init_user_ns, attr->ia_uid), +			    from_kgid(&init_user_ns, attr->ia_gid));  	/* ensuring we don't even attempt to truncate a symlink */  	if (S_ISLNK(inode->i_mode)) @@ -1174,14 +1175,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)  		}  	} -	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || -	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { +	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || +	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {  		/*  		 * Gather pointers to quota structures so that allocation /  		 * freeing of quota structures happens here and not inside  		 * dquot_transfer() where we have problems with lock ordering  		 */ -		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid +		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)  		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,  		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {  			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); @@ -1190,7 +1191,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)  				goto bail_unlock;  			}  		} -		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid +		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)  		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,  		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {  			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); @@ -1218,24 +1219,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)  		}  	} -	/* -	 * This will intentionally not wind up calling truncate_setsize(), -	 * since all the work for a size change has been done above. -	 * Otherwise, we could get into problems with truncate as -	 * ip_alloc_sem is used there to protect against i_size -	 * changes. -	 * -	 * XXX: this means the conditional below can probably be removed. 
-	 */ -	if ((attr->ia_valid & ATTR_SIZE) && -	    attr->ia_size != i_size_read(inode)) { -		status = vmtruncate(inode, attr->ia_size); -		if (status) { -			mlog_errno(status); -			goto bail_commit; -		} -	} -  	setattr_copy(inode, attr);  	mark_inode_dirty(inode); @@ -1967,7 +1950,7 @@ out:  int ocfs2_change_file_space(struct file *file, unsigned int cmd,  			    struct ocfs2_space_resv *sr)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	int ret; @@ -1995,7 +1978,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,  static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,  			    loff_t len)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	struct ocfs2_space_resv sr;  	int change_size = 1; @@ -2250,7 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,  	loff_t old_size, *ppos = &iocb->ki_pos;  	u32 old_clusters;  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	int full_coherency = !(osb->s_mount_opt &  			       OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2534,7 +2517,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,  				      unsigned int flags)  {  	int ret = 0, lock_level = 0; -	struct inode *inode = in->f_path.dentry->d_inode; +	struct inode *inode = file_inode(in);  	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,  			(unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2544,7 +2527,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,  	/*  	 * See the comment in ocfs2_file_aio_read()  	 */ -	ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); +	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);  	if (ret < 0) {  		mlog_errno(ret);  		goto bail; @@ -2564,7 +2547,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,  {  	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;  	struct file *filp = iocb->ki_filp; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,  			(unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2607,7 +2590,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,  	 * like i_size. This allows the checks down below  	 * generic_file_aio_read() a chance of actually working.  	 */ -	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); +	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);  	if (ret < 0) {  		mlog_errno(ret);  		goto bail; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d89e08a81ed..f87f9bd1edf 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -269,8 +269,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,  	inode->i_generation = le32_to_cpu(fe->i_generation);  	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));  	inode->i_mode = le16_to_cpu(fe->i_mode); -	inode->i_uid = le32_to_cpu(fe->i_uid); -	inode->i_gid = le32_to_cpu(fe->i_gid); +	i_uid_write(inode, le32_to_cpu(fe->i_uid)); +	i_gid_write(inode, le32_to_cpu(fe->i_gid));  	/* Fast symlinks will have i_size but no allocated clusters. 
*/  	if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { @@ -1259,8 +1259,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,  	fe->i_size = cpu_to_le64(i_size_read(inode));  	ocfs2_set_links_count(fe, inode->i_nlink); -	fe->i_uid = cpu_to_le32(inode->i_uid); -	fe->i_gid = cpu_to_le32(inode->i_gid); +	fe->i_uid = cpu_to_le32(i_uid_read(inode)); +	fe->i_gid = cpu_to_le32(i_gid_read(inode));  	fe->i_mode = cpu_to_le16(inode->i_mode);  	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);  	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); @@ -1290,8 +1290,8 @@ void ocfs2_refresh_inode(struct inode *inode,  	ocfs2_set_inode_flags(inode);  	i_size_write(inode, le64_to_cpu(fe->i_size));  	set_nlink(inode, ocfs2_read_links_count(fe)); -	inode->i_uid = le32_to_cpu(fe->i_uid); -	inode->i_gid = le32_to_cpu(fe->i_gid); +	i_uid_write(inode, le32_to_cpu(fe->i_uid)); +	i_gid_write(inode, le32_to_cpu(fe->i_gid));  	inode->i_mode = le16_to_cpu(fe->i_mode);  	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)  		inode->i_blocks = 0; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f20edcbfe70..752f0b26221 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -881,7 +881,7 @@ bail:  long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	unsigned int flags;  	int new_clusters;  	int status; @@ -994,7 +994,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)  {  	bool preserve;  	struct reflink_arguments args; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ocfs2_info info;  	void __user *argp = (void __user *)arg; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 2dd36af79e2..8eccfabcd12 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,  		/* Though we wish to avoid it, we are in fact safe in  		 * skipping local alloc cleanup as fsck.ocfs2 is more  		 * than capable of reclaiming unused space. */ -		if (la_dinode) -			kfree(la_dinode); - -		if (tl_dinode) -			kfree(tl_dinode); +		kfree(la_dinode); +		kfree(tl_dinode);  		if (qrec)  			ocfs2_free_quota_recovery(qrec); @@ -1408,8 +1405,7 @@ bail:  	mutex_unlock(&osb->recovery_lock); -	if (rm_quota) -		kfree(rm_quota); +	kfree(rm_quota);  	/* no one is callint kthread_stop() for us so the kthread() api  	 * requires that we call do_exit().  
And it isn't exported, but diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index a9f78c74d68..aebeacd807c 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -476,8 +476,7 @@ out:  	if (local_alloc_inode)  		iput(local_alloc_inode); -	if (alloc_copy) -		kfree(alloc_copy); +	kfree(alloc_copy);  }  /* @@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,  		mlog_errno(status);  bail: -	if ((status < 0) && (*alloc_copy)) { +	if (status < 0) {  		kfree(*alloc_copy);  		*alloc_copy = NULL;  	} @@ -1290,8 +1289,7 @@ bail:  	if (main_bm_inode)  		iput(main_bm_inode); -	if (alloc_copy) -		kfree(alloc_copy); +	kfree(alloc_copy);  	if (ac)  		ocfs2_free_alloc_context(ac); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 47a87dda54c..10d66c75cec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -62,7 +62,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,  				struct page *page)  {  	int ret = VM_FAULT_NOPAGE; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct address_space *mapping = inode->i_mapping;  	loff_t pos = page_offset(page);  	unsigned int len = PAGE_CACHE_SIZE; @@ -131,7 +131,7 @@ out:  static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct buffer_head *di_bh = NULL;  	sigset_t oldset;  	int ret; @@ -180,13 +180,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)  {  	int ret = 0, lock_level = 0; -	ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode, -				    file->f_vfsmnt, &lock_level); +	ret = ocfs2_inode_lock_atime(file_inode(file), +				    file->f_path.mnt, &lock_level);  	if (ret < 0) {  		mlog_errno(ret);  		goto out;  	} -	ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); +	ocfs2_inode_unlock(file_inode(file), lock_level);  out:  	vma->vm_ops = &ocfs2_file_vm_ops;  	return 0; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 6083432f667..9f8dcadd9a5 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1055,7 +1055,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)  {  	int status; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ocfs2_move_extents range;  	struct ocfs2_move_extents_context *context = NULL; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f1fd0741162..04ee1b57c24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -512,8 +512,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,  	fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);  	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);  	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); -	fe->i_uid = cpu_to_le32(inode->i_uid); -	fe->i_gid = cpu_to_le32(inode->i_gid); +	fe->i_uid = cpu_to_le32(i_uid_read(inode)); +	fe->i_gid = cpu_to_le32(i_gid_read(inode));  	fe->i_mode = cpu_to_le16(inode->i_mode);  	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))  		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 30a055049e1..998b17eda09 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2927,7 +2927,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,  				     u32 new_cluster, u32 new_len)  {  	int ret = 0, partial; -	struct inode *inode = 
file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ocfs2_caching_info *ci = INODE_CACHE(inode);  	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);  	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); @@ -3020,7 +3020,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,  				    u32 new_cluster, u32 new_len)  {  	int ret = 0; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct super_block *sb = inode->i_sb;  	struct ocfs2_caching_info *ci = INODE_CACHE(inode);  	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -4407,7 +4407,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,  	 * rights to do so.  	 */  	if (preserve) { -		if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) +		if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))  			return -EPERM;  		if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))  			return -EPERM; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 94368017edb..bf1f8930456 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)  	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);  out_free: -	if (rc && conn->cc_private) +	if (rc)  		kfree(conn->cc_private);  out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f169da4624f..b7e74b580c0 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,  	 * cluster groups will be staying in cache for the duration of  	 * this operation.  	 */ -	ac->ac_allow_chain_relink = 0; +	ac->ac_disable_chain_relink = 1;  	/* Claim the first region */  	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, @@ -1823,7 +1823,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,  	 * Do this *after* figuring out how many bits we're taking out  	 * of our target group.  	 */ -	if (ac->ac_allow_chain_relink && +	if (!ac->ac_disable_chain_relink &&  	    (prev_group_bh) &&  	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {  		status = ocfs2_relink_block_group(handle, alloc_inode, @@ -1928,7 +1928,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,  	victim = ocfs2_find_victim_chain(cl);  	ac->ac_chain = victim; -	ac->ac_allow_chain_relink = 1;  	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,  				    res, &bits_left); @@ -1947,7 +1946,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,  	 * searching each chain in order. Don't allow chain relinking  	 * because we only calculate enough journal credits for one  	 * relink per alloc. 
*/ -	ac->ac_allow_chain_relink = 0; +	ac->ac_disable_chain_relink = 1;  	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {  		if (i == victim)  			continue; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b8afabfeede..a36d0aa5091 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,7 +49,7 @@ struct ocfs2_alloc_context {  	/* these are used by the chain search */  	u16    ac_chain; -	int    ac_allow_chain_relink; +	int    ac_disable_chain_relink;  	group_search_t *ac_group_search;  	u64    ac_last_group; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0e91ec22a94..9b6910dec4b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2525,8 +2525,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)  		mlog_errno(status);  finally: -	if (local_alloc) -		kfree(local_alloc); +	kfree(local_alloc);  	if (status)  		mlog_errno(status); @@ -2553,8 +2552,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)  	 * we free it here.  	 */  	kfree(osb->journal); -	if (osb->local_alloc_copy) -		kfree(osb->local_alloc_copy); +	kfree(osb->local_alloc_copy);  	kfree(osb->uuid_str);  	ocfs2_put_dlm_debug(osb->osb_dlm_debug);  	memset(osb, 0, sizeof(struct ocfs2_super)); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index f1fbb4b552a..66edce7ecfd 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -57,7 +57,7 @@  static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)  {  	struct inode *inode = page->mapping->host; -	struct buffer_head *bh; +	struct buffer_head *bh = NULL;  	int status = ocfs2_read_inode_block(inode, &bh);  	struct ocfs2_dinode *fe;  	const char *link; diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 3d635f4bbb2..f053688d22a 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb,  		} else  			osb->local_system_inodes = local_system_inodes;  		spin_unlock(&osb->osb_lock); -		if (unlikely(free)) -			kfree(free); +		kfree(free);  	}  	index = (slot * NUM_LOCAL_SYSTEM_INODES) + diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 0ba9ea1e796..2e3ea308c14 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7189,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,  	struct buffer_head *dir_bh = NULL;  	ret = ocfs2_init_security_get(inode, dir, qstr, NULL); -	if (!ret) { +	if (ret) {  		mlog_errno(ret);  		goto leave;  	} diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index fb5b3ff79dc..acbaebcad3a 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -330,7 +330,7 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,  static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,  		u64 fsblock, int hindex)  { -	struct inode *dir = filp->f_dentry->d_inode; +	struct inode *dir = file_inode(filp);  	struct buffer_head *bh;  	struct omfs_inode *oi;  	u64 self; @@ -405,7 +405,7 @@ out:  static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *dir = filp->f_dentry->d_inode; +	struct inode *dir = file_inode(filp);  	struct buffer_head *bh;  	loff_t offset, res;  	unsigned int hchain, hindex; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 77e3cb2962b..e0d9b3e722b 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)  	return mpage_writepages(mapping, wbc, omfs_get_block);  } +static void omfs_write_failed(struct address_space *mapping, 
loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		omfs_truncate(inode); +	} +} +  static int omfs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,  	ret = block_write_begin(mapping, pos, len, flags, pagep,  				omfs_get_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		omfs_write_failed(mapping, pos + len);  	return ret;  } @@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			return error; +		truncate_setsize(inode, attr->ia_size); +		omfs_truncate(inode);  	}  	setattr_copy(inode, attr); @@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)  const struct inode_operations omfs_file_inops = {  	.setattr = omfs_setattr, -	.truncate = omfs_truncate  };  const struct address_space_operations omfs_aops = { diff --git a/fs/open.c b/fs/open.c index 182d8667b7b..68354466879 100644 --- a/fs/open.c +++ b/fs/open.c @@ -30,6 +30,7 @@  #include <linux/fs_struct.h>  #include <linux/ima.h>  #include <linux/dnotify.h> +#include <linux/compat.h>  #include "internal.h" @@ -61,33 +62,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,  	return ret;  } -static long do_sys_truncate(const char __user *pathname, loff_t length) +long vfs_truncate(struct path *path, loff_t length)  { -	struct path path;  	struct inode *inode; -	int error; +	long error; -	error = -EINVAL; -	if (length < 0)	/* sorry, but loff_t says... */ -		goto out; - -	error = user_path(pathname, &path); -	if (error) -		goto out; -	inode = path.dentry->d_inode; +	inode = path->dentry->d_inode;  	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */ -	error = -EISDIR;  	if (S_ISDIR(inode->i_mode)) -		goto dput_and_out; - -	error = -EINVAL; +		return -EISDIR;  	if (!S_ISREG(inode->i_mode)) -		goto dput_and_out; +		return -EINVAL; -	error = mnt_want_write(path.mnt); +	error = mnt_want_write(path->mnt);  	if (error) -		goto dput_and_out; +		goto out;  	error = inode_permission(inode, MAY_WRITE);  	if (error) @@ -111,25 +101,53 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)  	error = locks_verify_truncate(inode, NULL, length);  	if (!error) -		error = security_path_truncate(&path); +		error = security_path_truncate(path);  	if (!error) -		error = do_truncate(path.dentry, length, 0, NULL); +		error = do_truncate(path->dentry, length, 0, NULL);  put_write_and_out:  	put_write_access(inode);  mnt_drop_write_and_out: -	mnt_drop_write(path.mnt); -dput_and_out: -	path_put(&path); +	mnt_drop_write(path->mnt);  out:  	return error;  } +EXPORT_SYMBOL_GPL(vfs_truncate); + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ +	unsigned int lookup_flags = LOOKUP_FOLLOW; +	struct path path; +	int error; + +	if (length < 0)	/* sorry, but loff_t says... 
*/ +		return -EINVAL; + +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); +	if (!error) { +		error = vfs_truncate(&path, length); +		path_put(&path); +	} +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	} +	return error; +}  SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)  {  	return do_sys_truncate(path, length);  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) +{ +	return do_sys_truncate(path, length); +} +#endif +  static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)  {  	struct inode *inode; @@ -185,6 +203,13 @@ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)  	return ret;  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +{ +	return do_sys_ftruncate(fd, length, 1); +} +#endif +  /* LFS versions of truncate are only needed on 32 bit machines */  #if BITS_PER_LONG == 32  SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) @@ -218,7 +243,7 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);  int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	long ret;  	if (offset < 0 || len <= 0) @@ -306,6 +331,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)  	struct path path;  	struct inode *inode;  	int res; +	unsigned int lookup_flags = LOOKUP_FOLLOW;  	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */  		return -EINVAL; @@ -328,8 +354,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)  	}  	old_cred = override_creds(override_cred); - -	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); +retry: +	res = user_path_at(dfd, filename, lookup_flags, &path);  	if (res)  		goto out; @@ -364,6 +390,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)  out_path_release:  	path_put(&path); +	if (retry_estale(res, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out:  	revert_creds(old_cred);  	put_cred(override_cred); @@ -379,8 +409,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)  {  	struct path path;  	int error; - -	error = user_path_dir(filename, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: +	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);  	if (error)  		goto out; @@ -392,6 +423,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)  dput_and_out:  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out:  	return error;  } @@ -406,7 +441,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)  	if (!f.file)  		goto out; -	inode = f.file->f_path.dentry->d_inode; +	inode = file_inode(f.file);  	error = -ENOTDIR;  	if (!S_ISDIR(inode->i_mode)) @@ -425,8 +460,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)  {  	struct path path;  	int error; - -	error = user_path_dir(filename, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: +	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);  	if (error)  		goto out; @@ -445,6 +481,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)  	error = 0;  dput_and_out:  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out:  	return 
error;  } @@ -489,11 +529,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode  {  	struct path path;  	int error; - -	error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: +	error = user_path_at(dfd, filename, lookup_flags, &path);  	if (!error) {  		error = chmod_common(&path, mode);  		path_put(&path); +		if (retry_estale(error, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		}  	}  	return error;  } @@ -552,6 +597,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,  	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;  	if (flag & AT_EMPTY_PATH)  		lookup_flags |= LOOKUP_EMPTY; +retry:  	error = user_path_at(dfd, filename, lookup_flags, &path);  	if (error)  		goto out; @@ -562,6 +608,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,  	mnt_drop_write(path.mnt);  out_release:  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out:  	return error;  } @@ -654,7 +704,7 @@ static int do_dentry_open(struct file *f,  		f->f_mode = FMODE_PATH;  	path_get(&f->f_path); -	inode = f->f_path.dentry->d_inode; +	inode = f->f_inode = f->f_path.dentry->d_inode;  	if (f->f_mode & FMODE_WRITE) {  		error = __get_file_write_access(inode, f->f_path.mnt);  		if (error) @@ -664,7 +714,6 @@ static int do_dentry_open(struct file *f,  	}  	f->f_mapping = inode->i_mapping; -	f->f_pos = 0;  	file_sb_list_add(f, inode->i_sb);  	if (unlikely(f->f_mode & FMODE_PATH)) { @@ -718,6 +767,7 @@ cleanup_file:  	path_put(&f->f_path);  	f->f_path.mnt = NULL;  	f->f_path.dentry = NULL; +	f->f_inode = NULL;  	return error;  } @@ -775,23 +825,22 @@ struct file *dentry_open(const struct path *path, int flags,  	/* We must always pass in a valid mount pointer. 
*/  	BUG_ON(!path->mnt); -	error = -ENFILE;  	f = get_empty_filp(); -	if (f == NULL) -		return ERR_PTR(error); - -	f->f_flags = flags; -	f->f_path = *path; -	error = do_dentry_open(f, NULL, cred); -	if (!error) { -		error = open_check_o_direct(f); -		if (error) { -			fput(f); +	if (!IS_ERR(f)) { +		f->f_flags = flags; +		f->f_path = *path; +		error = do_dentry_open(f, NULL, cred); +		if (!error) { +			/* from now on we need fput() to dispose of f */ +			error = open_check_o_direct(f); +			if (error) { +				fput(f); +				f = ERR_PTR(error); +			} +		} else {  +			put_filp(f);  			f = ERR_PTR(error);  		} -	} else {  -		put_filp(f); -		f = ERR_PTR(error);  	}  	return f;  } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 2ad080faca3..ae47fa7efb9 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -262,7 +262,7 @@ found:  static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct op_inode_info *oi = OP_I(inode);  	struct device_node *dp = oi->u.node;  	struct device_node *child; diff --git a/fs/pipe.c b/fs/pipe.c index bd3479db4b6..64a494cef0a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -361,7 +361,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,  	   unsigned long nr_segs, loff_t pos)  {  	struct file *filp = iocb->ki_filp; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct pipe_inode_info *pipe;  	int do_wakeup;  	ssize_t ret; @@ -486,7 +486,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,  	    unsigned long nr_segs, loff_t ppos)  {  	struct file *filp = iocb->ki_filp; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct pipe_inode_info *pipe;  	ssize_t ret;  	int do_wakeup; @@ -677,7 +677,7 @@ bad_pipe_w(struct file *filp, const char __user *buf, size_t count,  static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct pipe_inode_info *pipe;  	int count, buf, nrbufs; @@ -705,7 +705,7 @@ static unsigned int  pipe_poll(struct file *filp, poll_table *wait)  {  	unsigned int mask; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct pipe_inode_info *pipe = inode->i_pipe;  	int nrbufs; @@ -758,7 +758,7 @@ pipe_release(struct inode *inode, int decr, int decw)  static int  pipe_read_fasync(int fd, struct file *filp, int on)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int retval;  	mutex_lock(&inode->i_mutex); @@ -772,7 +772,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)  static int  pipe_write_fasync(int fd, struct file *filp, int on)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int retval;  	mutex_lock(&inode->i_mutex); @@ -786,7 +786,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)  static int  pipe_rdwr_fasync(int fd, struct file *filp, int on)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct pipe_inode_info *pipe = inode->i_pipe;  	int retval; @@ -1037,13 +1037,13 @@ int create_pipe_files(struct file **res, int flags)  	err = -ENFILE;  	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); -	if (!f) +	if (IS_ERR(f))  		goto err_dentry;  
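/*
 * Note on the surrounding hunks: get_empty_filp() and alloc_file()
 * now report failure as an ERR_PTR()-encoded error instead of NULL,
 * so callers such as dentry_open() and create_pipe_files() switch
 * from NULL checks to IS_ERR().  A minimal caller-side sketch
 * (illustrative only; "my_fops" is a placeholder, not from this patch):
 *
 *	struct file *f = alloc_file(&path, FMODE_WRITE, &my_fops);
 *	if (IS_ERR(f))
 *		return PTR_ERR(f);	(e.g. -ENFILE or -ENOMEM)
 */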
	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));  	res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); -	if (!res[0]) +	if (IS_ERR(res[0]))  		goto err_file;  	path_get(&path); @@ -1226,7 +1226,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,   */  struct pipe_inode_info *get_pipe_info(struct file *file)  { -	struct inode *i = file->f_path.dentry->d_inode; +	struct inode *i = file_inode(file);  	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;  } diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 981b0560193..712f24db960 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,8 @@ proc-y			:= nommu.o task_nommu.o  proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o  proc-y       += inode.o root.o base.o generic.o array.o \ -		proc_tty.o fd.o +		fd.o +proc-$(CONFIG_TTY)      += proc_tty.o  proc-y	+= cmdline.o  proc-y	+= consoles.o  proc-y	+= cpuinfo.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcb..f7ed9ee46eb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,  			do {  				min_flt += t->min_flt;  				maj_flt += t->maj_flt; -				gtime += t->gtime; +				gtime += task_gtime(t);  				t = next_thread(t);  			} while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,  		min_flt = task->min_flt;  		maj_flt = task->maj_flt;  		task_cputime_adjusted(task, &utime, &stime); -		gtime = task->gtime; +		gtime = task_gtime(task);  	}  	/* scale priority and nice values from timeslices to -20..20 */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a5a0be40e4..69078c7cef1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -73,6 +73,7 @@  #include <linux/security.h>  #include <linux/ptrace.h>  #include <linux/tracehook.h> +#include <linux/printk.h>  #include <linux/cgroup.h>  #include <linux/cpuset.h>  #include <linux/audit.h> @@ -383,7 +384,7 @@ static int lstats_open(struct inode *inode, struct file *file)  static ssize_t lstats_write(struct file *file, const char __user *buf,  			    size_t count, loff_t *offs)  { -	struct task_struct *task = get_proc_task(file->f_dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	if (!task)  		return -ESRCH; @@ -542,13 +543,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)  	if (error)  		return error; -	if ((attr->ia_valid & ATTR_SIZE) && -	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); -		if (error) -			return error; -	} -  	setattr_copy(inode, attr);  	mark_inode_dirty(inode);  	return 0; @@ -609,7 +603,7 @@ static const struct inode_operations proc_def_inode_operations = {  static ssize_t proc_info_read(struct file * file, char __user * buf,  			  size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	unsigned long page;  	ssize_t length;  	struct task_struct *task = get_proc_task(inode); @@ -675,7 +669,7 @@ static const struct file_operations proc_single_file_operations = {  static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)  { -	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	struct mm_struct *mm;  	if (!task) @@ -876,7 +870,7 @@ static const struct file_operations proc_environ_operations = {  static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,  			    
loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	char buffer[PROC_NUMBUF];  	int oom_adj = OOM_ADJUST_MIN;  	size_t len; @@ -923,7 +917,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,  		goto out;  	} -	task = get_proc_task(file->f_path.dentry->d_inode); +	task = get_proc_task(file_inode(file));  	if (!task) {  		err = -ESRCH;  		goto out; @@ -959,7 +953,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,  	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use  	 * /proc/pid/oom_score_adj instead.  	 */ -	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", +	pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",  		  current->comm, task_pid_nr(current), task_pid_nr(task),  		  task_pid_nr(task)); @@ -983,7 +977,7 @@ static const struct file_operations proc_oom_adj_operations = {  static ssize_t oom_score_adj_read(struct file *file, char __user *buf,  					size_t count, loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	char buffer[PROC_NUMBUF];  	short oom_score_adj = OOM_SCORE_ADJ_MIN;  	unsigned long flags; @@ -1026,7 +1020,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,  		goto out;  	} -	task = get_proc_task(file->f_path.dentry->d_inode); +	task = get_proc_task(file_inode(file));  	if (!task) {  		err = -ESRCH;  		goto out; @@ -1074,7 +1068,7 @@ static const struct file_operations proc_oom_score_adj_operations = {  static ssize_t proc_loginuid_read(struct file * file, char __user * buf,  				  size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	struct task_struct *task = get_proc_task(inode);  	ssize_t length;  	char tmpbuf[TMPBUFLEN]; @@ -1091,7 +1085,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,  static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,  				   size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	char *page, *tmp;  	ssize_t length;  	uid_t loginuid; @@ -1149,7 +1143,7 @@ static const struct file_operations proc_loginuid_operations = {  static ssize_t proc_sessionid_read(struct file * file, char __user * buf,  				  size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	struct task_struct *task = get_proc_task(inode);  	ssize_t length;  	char tmpbuf[TMPBUFLEN]; @@ -1172,7 +1166,7 @@ static const struct file_operations proc_sessionid_operations = {  static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,  				      size_t count, loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	char buffer[PROC_NUMBUF];  	size_t len;  	int make_it_fail; @@ -1204,7 +1198,7 @@ static ssize_t proc_fault_inject_write(struct file * file,  	make_it_fail = simple_strtol(strstrip(buffer), &end, 0);  	if (*end)  		return -EINVAL; -	task = get_proc_task(file->f_dentry->d_inode); +	task = get_proc_task(file_inode(file));  	if (!task)  		return -ESRCH;  	task->make_it_fail = 
make_it_fail; @@ -1244,7 +1238,7 @@ static ssize_t  sched_write(struct file *file, const char __user *buf,  	    size_t count, loff_t *offset)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct task_struct *p;  	p = get_proc_task(inode); @@ -1295,7 +1289,7 @@ static ssize_t  sched_autogroup_write(struct file *file, const char __user *buf,  	    size_t count, loff_t *offset)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct task_struct *p;  	char buffer[PROC_NUMBUF];  	int nice; @@ -1350,7 +1344,7 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {  static ssize_t comm_write(struct file *file, const char __user *buf,  				size_t count, loff_t *offset)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct task_struct *p;  	char buffer[TASK_COMM_LEN]; @@ -1718,7 +1712,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)  		return -ECHILD;  	if (!capable(CAP_SYS_ADMIN)) { -		status = -EACCES; +		status = -EPERM;  		goto out_notask;  	} @@ -1851,7 +1845,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,  	struct dentry *result;  	struct mm_struct *mm; -	result = ERR_PTR(-EACCES); +	result = ERR_PTR(-EPERM);  	if (!capable(CAP_SYS_ADMIN))  		goto out; @@ -1907,7 +1901,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)  	ino_t ino;  	int ret; -	ret = -EACCES; +	ret = -EPERM;  	if (!capable(CAP_SYS_ADMIN))  		goto out; @@ -2153,7 +2147,7 @@ out_no_task:  static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,  				  size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	char *p = NULL;  	ssize_t length;  	struct task_struct *task = get_proc_task(inode); @@ -2174,7 +2168,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,  static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,  				   size_t count, loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	char *page;  	ssize_t length;  	struct task_struct *task = get_proc_task(inode); @@ -2263,7 +2257,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {  static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,  					 size_t count, loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	struct mm_struct *mm;  	char buffer[PROC_NUMBUF];  	size_t len; @@ -2315,7 +2309,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,  		goto out_no_task;  	ret = -ESRCH; -	task = get_proc_task(file->f_dentry->d_inode); +	task = get_proc_task(file_inode(file));  	if (!task)  		goto out_no_task; @@ -2625,6 +2619,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)  	name.name = buf;  	name.len = snprintf(buf, sizeof(buf), "%d", pid); +	/* no ->d_hash() rejects on procfs */  	dentry = d_hash_and_lookup(mnt->mnt_root, &name);  	if (dentry) {  		shrink_dcache_parent(dentry); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7b3ae3cc0ef..4b3b3ffb52f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -15,6 +15,7 @@  #include <linux/mm.h>  #include <linux/module.h>  #include <linux/slab.h> +#include <linux/printk.h>  
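/*
 * Note on this file's hunks: the new <linux/printk.h> include backs
 * the printk(KERN_ERR/KERN_WARNING ...) -> pr_warn() conversions
 * below, and the proc_inum_lock hunks switch to the IRQ-safe
 * spin_lock_irq()/spin_lock_irqsave() variants.  proc_free_inum()
 * takes the irqsave form, presumably because it can be reached from
 * contexts where interrupts are already disabled (e.g. RCU
 * callbacks), where plain spin_unlock_irq() would wrongly re-enable
 * them.
 */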
#include <linux/mount.h>  #include <linux/init.h>  #include <linux/idr.h> @@ -42,7 +43,7 @@ static ssize_t  __proc_file_read(struct file *file, char __user *buf, size_t nbytes,  	       loff_t *ppos)  { -	struct inode * inode = file->f_path.dentry->d_inode; +	struct inode * inode = file_inode(file);  	char 	*page;  	ssize_t	retval=0;  	int	eof=0; @@ -132,11 +133,8 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes,  		}  		if (start == NULL) { -			if (n > PAGE_SIZE) { -				printk(KERN_ERR -				       "proc_file_read: Apparent buffer overflow!\n"); +			if (n > PAGE_SIZE)	/* Apparent buffer overflow */  				n = PAGE_SIZE; -			}  			n -= *ppos;  			if (n <= 0)  				break; @@ -144,26 +142,19 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes,  				n = count;  			start = page + *ppos;  		} else if (start < page) { -			if (n > PAGE_SIZE) { -				printk(KERN_ERR -				       "proc_file_read: Apparent buffer overflow!\n"); +			if (n > PAGE_SIZE)	/* Apparent buffer overflow */  				n = PAGE_SIZE; -			}  			if (n > count) {  				/*  				 * Don't reduce n because doing so might  				 * cut off part of a data block.  				 */ -				printk(KERN_WARNING -				       "proc_file_read: Read count exceeded\n"); +				pr_warn("proc_file_read: count exceeded\n");  			}  		} else /* start >= page */ {  			unsigned long startoff = (unsigned long)(start - page); -			if (n > (PAGE_SIZE - startoff)) { -				printk(KERN_ERR -				       "proc_file_read: Apparent buffer overflow!\n"); +			if (n > (PAGE_SIZE - startoff))	/* buffer overflow? */  				n = PAGE_SIZE - startoff; -			}  			if (n > count)  				n = count;  		} @@ -188,7 +179,7 @@ static ssize_t  proc_file_read(struct file *file, char __user *buf, size_t nbytes,  	       loff_t *ppos)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	ssize_t rv = -EIO;  	spin_lock(&pde->pde_unload_lock); @@ -209,7 +200,7 @@ static ssize_t  proc_file_write(struct file *file, const char __user *buffer,  		size_t count, loff_t *ppos)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	ssize_t rv = -EIO;  	if (pde->write_proc) { @@ -261,16 +252,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)  	if (error)  		return error; -	if ((iattr->ia_valid & ATTR_SIZE) && -	    iattr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, iattr->ia_size); -		if (error) -			return error; -	} -  	setattr_copy(inode, iattr);  	mark_inode_dirty(inode); -	 +  	de->uid = inode->i_uid;  	de->gid = inode->i_gid;  	de->mode = inode->i_mode; @@ -359,18 +343,18 @@ retry:  	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))  		return -ENOMEM; -	spin_lock(&proc_inum_lock); +	spin_lock_irq(&proc_inum_lock);  	error = ida_get_new(&proc_inum_ida, &i); -	spin_unlock(&proc_inum_lock); +	spin_unlock_irq(&proc_inum_lock);  	if (error == -EAGAIN)  		goto retry;  	else if (error)  		return error;  	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { -		spin_lock(&proc_inum_lock); +		spin_lock_irq(&proc_inum_lock);  		ida_remove(&proc_inum_ida, i); -		spin_unlock(&proc_inum_lock); +		spin_unlock_irq(&proc_inum_lock);  		return -ENOSPC;  	}  	*inum = PROC_DYNAMIC_FIRST + i; @@ -379,9 +363,10 @@ retry:  void proc_free_inum(unsigned int inum)  { -	spin_lock(&proc_inum_lock); +	unsigned long flags; +	spin_lock_irqsave(&proc_inum_lock, flags);  	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); -	
spin_unlock(&proc_inum_lock); +	spin_unlock_irqrestore(&proc_inum_lock, flags);  }  static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) @@ -418,8 +403,7 @@ static const struct dentry_operations proc_dentry_operations =  struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,  		struct dentry *dentry)  { -	struct inode *inode = NULL; -	int error = -ENOENT; +	struct inode *inode;  	spin_lock(&proc_subdir_lock);  	for (de = de->subdir; de ; de = de->next) { @@ -428,22 +412,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,  		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {  			pde_get(de);  			spin_unlock(&proc_subdir_lock); -			error = -ENOMEM;  			inode = proc_get_inode(dir->i_sb, de); -			goto out_unlock; +			if (!inode) +				return ERR_PTR(-ENOMEM); +			d_set_d_op(dentry, &proc_dentry_operations); +			d_add(dentry, inode); +			return NULL;  		}  	}  	spin_unlock(&proc_subdir_lock); -out_unlock: - -	if (inode) { -		d_set_d_op(dentry, &proc_dentry_operations); -		d_add(dentry, inode); -		return NULL; -	} -	if (de) -		pde_put(de); -	return ERR_PTR(error); +	return ERR_PTR(-ENOENT);  }  struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, @@ -466,7 +444,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,  {  	unsigned int ino;  	int i; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	int ret = 0;  	ino = inode->i_ino; @@ -528,7 +506,7 @@ out:  int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return proc_readdir_de(PDE(inode), filp, dirent, filldir);  } @@ -582,7 +560,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp  	for (tmp = dir->subdir; tmp; tmp = tmp->next)  		if (strcmp(tmp->name, dp->name) == 0) { -			WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", +			WARN(1, "proc_dir_entry '%s/%s' already registered\n",  				dir->name, dp->name);  			break;  		} @@ -843,9 +821,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)  	if (S_ISDIR(de->mode))  		parent->nlink--;  	de->nlink = 0; -	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " -			"'%s/%s', leaking at least '%s'\n", __func__, -			de->parent->name, de->name, de->subdir->name); +	WARN(de->subdir, "%s: removing non-empty directory " +			 "'%s/%s', leaking at least '%s'\n", __func__, +			 de->parent->name, de->name, de->subdir->name);  	pde_put(de);  }  EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 439ae688650..a86aebc9ba7 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -13,6 +13,7 @@  #include <linux/stat.h>  #include <linux/completion.h>  #include <linux/poll.h> +#include <linux/printk.h>  #include <linux/file.h>  #include <linux/limits.h>  #include <linux/init.h> @@ -144,7 +145,7 @@ void pde_users_dec(struct proc_dir_entry *pde)  static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	loff_t rv = -EINVAL;  	loff_t (*llseek)(struct file *, loff_t, int); @@ -179,7 +180,7 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)  static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)  { -	struct 
proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	ssize_t rv = -EIO;  	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); @@ -201,7 +202,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,  static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	ssize_t rv = -EIO;  	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); @@ -223,7 +224,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t  static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	unsigned int rv = DEFAULT_POLLMASK;  	unsigned int (*poll)(struct file *, struct poll_table_struct *); @@ -245,7 +246,7 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p  static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	long rv = -ENOTTY;  	long (*ioctl)(struct file *, unsigned int, unsigned long); @@ -268,7 +269,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne  #ifdef CONFIG_COMPAT  static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	long rv = -ENOTTY;  	long (*compat_ioctl)(struct file *, unsigned int, unsigned long); @@ -291,7 +292,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned  static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)  { -	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); +	struct proc_dir_entry *pde = PDE(file_inode(file));  	int rv = -EIO;  	int (*mmap)(struct file *, struct vm_area_struct *); @@ -445,12 +446,9 @@ static const struct file_operations proc_reg_file_ops_no_compat = {  struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)  { -	struct inode * inode; +	struct inode *inode = iget_locked(sb, de->low_ino); -	inode = iget_locked(sb, de->low_ino); -	if (!inode) -		return NULL; -	if (inode->i_state & I_NEW) { +	if (inode && (inode->i_state & I_NEW)) {  		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  		PROC_I(inode)->pde = de; @@ -482,10 +480,12 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)  	} else  	       pde_put(de);  	return inode; -}			 +}  int proc_fill_super(struct super_block *s)  { +	struct inode *root_inode; +  	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;  	s->s_blocksize = 1024;  	s->s_blocksize_bits = 10; @@ -494,11 +494,17 @@ int proc_fill_super(struct super_block *s)  	s->s_time_gran = 1;  	pde_get(&proc_root); -	s->s_root = d_make_root(proc_get_inode(s, &proc_root)); -	if (s->s_root) -		return 0; +	root_inode = proc_get_inode(s, &proc_root); +	if (!root_inode) { +		pr_err("proc_fill_super: get root inode failed\n"); +		return -ENOMEM; +	} -	printk("proc_read_super: get root inode failed\n"); -	pde_put(&proc_root); -	return -ENOMEM; +	s->s_root = d_make_root(root_inode); +	if (!s->s_root) 
{ +		pr_err("proc_fill_super: allocate dentry failed\n"); +		return -ENOMEM; +	} + +	return 0;  } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 252544c0520..85ff3a4598b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@  #include <linux/sched.h>  #include <linux/proc_fs.h> +#include <linux/binfmts.h>  struct  ctl_table_header;  struct  mempolicy; @@ -108,7 +109,7 @@ static inline int task_dumpable(struct task_struct *task)  	if (mm)  		dumpable = get_dumpable(mm);  	task_unlock(task); -	if (dumpable == SUID_DUMPABLE_ENABLED) +	if (dumpable == SUID_DUMP_USER)  		return 1;  	return 0;  } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e96d4f18ca3..eda6f017f27 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -17,6 +17,7 @@  #include <linux/elfcore.h>  #include <linux/vmalloc.h>  #include <linux/highmem.h> +#include <linux/printk.h>  #include <linux/bootmem.h>  #include <linux/init.h>  #include <linux/slab.h> @@ -619,7 +620,7 @@ static int __init proc_kcore_init(void)  	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,  				      &proc_kcore_operations);  	if (!proc_root_kcore) { -		printk(KERN_ERR "couldn't create /proc/kcore\n"); +		pr_err("couldn't create /proc/kcore\n");  		return 0; /* Always returns 0. */  	}  	/* Store text area if it's special */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645f799..1efaaa19c4f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)  		* sysctl_overcommit_ratio / 100) + total_swap_pages;  	cached = global_page_state(NR_FILE_PAGES) - -			total_swapcache_pages - i.bufferram; +			total_swapcache_pages() - i.bufferram;  	if (cached < 0)  		cached = 0; @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)  		K(i.freeram),  		K(i.bufferram),  		K(cached), -		K(total_swapcache_pages), +		K(total_swapcache_pages()),  		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),  		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),  		K(pages[LRU_ACTIVE_ANON]), @@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)  		vmi.used >> 10,  		vmi.largest_chunk >> 10  #ifdef CONFIG_MEMORY_FAILURE -		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +		,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)  #endif  #ifdef CONFIG_TRANSPARENT_HUGEPAGE  		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index b1822dde55c..ccfd99bd1c5 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -45,7 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)  	file = region->vm_file;  	if (file) { -		struct inode *inode = region->vm_file->f_path.dentry->d_inode; +		struct inode *inode = file_inode(region->vm_file);  		dev = inode->i_sb->s_dev;  		ino = inode->i_ino;  	} diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de20ec480fa..30b590f5bd3 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -8,6 +8,7 @@  #include <linux/time.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h> +#include <linux/printk.h>  #include <linux/stat.h>  #include <linux/string.h>  #include <linux/of.h> @@ -110,8 +111,8 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde,  		if (ent->data == oldprop)  			break;  	if (ent == NULL) { -		printk(KERN_WARNING "device-tree: property \"%s\" " -		       " does not exist\n", oldprop->name); +		pr_warn("device-tree: property \"%s\" does not 
exist\n", +			oldprop->name);  	} else {  		ent->data = newprop;  		ent->size = newprop->length; @@ -153,8 +154,8 @@ static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,  realloc:  	fixed_name = kmalloc(fixup_len, GFP_KERNEL);  	if (fixed_name == NULL) { -		printk(KERN_ERR "device-tree: Out of memory trying to fixup " -				"name \"%s\"\n", name); +		pr_err("device-tree: Out of memory trying to fixup " +		       "name \"%s\"\n", name);  		return name;  	} @@ -175,8 +176,8 @@ retry:  		goto retry;  	} -	printk(KERN_WARNING "device-tree: Duplicate name in %s, " -			"renamed to \"%s\"\n", np->full_name, fixed_name); +	pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", +		np->full_name, fixed_name);  	return fixed_name;  } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index fe72cd073de..b4ac6572474 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -163,7 +163,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,  	struct net *net;  	ret = -EINVAL; -	net = get_proc_task_net(filp->f_path.dentry->d_inode); +	net = get_proc_task_net(file_inode(filp));  	if (net != NULL) {  		ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);  		put_net(net); @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = {  	.readdir	= proc_tgid_net_readdir,  }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, -	const char *name, umode_t mode, const struct file_operations *fops) -{ -	return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ -	remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); -  static __net_init int proc_net_ns_init(struct net *net)  {  	struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 701580ddfcc..ac05f33a0dd 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@  #include <linux/sysctl.h>  #include <linux/poll.h>  #include <linux/proc_fs.h> +#include <linux/printk.h>  #include <linux/security.h>  #include <linux/sched.h>  #include <linux/namei.h> @@ -57,7 +58,7 @@ static void sysctl_print_dir(struct ctl_dir *dir)  {  	if (dir->header.parent)  		sysctl_print_dir(dir->header.parent); -	printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); +	pr_cont("%s/", dir->header.ctl_table[0].procname);  }  static int namecmp(const char *name1, int len1, const char *name2, int len2) @@ -134,9 +135,9 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)  		else if (cmp > 0)  			p = &(*p)->rb_right;  		else { -			printk(KERN_ERR "sysctl duplicate entry: "); +			pr_err("sysctl duplicate entry: ");  			sysctl_print_dir(head->parent); -			printk(KERN_CONT "/%s\n", entry->procname); +			pr_cont("/%s\n", entry->procname);  			return -EEXIST;  		}  	} @@ -478,7 +479,7 @@ out:  static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,  		size_t count, loff_t *ppos, int write)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ctl_table_header *head = grab_header(inode);  	struct ctl_table *table = PROC_I(inode)->sysctl_entry;  	ssize_t error; @@ -542,7 +543,7 @@ static int proc_sys_open(struct inode *inode, struct file *filp)  static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	
struct ctl_table_header *head = grab_header(inode);  	struct ctl_table *table = PROC_I(inode)->sysctl_entry;  	unsigned int ret = DEFAULT_POLLMASK; @@ -736,13 +737,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)  	if (error)  		return error; -	if ((attr->ia_valid & ATTR_SIZE) && -	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); -		if (error) -			return error; -	} -  	setattr_copy(inode, attr);  	mark_inode_dirty(inode);  	return 0; @@ -934,9 +928,9 @@ found:  	subdir->header.nreg++;  failed:  	if (unlikely(IS_ERR(subdir))) { -		printk(KERN_ERR "sysctl could not get directory: "); +		pr_err("sysctl could not get directory: ");  		sysctl_print_dir(dir); -		printk(KERN_CONT "/%*.*s %ld\n", +		pr_cont("/%*.*s %ld\n",  			namelen, namelen, name, PTR_ERR(subdir));  	}  	drop_sysctl_table(&dir->header); @@ -1002,8 +996,8 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)  	vaf.fmt = fmt;  	vaf.va = &args; -	printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", -		path, table->procname, &vaf); +	pr_err("sysctl table check failed: %s/%s %pV\n", +	       path, table->procname, &vaf);  	va_end(args);  	return -EINVAL; @@ -1517,9 +1511,9 @@ static void put_links(struct ctl_table_header *header)  			drop_sysctl_table(link_head);  		}  		else { -			printk(KERN_ERR "sysctl link missing during unregister: "); +			pr_err("sysctl link missing during unregister: ");  			sysctl_print_dir(parent); -			printk(KERN_CONT "/%s\n", name); +			pr_cont("/%s\n", name);  		}  	}  } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 448455b7fd9..3e636d864d5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -271,7 +271,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)  	const char *name = NULL;  	if (file) { -		struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +		struct inode *inode = file_inode(vma->vm_file);  		dev = inode->i_sb->s_dev;  		ino = inode->i_ino;  		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -743,7 +743,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,  		return rv;  	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)  		return -EINVAL; -	task = get_proc_task(file->f_path.dentry->d_inode); +	task = get_proc_task(file_inode(file));  	if (!task)  		return -ESRCH;  	mm = get_task_mm(task); @@ -1015,7 +1015,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,  static ssize_t pagemap_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	struct mm_struct *mm;  	struct pagemapread pm;  	int ret = -ESRCH; @@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)  	walk.mm = mm;  	pol = get_vma_policy(task, vma, vma->vm_start); -	mpol_to_str(buffer, sizeof(buffer), pol, 0); +	mpol_to_str(buffer, sizeof(buffer), pol);  	mpol_cond_put(pol);  	seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 1ccfa537f5f..56123a6f462 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -149,7 +149,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,  	file = vma->vm_file;  	if (file) { -		struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +		struct inode *inode = file_inode(vma->vm_file);  		dev = inode->i_sb->s_dev;  		
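The clear_refs_write() and pagemap_read() conversions above follow the same shape as the earlier /proc/<pid> handlers: resolve the target task from the opened file's inode, do the work, and drop the task reference. A minimal sketch of that pattern, with error handling trimmed and proc_pid_example_read as a hypothetical handler name:

	static ssize_t proc_pid_example_read(struct file *file, char __user *buf,
					     size_t count, loff_t *ppos)
	{
		struct task_struct *task = get_proc_task(file_inode(file));
		ssize_t ret = 0;

		if (!task)
			return -ESRCH;	/* task exited; the dentry can outlive it */
		/* ... copy per-task data to 'buf' here ... */
		put_task_struct(task);
		return ret;
	}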
ino = inode->i_ino;  		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0d5071d2998..b870f740ab5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -15,6 +15,7 @@  #include <linux/export.h>  #include <linux/slab.h>  #include <linux/highmem.h> +#include <linux/printk.h>  #include <linux/bootmem.h>  #include <linux/init.h>  #include <linux/crash_dump.h> @@ -175,15 +176,15 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,  	start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);  	if (!curr_m)          	return -EINVAL; -	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) -		tsz = buflen; - -	/* Calculate left bytes in current memory segment. */ -	nr_bytes = (curr_m->size - (start - curr_m->paddr)); -	if (tsz > nr_bytes) -		tsz = nr_bytes;  	while (buflen) { +		tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); + +		/* Calculate left bytes in current memory segment. */ +		nr_bytes = (curr_m->size - (start - curr_m->paddr)); +		if (tsz > nr_bytes) +			tsz = nr_bytes; +  		tmp = read_from_oldmem(buffer, tsz, &start, 1);  		if (tmp < 0)  			return tmp; @@ -198,12 +199,6 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,  						struct vmcore, list);  			start = curr_m->paddr;  		} -		if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) -			tsz = buflen; -		/* Calculate left bytes in current memory segment. */ -		nr_bytes = (curr_m->size - (start - curr_m->paddr)); -		if (tsz > nr_bytes) -			tsz = nr_bytes;  	}  	return acc;  } @@ -553,8 +548,7 @@ static int __init parse_crash_elf64_headers(void)  		ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||  		ehdr.e_phentsize != sizeof(Elf64_Phdr) ||  		ehdr.e_phnum == 0) { -		printk(KERN_WARNING "Warning: Core image elf header is not" -					"sane\n"); +		pr_warn("Warning: Core image elf header is not sane\n");  		return -EINVAL;  	} @@ -609,8 +603,7 @@ static int __init parse_crash_elf32_headers(void)  		ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||  		ehdr.e_phentsize != sizeof(Elf32_Phdr) ||  		ehdr.e_phnum == 0) { -		printk(KERN_WARNING "Warning: Core image elf header is not" -					"sane\n"); +		pr_warn("Warning: Core image elf header is not sane\n");  		return -EINVAL;  	} @@ -653,8 +646,7 @@ static int __init parse_crash_elf_headers(void)  	if (rc < 0)  		return rc;  	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { -		printk(KERN_WARNING "Warning: Core image elf header" -					" not found\n"); +		pr_warn("Warning: Core image elf header not found\n");  		return -EINVAL;  	} @@ -673,8 +665,7 @@ static int __init parse_crash_elf_headers(void)  		/* Determine vmcore size. 
*/  		vmcore_size = get_vmcore_size_elf32(elfcorebuf);  	} else { -		printk(KERN_WARNING "Warning: Core image elf header is not" -					" sane\n"); +		pr_warn("Warning: Core image elf header is not sane\n");  		return -EINVAL;  	}  	return 0; @@ -690,7 +681,7 @@ static int __init vmcore_init(void)  		return rc;  	rc = parse_crash_elf_headers();  	if (rc) { -		printk(KERN_WARNING "Kdump: vmcore not initialized\n"); +		pr_warn("Kdump: vmcore not initialized\n");  		return rc;  	} diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 67de74ca85f..e4bcb2cf055 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -418,9 +418,25 @@ static struct file_system_type pstore_fs_type = {  	.kill_sb	= pstore_kill_sb,  }; +static struct kobject *pstore_kobj; +  static int __init init_pstore_fs(void)  { -	return register_filesystem(&pstore_fs_type); +	int err = 0; + +	/* Create a convenient mount point for people to access pstore */ +	pstore_kobj = kobject_create_and_add("pstore", fs_kobj); +	if (!pstore_kobj) { +		err = -ENOMEM; +		goto out; +	} + +	err = register_filesystem(&pstore_fs_type); +	if (err < 0) +		kobject_put(pstore_kobj); + +out: +	return err;  }  module_init(init_pstore_fs) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5ea2e77ff02..86d1038b5a1 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)  	}  } +bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +{ +	/* +	 * In case of NMI path, pstore shouldn't be blocked +	 * regardless of reason. +	 */ +	if (in_nmi()) +		return true; + +	switch (reason) { +	/* In panic case, other cpus are stopped by smp_send_stop(). */ +	case KMSG_DUMP_PANIC: +	/* Emergency restart shouldn't be blocked by spin lock. */ +	case KMSG_DUMP_EMERG: +		return true; +	default: +		return false; +	} +} +EXPORT_SYMBOL_GPL(pstore_cannot_block_path); +  /*   * callback from kmsg_dump. (s2,l2) has the most recently   * written bytes, older bytes are in (s1,l1). Save as much @@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper,  	why = get_reason_str(reason); -	if (in_nmi()) { -		is_locked = spin_trylock(&psinfo->buf_lock); -		if (!is_locked) -			pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); +	if (pstore_cannot_block_path(reason)) { +		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); +		if (!is_locked) { +			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" +				       , in_nmi() ? "NMI" : why); +		}  	} else  		spin_lock_irqsave(&psinfo->buf_lock, flags);  	oopscount++; @@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,  		total += hsize + len;  		part++;  	} -	if (in_nmi()) { +	if (pstore_cannot_block_path(reason)) {  		if (is_locked) -			spin_unlock(&psinfo->buf_lock); +			spin_unlock_irqrestore(&psinfo->buf_lock, flags);  	} else  		spin_unlock_irqrestore(&psinfo->buf_lock, flags);  } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e7430..288f068740f 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,  static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)  {  	char *hdr; -	struct timeval timestamp; +	struct timespec timestamp;  	size_t len; -	do_gettimeofday(×tamp); +	/* Report zeroed timestamp if called before timekeeping has resumed. 
*/ +	if (__getnstimeofday(×tamp)) { +		timestamp.tv_sec = 0; +		timestamp.tv_nsec = 0; +	}  	hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", -		(long)timestamp.tv_sec, (long)timestamp.tv_usec); +		(long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000));  	WARN_ON_ONCE(!hdr);  	len = hdr ? strlen(hdr) : 0;  	persistent_ram_write(prz, hdr, len); @@ -291,9 +295,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt)  	kfree(cxt->przs);  } -static int __devinit ramoops_init_przs(struct device *dev, -				       struct ramoops_context *cxt, -				       phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, +			     phys_addr_t *paddr, size_t dump_mem_sz)  {  	int err = -ENOMEM;  	int i; @@ -336,10 +339,9 @@ fail_prz:  	return err;  } -static int __devinit ramoops_init_prz(struct device *dev, -				      struct ramoops_context *cxt, -				      struct persistent_ram_zone **prz, -				      phys_addr_t *paddr, size_t sz, u32 sig) +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, +			    struct persistent_ram_zone **prz, +			    phys_addr_t *paddr, size_t sz, u32 sig)  {  	if (!sz)  		return 0; @@ -367,7 +369,7 @@ static int __devinit ramoops_init_prz(struct device *dev,  	return 0;  } -static int __devinit ramoops_probe(struct platform_device *pdev) +static int ramoops_probe(struct platform_device *pdev)  {  	struct device *dev = &pdev->dev;  	struct ramoops_platform_data *pdata = pdev->dev.platform_data; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84d..0306303be37 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,  	return 0;  } -static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, -					      u32 sig, int ecc_size) +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, +				    int ecc_size)  {  	int ret; @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)  	kfree(prz);  } -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, -							  size_t size, u32 sig, -							  int ecc_size) +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, +					       u32 sig, int ecc_size)  {  	struct persistent_ram_zone *prz;  	int ret = -ENOMEM; diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 7b0329468a5..28ce014b3ce 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -16,7 +16,7 @@  static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	unsigned int offset;  	struct buffer_head *bh;  	struct qnx4_inode_entry *de; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index dc597353db3..8798d065e40 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -117,7 +117,7 @@ static int qnx6_dir_longfilename(struct inode *inode,  static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *s = inode->i_sb;  	struct qnx6_sb_info *sbi = QNX6_SB(s);  	loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b6addf56048..57199a52a35 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, 
 		if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {  			/* we got a big endian fs */  			QNX6DEBUG((KERN_INFO "qnx6: fs got different" -					" endianess.\n")); +					" endianness.\n"));  			return bh;  		} else  			sbi->s_bytesex = BYTESEX_LE; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index d5378d02858..8d5b438cc18 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -202,7 +202,7 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,  					    unsigned long pgoff, unsigned long flags)  {  	unsigned long maxpages, lpages, nr, loop, ret; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct page **pages = NULL, **ptr, *page;  	loff_t isize; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eab8c09d380..c24f1e10b94 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -260,6 +260,7 @@ static struct file_system_type ramfs_fs_type = {  	.name		= "ramfs",  	.mount		= ramfs_mount,  	.kill_sb	= ramfs_kill_sb, +	.fs_flags	= FS_USERNS_MOUNT,  };  static struct file_system_type rootfs_fs_type = {  	.name		= "rootfs", diff --git a/fs/read_write.c b/fs/read_write.c index 1edaf099ddd..a698eff457f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,7 @@  #include <linux/syscalls.h>  #include <linux/pagemap.h>  #include <linux/splice.h> +#include <linux/compat.h>  #include "read_write.h"  #include <asm/uaccess.h> @@ -163,7 +164,7 @@ EXPORT_SYMBOL(no_llseek);  loff_t default_llseek(struct file *file, loff_t offset, int whence)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	loff_t retval;  	mutex_lock(&inode->i_mutex); @@ -247,6 +248,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)  	return retval;  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) +{ +	return sys_lseek(fd, offset, whence); +} +#endif +  #ifdef __ARCH_WANT_SYS_LLSEEK  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,  		unsigned long, offset_low, loff_t __user *, result, @@ -278,7 +286,6 @@ out_putf:  }  #endif -  /*   * rw_verify_area doesn't like huge counts. 
We limit   * them to something that fits in "int" so that others @@ -290,7 +297,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count  	loff_t pos;  	int retval = -EINVAL; -	inode = file->f_path.dentry->d_inode; +	inode = file_inode(file);  	if (unlikely((ssize_t) count < 0))  		return retval;  	pos = *ppos; @@ -901,8 +908,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,  	if (!(out.file->f_mode & FMODE_WRITE))  		goto fput_out;  	retval = -EINVAL; -	in_inode = in.file->f_path.dentry->d_inode; -	out_inode = out.file->f_path.dentry->d_inode; +	in_inode = file_inode(in.file); +	out_inode = file_inode(out.file);  	retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);  	if (retval < 0)  		goto fput_out; @@ -935,6 +942,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,  	if (retval > 0) {  		add_rchar(current, retval);  		add_wchar(current, retval); +		fsnotify_access(in.file); +		fsnotify_modify(out.file);  	}  	inc_syscr(current); diff --git a/fs/readdir.c b/fs/readdir.c index 5e69ef533b7..fee38e04fae 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -22,7 +22,7 @@  int vfs_readdir(struct file *file, filldir_t filler, void *buf)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	int res = -ENOTDIR;  	if (!file->f_op || !file->f_op->readdir)  		goto out; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 8375c922c0d..6165bd4784f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)  	return err;  } -static void reiserfs_vfs_truncate_file(struct inode *inode) +void reiserfs_vfs_truncate_file(struct inode *inode)  {  	mutex_lock(&(REISERFS_I(inode)->tailpack));  	reiserfs_truncate_file(inode, 1); @@ -268,7 +268,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t  							 * new current position before returning. */  				   )  { -	struct inode *inode = file->f_path.dentry->d_inode;	// Inode of the file that we are writing to. +	struct inode *inode = file_inode(file);	// Inode of the file that we are writing to.  	
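The reiserfs and sysv hunks that follow (like the proc setattr changes earlier in this diff) belong to the removal of vmtruncate(). Where a filesystem still needs the truncate, the replacement idiom validates the new size, updates it in the page cache, and then calls the filesystem's own truncate routine. A sketch of that idiom, mirroring the sysv_setattr() conversion below, with example_fs_truncate standing in for the per-fs hook:

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error = inode_change_ok(inode, attr);

		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = inode_newsize_ok(inode, attr->ia_size);
			if (error)
				return error;
			truncate_setsize(inode, attr->ia_size);
			example_fs_truncate(inode);	/* hypothetical per-fs hook */
		}

		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}

This also explains why the .truncate inode operation is being deleted from these filesystems: truncation now happens explicitly inside setattr rather than via a VFS callback.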
/* To simplify coding at this time, we store  	   locked pages in array for now */  	struct reiserfs_transaction_handle th; @@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {  };  const struct inode_operations reiserfs_file_inode_operations = { -	.truncate = reiserfs_vfs_truncate_file,  	.setattr = reiserfs_setattr,  	.setxattr = reiserfs_setxattr,  	.getxattr = reiserfs_getxattr, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d83736fbc26..ea5061fd4f3 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1603,10 +1603,10 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,  	if (parent && (maxlen < 5)) {  		*lenp = 5; -		return 255; +		return FILEID_INVALID;  	} else if (maxlen < 3) {  		*lenp = 3; -		return 255; +		return FILEID_INVALID;  	}  	data[0] = inode->i_ino; @@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,  		loff_t isize = i_size_read(inode);  		loff_t end = offset + iov_length(iov, nr_segs); -		if (end > isize) -			vmtruncate(inode, isize); +		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { +			truncate_setsize(inode, isize); +			reiserfs_vfs_truncate_file(inode); +		}  	}  	return ret; @@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)  	 */  	reiserfs_write_unlock_once(inode->i_sb, depth);  	if ((attr->ia_valid & ATTR_SIZE) && -	    attr->ia_size != i_size_read(inode)) -		error = vmtruncate(inode, attr->ia_size); +	    attr->ia_size != i_size_read(inode)) { +		error = inode_newsize_ok(inode, attr->ia_size); +		if (!error) { +			truncate_setsize(inode, attr->ia_size); +			reiserfs_vfs_truncate_file(inode); +		} +	}  	if (!error) {  		setattr_copy(inode, attr); diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 0c2185042d5..15cb5fe6b42 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -21,7 +21,7 @@   */  long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	unsigned int flags;  	int err = 0; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index e60e87035bb..9cc0740adff 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -281,7 +281,7 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)  	}  #if defined( REISERFS_USE_OIDMAPF )  	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { -		loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size; +		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;  		total_used += size / sizeof(reiserfs_oidinterval_d_t);  	}  #endif diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 33215f57ea0..157e474ab30 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct  								    *,  								    int count);  int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); +void reiserfs_vfs_truncate_file(struct inode *inode);  int reiserfs_commit_page(struct inode *inode, struct page *page,  			 unsigned from, unsigned to);  void reiserfs_flush_old_commits(struct super_block *); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index fd7c5f60b46..7e8d3a80bda 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -147,7 +147,7 @@ static const struct address_space_operations romfs_aops = {   */  static int romfs_readdir(struct file *filp, void *dirent, filldir_t 
filldir)  { -	struct inode *i = filp->f_dentry->d_inode; +	struct inode *i = file_inode(filp);  	struct romfs_inode ri;  	unsigned long offset, maxoff;  	int j, ino, nextfh; diff --git a/fs/select.c b/fs/select.c index 2ef72d96503..8c1c96c2706 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@  #include <linux/fs.h>  #include <linux/rcupdate.h>  #include <linux/hrtimer.h> +#include <linux/sched/rt.h>  #include <asm/uaccess.h> diff --git a/fs/seq_file.c b/fs/seq_file.c index 9d863fb501f..38bb59f3f2a 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read);   *	seq_lseek -	->llseek() method for sequential files.   *	@file: the file in question   *	@offset: new position - *	@origin: 0 for absolute, 1 for relative position + *	@whence: 0 for absolute, 1 for relative position   *   *	Ready-made ->f_op->llseek()   */ @@ -308,27 +308,27 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)  	mutex_lock(&m->lock);  	m->version = file->f_version;  	switch (whence) { -		case 1: -			offset += file->f_pos; -		case 0: -			if (offset < 0) -				break; -			retval = offset; -			if (offset != m->read_pos) { -				while ((retval=traverse(m, offset)) == -EAGAIN) -					; -				if (retval) { -					/* with extreme prejudice... */ -					file->f_pos = 0; -					m->read_pos = 0; -					m->version = 0; -					m->index = 0; -					m->count = 0; -				} else { -					m->read_pos = offset; -					retval = file->f_pos = offset; -				} +	case SEEK_CUR: +		offset += file->f_pos; +	case SEEK_SET: +		if (offset < 0) +			break; +		retval = offset; +		if (offset != m->read_pos) { +			while ((retval = traverse(m, offset)) == -EAGAIN) +				; +			if (retval) { +				/* with extreme prejudice... */ +				file->f_pos = 0; +				m->read_pos = 0; +				m->version = 0; +				m->index = 0; +				m->count = 0; +			} else { +				m->read_pos = offset; +				retval = file->f_pos = offset;  			} +		}  	}  	file->f_version = m->version;  	mutex_unlock(&m->lock); @@ -339,7 +339,7 @@ EXPORT_SYMBOL(seq_lseek);  /**   *	seq_release -	free the structures associated with sequential file.   *	@file: file in question - *	@inode: file->f_path.dentry->d_inode + *	@inode: its inode   *   *	Frees the structures associated with sequential file; can be used   *	as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/splice.c b/fs/splice.c index 8890604e3fc..718bd005638 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -569,7 +569,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,  	return res;  } -static ssize_t kernel_write(struct file *file, const char *buf, size_t count, +ssize_t kernel_write(struct file *file, const char *buf, size_t count,  			    loff_t pos)  {  	mm_segment_t old_fs; @@ -578,11 +578,12 @@ static ssize_t kernel_write(struct file *file, const char *buf, size_t count,  	old_fs = get_fs();  	set_fs(get_ds());  	/* The cast to a user pointer is valid due to the set_fs() */ -	res = vfs_write(file, (const char __user *)buf, count, &pos); +	res = vfs_write(file, (__force const char __user *)buf, count, &pos);  	set_fs(old_fs);  	return res;  } +EXPORT_SYMBOL(kernel_write);  ssize_t default_file_splice_read(struct file *in, loff_t *ppos,  				 struct pipe_inode_info *pipe, size_t len, @@ -696,8 +697,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,  		return -EINVAL;  	more = (sd->flags & SPLICE_F_MORE) ? 
MSG_MORE : 0; -	if (sd->len < sd->total_len) + +	if (sd->len < sd->total_len && pipe->nrbufs > 1)  		more |= MSG_SENDPAGE_NOTLAST; +  	return file->f_op->sendpage(file, buf->page, buf->offset,  				    sd->len, &pos, more);  } @@ -1168,7 +1171,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,  	 * randomly drop data for eg socket -> socket splicing. Use the  	 * piped splicing for that!  	 */ -	i_mode = in->f_path.dentry->d_inode->i_mode; +	i_mode = file_inode(in)->i_mode;  	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))  		return -EINVAL; diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index b381305c9a4..57dc70ebbb1 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -102,7 +102,7 @@ static int get_dir_index_using_offset(struct super_block *sb,  static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;  	u64 block = squashfs_i(inode)->start + msblk->directory_table;  	int offset = squashfs_i(inode)->offset, length, dir_count, size, diff --git a/fs/stat.c b/fs/stat.c index eae494630a3..04ce1ac20d2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -37,17 +37,17 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)  EXPORT_SYMBOL(generic_fillattr); -int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int vfs_getattr(struct path *path, struct kstat *stat)  { -	struct inode *inode = dentry->d_inode; +	struct inode *inode = path->dentry->d_inode;  	int retval; -	retval = security_inode_getattr(mnt, dentry); +	retval = security_inode_getattr(path->mnt, path->dentry);  	if (retval)  		return retval;  	if (inode->i_op->getattr) -		return inode->i_op->getattr(mnt, dentry, stat); +		return inode->i_op->getattr(path->mnt, path->dentry, stat);  	generic_fillattr(inode, stat);  	return 0; @@ -61,8 +61,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)  	int error = -EBADF;  	if (f.file) { -		error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry, -				    stat); +		error = vfs_getattr(&f.file->f_path, stat);  		fdput(f);  	}  	return error; @@ -74,7 +73,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,  {  	struct path path;  	int error = -EINVAL; -	int lookup_flags = 0; +	unsigned int lookup_flags = 0;  	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |  		      AT_EMPTY_PATH)) != 0) @@ -84,13 +83,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,  		lookup_flags |= LOOKUP_FOLLOW;  	if (flag & AT_EMPTY_PATH)  		lookup_flags |= LOOKUP_EMPTY; - +retry:  	error = user_path_at(dfd, filename, lookup_flags, &path);  	if (error)  		goto out; -	error = vfs_getattr(path.mnt, path.dentry, stat); +	error = vfs_getattr(&path, stat);  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  out:  	return error;  } @@ -296,11 +299,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,  	struct path path;  	int error;  	int empty = 0; +	unsigned int lookup_flags = LOOKUP_EMPTY;  	if (bufsiz <= 0)  		return -EINVAL; -	error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); +retry: +	error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);  	if (!error) {  		struct inode *inode = path.dentry->d_inode; @@ -314,6 +319,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,  			
}  		}  		path_put(&path); +		if (retry_estale(error, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		}  	}  	return error;  } diff --git a/fs/statfs.c b/fs/statfs.c index f8e832e6f0a..c219e733f55 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);  int user_statfs(const char __user *pathname, struct kstatfs *st)  {  	struct path path; -	int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); +	int error; +	unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (!error) {  		error = vfs_statfs(&path, st);  		path_put(&path); +		if (retry_estale(error, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		}  	}  	return error;  } diff --git a/fs/super.c b/fs/super.c index 12f12371216..7465d436420 100644 --- a/fs/super.c +++ b/fs/super.c @@ -447,14 +447,13 @@ struct super_block *sget(struct file_system_type *type,  			void *data)  {  	struct super_block *s = NULL; -	struct hlist_node *node;  	struct super_block *old;  	int err;  retry:  	spin_lock(&sb_lock);  	if (test) { -		hlist_for_each_entry(old, node, &type->fs_supers, s_instances) { +		hlist_for_each_entry(old, &type->fs_supers, s_instances) {  			if (!test(old, data))  				continue;  			if (!grab_super(old)) @@ -554,10 +553,9 @@ void iterate_supers_type(struct file_system_type *type,  	void (*f)(struct super_block *, void *), void *arg)  {  	struct super_block *sb, *p = NULL; -	struct hlist_node *node;  	spin_lock(&sb_lock); -	hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) { +	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {  		sb->s_count++;  		spin_unlock(&sb_lock); @@ -842,7 +840,7 @@ int get_anon_bdev(dev_t *p)  	else if (error)  		return -EAGAIN; -	if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) { +	if (dev == (1 << MINORBITS)) {  		spin_lock(&unnamed_dev_lock);  		ida_remove(&unnamed_dev_ida, dev);  		if (unnamed_dev_start > dev) diff --git a/fs/sync.c b/fs/sync.c index 14eefeb4463..2c5d6639a66 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -332,7 +332,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,  	if (!f.file)  		goto out; -	i_mode = f.file->f_path.dentry->d_inode->i_mode; +	i_mode = file_inode(f.file)->i_mode;  	ret = -ESPIPE;  	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&  			!S_ISLNK(i_mode)) diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 614b2b54488..15c68f9489a 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -70,7 +70,7 @@ static ssize_t  read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)  {  	struct bin_buffer *bb = file->private_data; -	int size = file->f_path.dentry->d_inode->i_size; +	int size = file_inode(file)->i_size;  	loff_t offs = *off;  	int count = min_t(size_t, bytes, PAGE_SIZE);  	char *temp; @@ -140,7 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,  		     size_t bytes, loff_t *off)  {  	struct bin_buffer *bb = file->private_data; -	int size = file->f_path.dentry->d_inode->i_size; +	int size = file_inode(file)->i_size;  	loff_t offs = *off;  	int count = min_t(size_t, bytes, PAGE_SIZE);  	char *temp; @@ -461,15 +461,14 @@ const struct file_operations bin_fops = {  void unmap_bin_file(struct sysfs_dirent *attr_sd)  {  	struct bin_buffer *bb; -	struct hlist_node *tmp;  	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)  		return;  	mutex_lock(&sysfs_bin_lock); -	hlist_for_each_entry(bb, tmp, 
&attr_sd->s_bin_attr.buffers, list) { -		struct inode *inode = bb->file->f_path.dentry->d_inode; +	hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) { +		struct inode *inode = file_inode(bb->file);  		unmap_mapping_range(inode->i_mapping, 0, 0, 1);  	} diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c66d5..aec3d5c98c9 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj,  }  EXPORT_SYMBOL_GPL(sysfs_unmerge_group); +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj:	The kobject containing the group. + * @group_name:	The name of the group. + * @target:	The target kobject of the symlink to create. + * @link_name:	The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, +			    struct kobject *target, const char *link_name) +{ +	struct sysfs_dirent *dir_sd; +	int error = 0; + +	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); +	if (!dir_sd) +		return -ENOENT; + +	error = sysfs_create_link_sd(dir_sd, target, link_name); +	sysfs_put(dir_sd); + +	return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj:	The kobject containing the group. + * @group_name:	The name of the group. + * @link_name:	The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, +				  const char *link_name) +{ +	struct sysfs_dirent *dir_sd; + +	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); +	if (dir_sd) { +		sysfs_hash_and_remove(dir_sd, NULL, link_name); +		sysfs_put(dir_sd); +	} +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);  EXPORT_SYMBOL_GPL(sysfs_create_group);  EXPORT_SYMBOL_GPL(sysfs_update_group); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db940a9be04..8d924b5ec73 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -10,7 +10,7 @@   * Please see Documentation/filesystems/sysfs.txt for more information.   */ -#define DEBUG  +#define DEBUG  #include <linux/fs.h>  #include <linux/mount.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb5624f5..8c940df97a5 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,26 +21,17 @@  #include "sysfs.h" -static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, -				const char *name, int warn) +static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, +				   struct kobject *target, +				   const char *name, int warn)  { -	struct sysfs_dirent *parent_sd = NULL;  	struct sysfs_dirent *target_sd = NULL;  	struct sysfs_dirent *sd = NULL;  	struct sysfs_addrm_cxt acxt;  	enum kobj_ns_type ns_type;  	int error; -	BUG_ON(!name); - -	if (!kobj) -		parent_sd = &sysfs_root; -	else -		parent_sd = kobj->sd; - -	error = -EFAULT; -	if (!parent_sd) -		goto out_put; +	BUG_ON(!name || !parent_sd);  	/* target->sd can go away beneath us but is protected with  	 * sysfs_assoc_lock.  Fetch target_sd from it. @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,  }  /** + *	sysfs_create_link_sd - create symlink to a given object. + *	@sd:		directory we're creating the link in. + *	@target:	object we're pointing to. + *	@name:		name of the symlink. 
+ */ +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +			 const char *name) +{ +	return sysfs_do_create_link_sd(sd, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, +				const char *name, int warn) +{ +	struct sysfs_dirent *parent_sd = NULL; + +	if (!kobj) +		parent_sd = &sysfs_root; +	else +		parent_sd = kobj->sd; + +	if (!parent_sd) +		return -EFAULT; + +	return sysfs_do_create_link_sd(parent_sd, target, name, warn); +} + +/**   *	sysfs_create_link - create symlink between two objects.   *	@kobj:	object whose directory we're creating the link in.   *	@target:	object we're pointing to. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c0932bbd..d1e4043eb0c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd);   * symlink.c   */  extern const struct inode_operations sysfs_symlink_inode_operations; +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +			 const char *name); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index a77c4215762..3799e8dac3e 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -68,7 +68,7 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)  static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)  {  	unsigned long pos = filp->f_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	unsigned offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 0a65939508e..9d4dc683179 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		error = vmtruncate(inode, attr->ia_size); +		error = inode_newsize_ok(inode, attr->ia_size);  		if (error)  			return error; +		truncate_setsize(inode, attr->ia_size); +		sysv_truncate(inode);  	}  	setattr_copy(inode, attr); @@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)  }  const struct inode_operations sysv_file_inode_operations = { -	.truncate	= sysv_truncate,  	.setattr	= sysv_setattr,  	.getattr	= sysv_getattr,  }; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 90b54b43878..c1a591a4725 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)  	return __block_write_begin(page, pos, len, get_block);  } +static void sysv_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) { +		truncate_pagecache(inode, to, inode->i_size); +		sysv_truncate(inode); +	} +} +  static int sysv_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,  	int ret;  	ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		sysv_write_failed(mapping, pos + len);  	return ret;  } diff --git a/fs/timerfd.c b/fs/timerfd.c index d03822bbf19..32b644f0369 100644 --- a/fs/timerfd.c +++ 
b/fs/timerfd.c @@ -22,6 +22,7 @@  #include <linux/anon_inodes.h>  #include <linux/timerfd.h>  #include <linux/syscalls.h> +#include <linux/compat.h>  #include <linux/rcupdate.h>  struct timerfd_ctx { @@ -278,21 +279,17 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)  	return ufd;  } -SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, -		const struct itimerspec __user *, utmr, -		struct itimerspec __user *, otmr) +static int do_timerfd_settime(int ufd, int flags,  +		const struct itimerspec *new, +		struct itimerspec *old)  {  	struct fd f;  	struct timerfd_ctx *ctx; -	struct itimerspec ktmr, kotmr;  	int ret; -	if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) -		return -EFAULT; -  	if ((flags & ~TFD_SETTIME_FLAGS) || -	    !timespec_valid(&ktmr.it_value) || -	    !timespec_valid(&ktmr.it_interval)) +	    !timespec_valid(&new->it_value) || +	    !timespec_valid(&new->it_interval))  		return -EINVAL;  	ret = timerfd_fget(ufd, &f); @@ -323,27 +320,23 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,  	if (ctx->expired && ctx->tintv.tv64)  		hrtimer_forward_now(&ctx->tmr, ctx->tintv); -	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); -	kotmr.it_interval = ktime_to_timespec(ctx->tintv); +	old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); +	old->it_interval = ktime_to_timespec(ctx->tintv);  	/*  	 * Re-program the timer to the new value ...  	 */ -	ret = timerfd_setup(ctx, flags, &ktmr); +	ret = timerfd_setup(ctx, flags, new);  	spin_unlock_irq(&ctx->wqh.lock);  	fdput(f); -	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) -		return -EFAULT; -  	return ret;  } -SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +static int do_timerfd_gettime(int ufd, struct itimerspec *t)  {  	struct fd f;  	struct timerfd_ctx *ctx; -	struct itimerspec kotmr;  	int ret = timerfd_fget(ufd, &f);  	if (ret)  		return ret; @@ -356,11 +349,65 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)  			hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;  		hrtimer_restart(&ctx->tmr);  	} -	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); -	kotmr.it_interval = ktime_to_timespec(ctx->tintv); +	t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); +	t->it_interval = ktime_to_timespec(ctx->tintv);  	spin_unlock_irq(&ctx->wqh.lock);  	fdput(f); +	return 0; +} +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, +		const struct itimerspec __user *, utmr, +		struct itimerspec __user *, otmr) +{ +	struct itimerspec new, old; +	int ret; + +	if (copy_from_user(&new, utmr, sizeof(new))) +		return -EFAULT; +	ret = do_timerfd_settime(ufd, flags, &new, &old); +	if (ret) +		return ret; +	if (otmr && copy_to_user(otmr, &old, sizeof(old))) +		return -EFAULT; + +	return ret; +} + +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ +	struct itimerspec kotmr; +	int ret = do_timerfd_gettime(ufd, &kotmr); +	if (ret) +		return ret;  	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? 
-EFAULT: 0;  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, +		const struct compat_itimerspec __user *, utmr, +		struct compat_itimerspec __user *, otmr) +{ +	struct itimerspec new, old; +	int ret; + +	if (get_compat_itimerspec(&new, utmr)) +		return -EFAULT; +	ret = do_timerfd_settime(ufd, flags, &new, &old); +	if (ret) +		return ret; +	if (otmr && put_compat_itimerspec(otmr, &old)) +		return -EFAULT; +	return ret; +} + +COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, +		struct compat_itimerspec __user *, otmr) +{ +	struct itimerspec kotmr; +	int ret = do_timerfd_gettime(ufd, &kotmr); +	if (ret) +		return ret; +	return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; +} +#endif diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 12817ffc734..7f60e900edf 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2459,7 +2459,7 @@ error_dump:  static inline int chance(unsigned int n, unsigned int out_of)  { -	return !!((random32() % out_of) + 1 <= n); +	return !!((prandom_u32() % out_of) + 1 <= n);  } @@ -2477,13 +2477,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)  			if (chance(1, 2)) {  				d->pc_delay = 1;  				/* Fail withing 1 minute */ -				delay = random32() % 60000; +				delay = prandom_u32() % 60000;  				d->pc_timeout = jiffies;  				d->pc_timeout += msecs_to_jiffies(delay);  				ubifs_warn("failing after %lums", delay);  			} else {  				d->pc_delay = 2; -				delay = random32() % 10000; +				delay = prandom_u32() % 10000;  				/* Fail within 10000 operations */  				d->pc_cnt_max = delay;  				ubifs_warn("failing after %lu calls", delay); @@ -2563,7 +2563,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,  	unsigned int from, to, ffs = chance(1, 2);  	unsigned char *p = (void *)buf; -	from = random32() % (len + 1); +	from = prandom_u32() % (len + 1);  	/* Corruption may only span one max. 
write unit */  	to = min(len, ALIGN(from, c->max_write_size)); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 8a574776a49..de08c92f2e2 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -352,7 +352,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)  	struct qstr nm;  	union ubifs_key key;  	struct ubifs_dent_node *dent; -	struct inode *dir = file->f_path.dentry->d_inode; +	struct inode *dir = file_inode(file);  	struct ubifs_info *c = dir->i_sb->s_fs_info;  	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5bc77817f38..f12189d2db1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1444,7 +1444,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,  				 struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct ubifs_info *c = inode->i_sb->s_fs_info;  	struct timespec now = ubifs_current_time(inode);  	struct ubifs_budget_req req = { .new_page = 1 }; @@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,  			ubifs_release_dirty_inode_budget(c, ui);  	} +	wait_for_stable_page(page);  	unlock_page(page);  	return 0; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 1a7e2d8bdbe..648b143606c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -147,7 +147,7 @@ out_unlock:  long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  {  	int flags, err; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	switch (cmd) {  	case FS_IOC_GETFLAGS: diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 9daaeef675d..4b826abb152 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -2007,28 +2007,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)  	if (!dbg_is_chk_gen(c))  		return 0; -	if (random32() & 3) +	if (prandom_u32() & 3)  		return 0;  	for (i = 0; i < c->lsave_cnt; i++)  		c->lsave[i] = c->main_first;  	list_for_each_entry(lprops, &c->empty_list, list) -		c->lsave[random32() % c->lsave_cnt] = lprops->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;  	list_for_each_entry(lprops, &c->freeable_list, list) -		c->lsave[random32() % c->lsave_cnt] = lprops->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;  	list_for_each_entry(lprops, &c->frdi_idx_list, list) -		c->lsave[random32() % c->lsave_cnt] = lprops->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;  	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];  	for (i = 0; i < heap->cnt; i++) -		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;  	heap = &c->lpt_heap[LPROPS_DIRTY - 1];  	for (i = 0; i < heap->cnt; i++) -		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;  	heap = &c->lpt_heap[LPROPS_FREE - 1];  	for (i = 0; i < heap->cnt; i++) -		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; +		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;  	return 1;  } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 769701ccb5c..ba32da3fe08 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -126,13 +126,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)  		else if (inum > o->inum)  			p = p->rb_right;  		else { -			if (o->dnext) { +			if (o->del) {  				
spin_unlock(&c->orphan_lock);  				dbg_gen("deleted twice ino %lu",  					(unsigned long)inum);  				return;  			} -			if (o->cnext) { +			if (o->cmt) { +				o->del = 1;  				o->dnext = c->orph_dnext;  				c->orph_dnext = o;  				spin_unlock(&c->orphan_lock); @@ -172,7 +173,9 @@ int ubifs_orphan_start_commit(struct ubifs_info *c)  	last = &c->orph_cnext;  	list_for_each_entry(orphan, &c->orph_new, new_list) {  		ubifs_assert(orphan->new); +		ubifs_assert(!orphan->cmt);  		orphan->new = 0; +		orphan->cmt = 1;  		*last = orphan;  		last = &orphan->cnext;  	} @@ -299,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic)  	cnext = c->orph_cnext;  	for (i = 0; i < cnt; i++) {  		orphan = cnext; +		ubifs_assert(orphan->cmt);  		orph->inos[i] = cpu_to_le64(orphan->inum); +		orphan->cmt = 0;  		cnext = orphan->cnext;  		orphan->cnext = NULL;  	} @@ -378,6 +383,7 @@ static int consolidate(struct ubifs_info *c)  		list_for_each_entry(orphan, &c->orph_list, list) {  			if (orphan->new)  				continue; +			orphan->cmt = 1;  			*last = orphan;  			last = &orphan->cnext;  			cnt += 1; @@ -442,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c)  		orphan = dnext;  		dnext = orphan->dnext;  		ubifs_assert(!orphan->new); +		ubifs_assert(orphan->del);  		rb_erase(&orphan->rb, &c->orph_tree);  		list_del(&orphan->list);  		c->tot_orphans -= 1; @@ -531,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)  	rb_link_node(&orphan->rb, parent, p);  	rb_insert_color(&orphan->rb, &c->orph_tree);  	list_add_tail(&orphan->list, &c->orph_list); +	orphan->del = 1;  	orphan->dnext = c->orph_dnext;  	c->orph_dnext = orphan;  	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 523bbad69c0..52a6559275c 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -683,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)  		c->ilebs[c->ileb_cnt++] = lnum;  		dbg_cmt("LEB %d", lnum);  	} -	if (dbg_is_chk_index(c) && !(random32() & 7)) +	if (dbg_is_chk_index(c) && !(prandom_u32() & 7))  		return -ENOSPC;  	return 0;  } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d133c276fe0..b2babce4d70 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -904,6 +904,8 @@ struct ubifs_budget_req {   * @dnext: next orphan to delete   * @inum: inode number   * @new: %1 => added since the last commit, otherwise %0 + * @cmt: %1 => commit pending, otherwise %0 + * @del: %1 => delete pending, otherwise %0   */  struct ubifs_orphan {  	struct rb_node rb; @@ -912,7 +914,9 @@ struct ubifs_orphan {  	struct ubifs_orphan *cnext;  	struct ubifs_orphan *dnext;  	ino_t inum; -	int new; +	unsigned new:1; +	unsigned cmt:1; +	unsigned del:1;  };  /** diff --git a/fs/udf/dir.c b/fs/udf/dir.c index eb8bfe2b89a..b3e93f5e17c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -186,7 +186,7 @@ out:  static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)  { -	struct inode *dir = filp->f_path.dentry->d_inode; +	struct inode *dir = file_inode(filp);  	int result;  	if (filp->f_pos == 0) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 77b5953eaac..29569dd0816 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -139,7 +139,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  {  	ssize_t retval;  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	int err, pos;  	size_t count = 
iocb->ki_left;  	struct udf_inode_info *iinfo = UDF_I(inode); @@ -178,7 +178,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	long old_block, new_block;  	int result = -EINVAL; @@ -204,7 +204,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		goto out;  	case UDF_RELOCATE_BLOCKS:  		if (!capable(CAP_SYS_ADMIN)) { -			result = -EACCES; +			result = -EPERM;  			goto out;  		}  		if (get_user(old_block, (long __user *)arg)) { diff --git a/fs/udf/inode.c b/fs/udf/inode.c index cbae1ed0b7c..7a12e48ad81 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -67,6 +67,74 @@ static void udf_update_extents(struct inode *,  			       struct extent_position *);  static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static void __udf_clear_extent_cache(struct inode *inode) +{ +	struct udf_inode_info *iinfo = UDF_I(inode); + +	if (iinfo->cached_extent.lstart != -1) { +		brelse(iinfo->cached_extent.epos.bh); +		iinfo->cached_extent.lstart = -1; +	} +} + +/* Invalidate extent cache */ +static void udf_clear_extent_cache(struct inode *inode) +{ +	struct udf_inode_info *iinfo = UDF_I(inode); + +	spin_lock(&iinfo->i_extent_cache_lock); +	__udf_clear_extent_cache(inode); +	spin_unlock(&iinfo->i_extent_cache_lock); +} + +/* Return contents of extent cache */ +static int udf_read_extent_cache(struct inode *inode, loff_t bcount, +				 loff_t *lbcount, struct extent_position *pos) +{ +	struct udf_inode_info *iinfo = UDF_I(inode); +	int ret = 0; + +	spin_lock(&iinfo->i_extent_cache_lock); +	if ((iinfo->cached_extent.lstart <= bcount) && +	    (iinfo->cached_extent.lstart != -1)) { +		/* Cache hit */ +		*lbcount = iinfo->cached_extent.lstart; +		memcpy(pos, &iinfo->cached_extent.epos, +		       sizeof(struct extent_position)); +		if (pos->bh) +			get_bh(pos->bh); +		ret = 1; +	} +	spin_unlock(&iinfo->i_extent_cache_lock); +	return ret; +} + +/* Add extent to extent cache */ +static void udf_update_extent_cache(struct inode *inode, loff_t estart, +				    struct extent_position *pos, int next_epos) +{ +	struct udf_inode_info *iinfo = UDF_I(inode); + +	spin_lock(&iinfo->i_extent_cache_lock); +	/* Invalidate previously cached extent */ +	__udf_clear_extent_cache(inode); +	if (pos->bh) +		get_bh(pos->bh); +	memcpy(&iinfo->cached_extent.epos, pos, +	       sizeof(struct extent_position)); +	iinfo->cached_extent.lstart = estart; +	if (next_epos) +		switch (iinfo->i_alloc_type) { +		case ICBTAG_FLAG_AD_SHORT: +			iinfo->cached_extent.epos.offset -= +			sizeof(struct short_ad); +			break; +		case ICBTAG_FLAG_AD_LONG: +			iinfo->cached_extent.epos.offset -= +			sizeof(struct long_ad); +		} +	spin_unlock(&iinfo->i_extent_cache_lock); +}  void udf_evict_inode(struct inode *inode)  { @@ -90,6 +158,7 @@ void udf_evict_inode(struct inode *inode)  	}  	kfree(iinfo->i_ext.i_data);  	iinfo->i_ext.i_data = NULL; +	udf_clear_extent_cache(inode);  	if (want_delete) {  		udf_free_inode(inode);  	} @@ -105,6 +174,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)  		truncate_pagecache(inode, to, isize);  		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {  			down_write(&iinfo->i_data_sem); +			udf_clear_extent_cache(inode);  			udf_truncate_extents(inode);  			up_write(&iinfo->i_data_sem);  		} @@ -372,7 +442,7 @@ static int udf_get_block(struct inode *inode, 
sector_t block,  		iinfo->i_next_alloc_goal++;  	} - +	udf_clear_extent_cache(inode);  	phys = inode_getblk(inode, block, &err, &new);  	if (!phys)  		goto abort; @@ -1171,6 +1241,7 @@ set_size:  	} else {  		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {  			down_write(&iinfo->i_data_sem); +			udf_clear_extent_cache(inode);  			memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,  			       0x00, bsize - newsize -  			       udf_file_entry_alloc_offset(inode)); @@ -1184,6 +1255,7 @@ set_size:  		if (err)  			return err;  		down_write(&iinfo->i_data_sem); +		udf_clear_extent_cache(inode);  		truncate_setsize(inode, newsize);  		udf_truncate_extents(inode);  		up_write(&iinfo->i_data_sem); @@ -2156,11 +2228,12 @@ int8_t inode_bmap(struct inode *inode, sector_t block,  	struct udf_inode_info *iinfo;  	iinfo = UDF_I(inode); -	pos->offset = 0; -	pos->block = iinfo->i_location; -	pos->bh = NULL; +	if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { +		pos->offset = 0; +		pos->block = iinfo->i_location; +		pos->bh = NULL; +	}  	*elen = 0; -  	do {  		etype = udf_next_aext(inode, pos, eloc, elen, 1);  		if (etype == -1) { @@ -2170,7 +2243,8 @@ int8_t inode_bmap(struct inode *inode, sector_t block,  		}  		lbcount += *elen;  	} while (lbcount <= bcount); - +	/* update extent cache */ +	udf_update_extent_cache(inode, lbcount - *elen, pos, 1);  	*offset = (bcount + *elen - lbcount) >> blocksize_bits;  	return etype; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 95fee278ab9..102c072c6bb 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1270,10 +1270,10 @@ static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,  	if (parent && (len < 5)) {  		*lenp = 5; -		return 255; +		return FILEID_INVALID;  	} else if (len < 3) {  		*lenp = 3; -		return 255; +		return FILEID_INVALID;  	}  	*lenp = 3; diff --git a/fs/udf/super.c b/fs/udf/super.c index d44fb568abe..bc5b30a819e 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -134,6 +134,8 @@ static struct inode *udf_alloc_inode(struct super_block *sb)  	ei->i_next_alloc_goal = 0;  	ei->i_strat4096 = 0;  	init_rwsem(&ei->i_data_sem); +	ei->cached_extent.lstart = -1; +	spin_lock_init(&ei->i_extent_cache_lock);  	return &ei->vfs_inode;  } @@ -307,7 +309,8 @@ static void udf_sb_free_partitions(struct super_block *sb)  {  	struct udf_sb_info *sbi = UDF_SB(sb);  	int i; - +	if (sbi->s_partmaps == NULL) +		return;  	for (i = 0; i < sbi->s_partitions; i++)  		udf_free_partition(&sbi->s_partmaps[i]);  	kfree(sbi->s_partmaps); @@ -1020,7 +1023,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)  	if (bitmap == NULL)  		return NULL; -	bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);  	bitmap->s_nr_groups = nr_groups;  	return bitmap;  } @@ -1078,8 +1080,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,  		if (!bitmap)  			return 1;  		map->s_uspace.s_bitmap = bitmap; -		bitmap->s_extLength = le32_to_cpu( -				phd->unallocSpaceBitmap.extLength);  		bitmap->s_extPosition = le32_to_cpu(  				phd->unallocSpaceBitmap.extPosition);  		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; @@ -1114,8 +1114,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,  		if (!bitmap)  			return 1;  		map->s_fspace.s_bitmap = bitmap; -		bitmap->s_extLength = le32_to_cpu( -				phd->freedSpaceBitmap.extLength);  		bitmap->s_extPosition = le32_to_cpu(  				phd->freedSpaceBitmap.extPosition);  		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; @@ -1865,6 +1863,8 @@ static void 
udf_open_lvid(struct super_block *sb)  	mark_buffer_dirty(bh);  	sbi->s_lvid_dirty = 0;  	mutex_unlock(&sbi->s_alloc_mutex); +	/* Make opening of filesystem visible on the media immediately */ +	sync_dirty_buffer(bh);  }  static void udf_close_lvid(struct super_block *sb) @@ -1905,6 +1905,8 @@ static void udf_close_lvid(struct super_block *sb)  	mark_buffer_dirty(bh);  	sbi->s_lvid_dirty = 0;  	mutex_unlock(&sbi->s_alloc_mutex); +	/* Make closing of filesystem visible on the media immediately */ +	sync_dirty_buffer(bh);  }  u64 lvid_get_unique_id(struct super_block *sb) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index bb8309dcd5c..b5cd8ed2aa1 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,6 +1,19 @@  #ifndef _UDF_I_H  #define _UDF_I_H +struct extent_position { +	struct buffer_head *bh; +	uint32_t offset; +	struct kernel_lb_addr block; +}; + +struct udf_ext_cache { +	/* Extent position */ +	struct extent_position epos; +	/* Start logical offset in bytes */ +	loff_t lstart; +}; +  /*   * The i_data_sem and i_mutex serve for protection of allocation information   * of a regular files and symlinks. This includes all extents belonging to @@ -35,6 +48,9 @@ struct udf_inode_info {  		__u8		*i_data;  	} i_ext;  	struct rw_semaphore	i_data_sem; +	struct udf_ext_cache cached_extent; +	/* Spinlock for protecting extent cache */ +	spinlock_t i_extent_cache_lock;  	struct inode vfs_inode;  }; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 5f027227f08..ed401e94aa8 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -80,10 +80,9 @@ struct udf_virtual_data {  };  struct udf_bitmap { -	__u32			s_extLength;  	__u32			s_extPosition; -	__u16			s_nr_groups; -	struct buffer_head 	**s_block_bitmap; +	int			s_nr_groups; +	struct buffer_head 	*s_block_bitmap[0];  };  struct udf_part_map { diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index de038da6f6b..be7dabbbcb4 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -113,11 +113,6 @@ struct ustr {  	uint8_t u_len;  }; -struct extent_position { -	struct buffer_head *bh; -	uint32_t offset; -	struct kernel_lb_addr block; -};  /* super.c */ diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index e4f10a40768..0bf6e16f8d7 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -29,7 +29,7 @@ config UFS_FS  config UFS_FS_WRITE  	bool "UFS file system write support (DANGEROUS)" -	depends on UFS_FS && EXPERIMENTAL +	depends on UFS_FS  	help  	  Say Y here if you want to try writing to UFS partitions. This is  	  experimental, so you should back up your UFS partitions beforehand. 
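The udf_bitmap hunks above drop the separately stored s_extLength, turn s_block_bitmap into a zero-length trailing array, and correspondingly remove the manual `bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);` fixup from udf_sb_alloc_bitmap: the pointer array now lives at the end of the same allocation, so no pointer arithmetic is needed. A minimal sketch of that single-allocation pattern follows; the helper name and the kzalloc sizing are illustrative assumptions, since the actual allocation code in udf_sb_alloc_bitmap is not shown in this excerpt.

/* assumes <linux/slab.h> and <linux/buffer_head.h>; fields mirror the hunk above */
struct udf_bitmap_sketch {
	__u32			s_extPosition;
	int			s_nr_groups;
	struct buffer_head	*s_block_bitmap[0];	/* storage follows the header */
};

static struct udf_bitmap_sketch *udf_alloc_bitmap_sketch(int nr_groups)
{
	struct udf_bitmap_sketch *bitmap;
	/* one allocation covers the header plus nr_groups trailing pointers */
	size_t size = sizeof(*bitmap) +
		      nr_groups * sizeof(struct buffer_head *);

	bitmap = kzalloc(size, GFP_KERNEL);
	if (!bitmap)
		return NULL;
	bitmap->s_nr_groups = nr_groups;
	/* bitmap->s_block_bitmap[i] is usable directly, no (bitmap + 1) cast */
	return bitmap;
}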
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index dbc90994715..3a75ca09c50 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -433,7 +433,7 @@ static int  ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)  {  	loff_t pos = filp->f_pos; -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct super_block *sb = inode->i_sb;  	unsigned int offset = pos & ~PAGE_CACHE_MASK;  	unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index eb6d0b7dc87..ff24e4449ec 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)  	return __block_write_begin(page, pos, len, ufs_getfrag_block);  } +static void ufs_write_failed(struct address_space *mapping, loff_t to) +{ +	struct inode *inode = mapping->host; + +	if (to > inode->i_size) +		truncate_pagecache(inode, to, inode->i_size); +} +  static int ufs_write_begin(struct file *file, struct address_space *mapping,  			loff_t pos, unsigned len, unsigned flags,  			struct page **pagep, void **fsdata) @@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,  	ret = block_write_begin(mapping, pos, len, flags, pagep,  				ufs_getfrag_block); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} +	if (unlikely(ret)) +		ufs_write_failed(mapping, pos + len);  	return ret;  } diff --git a/fs/utimes.c b/fs/utimes.c index bb0696a4173..f4fb7eca10e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,  		if (!(flags & AT_SYMLINK_NOFOLLOW))  			lookup_flags |= LOOKUP_FOLLOW; - +retry:  		error = user_path_at(dfd, filename, lookup_flags, &path);  		if (error)  			goto out;  		error = utimes_common(&path, times);  		path_put(&path); +		if (retry_estale(error, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		}  	}  out: diff --git a/fs/xattr.c b/fs/xattr.c index e21c119f4f9..3377dff1840 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,  {  	struct path path;  	int error; - -	error = user_path(pathname, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = mnt_want_write(path.mnt); @@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,  		mnt_drop_write(path.mnt);  	}  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,  {  	struct path path;  	int error; - -	error = user_lpath(pathname, &path); +	unsigned int lookup_flags = 0; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = mnt_want_write(path.mnt); @@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,  		mnt_drop_write(path.mnt);  	}  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,  {  	struct path path;  	ssize_t error; - -	error = user_path(pathname, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: +	error = 
user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = getxattr(path.dentry, name, value, size);  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,  {  	struct path path;  	ssize_t error; - -	error = user_lpath(pathname, &path); +	unsigned int lookup_flags = 0; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = getxattr(path.dentry, name, value, size);  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,  {  	struct path path;  	ssize_t error; - -	error = user_path(pathname, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = listxattr(path.dentry, list, size);  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,  {  	struct path path;  	ssize_t error; - -	error = user_lpath(pathname, &path); +	unsigned int lookup_flags = 0; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = listxattr(path.dentry, list, size);  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,  {  	struct path path;  	int error; - -	error = user_path(pathname, &path); +	unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = mnt_want_write(path.mnt); @@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,  		mnt_drop_write(path.mnt);  	}  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } @@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,  {  	struct path path;  	int error; - -	error = user_lpath(pathname, &path); +	unsigned int lookup_flags = 0; +retry: +	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);  	if (error)  		return error;  	error = mnt_want_write(path.mnt); @@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,  		mnt_drop_write(path.mnt);  	}  	path_put(&path); +	if (retry_estale(error, lookup_flags)) { +		lookup_flags |= LOOKUP_REVAL; +		goto retry; +	}  	return error;  } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5a7ffe54f5d..cc33aaf219f 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -70,8 +70,8 @@ config XFS_RT  	  If unsure, say N.  
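The setxattr/lsetxattr/getxattr/lgetxattr/listxattr/llistxattr/removexattr/lremovexattr hunks above (and the do_utimes hunk before them) all apply the same ESTALE retry idiom: resolve the path, run the operation, and if it failed with ESTALE on a lookup that has not yet revalidated, redo it once with LOOKUP_REVAL set. A condensed sketch of that control flow is below; user_path_at(), path_put(), and retry_estale() are the helpers used in the hunks, while the function name and the placeholder operation are hypothetical.

static long xattr_op_with_estale_retry(const char __user *pathname)
{
	struct path path;
	unsigned int lookup_flags = LOOKUP_FOLLOW;	/* 0 for the l* variants */
	long error;

retry:
	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
	if (error)
		return error;
	error = -EOPNOTSUPP;	/* stand-in for the real xattr operation */
	path_put(&path);
	/* retry once with revalidation if a stale file handle was hit */
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

retry_estale() only returns true when the error is -ESTALE and LOOKUP_REVAL is not yet set, so the loop terminates after at most one extra lookup.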
config XFS_DEBUG -	bool "XFS Debugging support (EXPERIMENTAL)" -	depends on XFS_FS && EXPERIMENTAL +	bool "XFS Debugging support" +	depends on XFS_FS  	help  	  Say Y here to get an XFS build with many debugging features,  	  including ASSERT checks, function wrappers around macros, diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055fe3ae..0ad23253e8b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist(  	targs.mp = mp;  	targs.agbp = agbp;  	targs.agno = args->agno; -	targs.mod = targs.minleft = targs.wasdel = targs.userdata = -		targs.minalignslop = 0;  	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;  	targs.type = XFS_ALLOCTYPE_THIS_AG;  	targs.pag = pag; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1..5f707e53717 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend(  	}  	if (ioend->io_iocb) { +		inode_dio_done(ioend->io_inode);  		if (ioend->io_isasync) {  			aio_complete(ioend->io_iocb, ioend->io_error ?  					ioend->io_error : ioend->io_result, 0);  		} -		inode_dio_done(ioend->io_inode);  	}  	mempool_free(ioend, xfs_ioend_pool); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf472532b3..888683844d9 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -300,9 +300,12 @@ xfs_attr_set_int(  	if (rsvd)  		args.trans->t_flags |= XFS_TRANS_RESERVE; -	if ((error = xfs_trans_reserve(args.trans, args.total, -			XFS_ATTRSET_LOG_RES(mp, args.total), 0, -			XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { +	error = xfs_trans_reserve(args.trans, args.total, +				  XFS_ATTRSETM_LOG_RES(mp) + +				  XFS_ATTRSETRT_LOG_RES(mp) * args.total, +				  0, XFS_TRANS_PERM_LOG_RES, +				  XFS_ATTRSET_LOG_COUNT); +	if (error) {  		xfs_trans_cancel(args.trans, 0);  		return(error);  	} diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d..b44af9211bd 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -147,7 +147,10 @@ xfs_bmap_local_to_extents(  	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */  	xfs_extlen_t	total,		/* total blocks needed by transaction */  	int		*logflagsp,	/* inode logging flags */ -	int		whichfork);	/* data or attr fork */ +	int		whichfork,	/* data or attr fork */ +	void		(*init_fn)(struct xfs_buf *bp, +				   struct xfs_inode *ip, +				   struct xfs_ifork *ifp));  /*   * Search the extents list for the inode, for the extent containing bno. @@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents(  }  /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Block initialisation functions for local to extent format conversion. + * As these get more complex, they will be moved to the relevant files, + * but for now they are too simple to worry about. + */ +STATIC void +xfs_bmap_local_to_extents_init_fn( +	struct xfs_buf		*bp, +	struct xfs_inode	*ip, +	struct xfs_ifork	*ifp) +{ +	bp->b_ops = &xfs_bmbt_buf_ops; +	memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +STATIC void +xfs_symlink_local_to_remote( +	struct xfs_buf		*bp, +	struct xfs_inode	*ip, +	struct xfs_ifork	*ifp) +{ +	/* remote symlink blocks are not verifiable until CRCs come along */ +	bp->b_ops = NULL; +	memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. 
Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. it would also require passing the transaction through to the init + * function.   */  STATIC int					/* error */  xfs_bmap_add_attrfork_local( @@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local(  	int			*flags)		/* inode logging flags */  {  	xfs_da_args_t		dargs;		/* args for dir/attr code */ -	int			error;		/* error return value */ -	xfs_mount_t		*mp;		/* mount structure pointer */  	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))  		return 0; +  	if (S_ISDIR(ip->i_d.di_mode)) { -		mp = ip->i_mount;  		memset(&dargs, 0, sizeof(dargs));  		dargs.dp = ip;  		dargs.firstblock = firstblock;  		dargs.flist = flist; -		dargs.total = mp->m_dirblkfsbs; +		dargs.total = ip->i_mount->m_dirblkfsbs;  		dargs.whichfork = XFS_DATA_FORK;  		dargs.trans = tp; -		error = xfs_dir2_sf_to_block(&dargs); -	} else -		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, -			XFS_DATA_FORK); -	return error; +		return xfs_dir2_sf_to_block(&dargs); +	} + +	if (S_ISLNK(ip->i_d.di_mode)) +		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, +						 flags, XFS_DATA_FORK, +						 xfs_symlink_local_to_remote); + +	return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, +					 XFS_DATA_FORK, +					 xfs_bmap_local_to_extents_init_fn);  }  /* @@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree(  		args.fsbno = *firstblock;  	}  	args.minlen = args.maxlen = args.prod = 1; -	args.total = args.minleft = args.alignment = args.mod = args.isfl = -		args.minalignslop = 0;  	args.wasdel = wasdel;  	*logflagsp = 0;  	if ((error = xfs_alloc_vextent(&args))) { @@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents(  	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */  	xfs_extlen_t	total,		/* total blocks needed by transaction */  	int		*logflagsp,	/* inode logging flags */ -	int		whichfork)	/* data or attr fork */ +	int		whichfork, +	void		(*init_fn)(struct xfs_buf *bp, +				   struct xfs_inode *ip, +				   struct xfs_ifork *ifp))  {  	int		error;		/* error return value */  	int		flags;		/* logging flags returned */ @@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents(  		xfs_buf_t	*bp;	/* buffer for extent block */  		xfs_bmbt_rec_host_t *ep;/* extent record pointer */ +		ASSERT((ifp->if_flags & +			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);  		memset(&args, 0, sizeof(args));  		args.tp = tp;  		args.mp = ip->i_mount;  		args.firstblock = *firstblock; -		ASSERT((ifp->if_flags & -			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);  		/*  		 * Allocate a block.  We know we need only one, since the  		 * file currently fits in an inode. @@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents(  			args.type = XFS_ALLOCTYPE_NEAR_BNO;  		}  		args.total = total; -		args.mod = args.minleft = args.alignment = args.wasdel = -			args.isfl = args.minalignslop = 0;  		args.minlen = args.maxlen = args.prod = 1; -		if ((error = xfs_alloc_vextent(&args))) +		error = xfs_alloc_vextent(&args); +		if (error)  			goto done; -		/* -		 * Can't fail, the space was reserved. -		 */ + +		/* Can't fail, the space was reserved. 
*/  		ASSERT(args.fsbno != NULLFSBLOCK);  		ASSERT(args.len == 1);  		*firstblock = args.fsbno;  		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); -		bp->b_ops = &xfs_bmbt_buf_ops; -		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + +		/* initialise the block and copy the data */ +		init_fn(bp, ip, ifp); + +		/* account for the change in fork size and log everything */  		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);  		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);  		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -4680,9 +4724,6 @@ __xfs_bmapi_allocate(  			return error;  	} -	if (bma->flags & XFS_BMAPI_STACK_SWITCH) -		bma->stack_switch = 1; -  	error = xfs_bmap_alloc(bma);  	if (error)  		return error; @@ -4922,8 +4963,32 @@ xfs_bmapi_write(  	XFS_STATS_INC(xs_blk_mapw);  	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { +		/* +		 * XXX (dgc): This assumes we are only called for inodes that +		 * contain content neutral data in local format. Anything that +		 * contains caller-specific data in local format that needs +		 * transformation to move to a block format needs to do the +		 * conversion to extent format itself. +		 * +		 * Directory data forks and attribute forks handle this +		 * themselves, but with the addition of metadata verifiers every +		 * data fork in local format now contains caller specific data +		 * and as such conversion through this function is likely to be +		 * broken. +		 * +		 * The only likely user of this branch is for remote symlinks, +		 * but we cannot overwrite the data fork contents of the symlink +		 * (EEXIST occurs higher up the stack) and so it will never go +		 * from local format to extent format here. Hence I don't think +		 * this branch is ever executed intentionally and we should +		 * consider removing it and asserting that xfs_bmapi_write() +		 * cannot be called directly on local format forks. i.e. callers +		 * are completely responsible for local to extent format +		 * conversion, not xfs_bmapi_write(). 
+		 */  		error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, -						  &bma.logflags, whichfork); +					&bma.logflags, whichfork, +					xfs_bmap_local_to_extents_init_fn);  		if (error)  			goto error0;  	} @@ -4956,6 +5021,9 @@ xfs_bmapi_write(  	bma.flist = flist;  	bma.firstblock = firstblock; +	if (flags & XFS_BMAPI_STACK_SWITCH) +		bma.stack_switch = 1; +  	while (bno < end && n < *nmap) {  		inhole = eof || bma.got.br_startoff > bno;  		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 26673a0b20e..4e8f0df82d0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -175,7 +175,7 @@ xfs_buf_get_maps(  	bp->b_map_count = map_count;  	if (map_count == 1) { -		bp->b_maps = &bp->b_map; +		bp->b_maps = &bp->__b_map;  		return 0;  	} @@ -193,7 +193,7 @@ static void  xfs_buf_free_maps(  	struct xfs_buf	*bp)  { -	if (bp->b_maps != &bp->b_map) { +	if (bp->b_maps != &bp->__b_map) {  		kmem_free(bp->b_maps);  		bp->b_maps = NULL;  	} @@ -377,8 +377,8 @@ xfs_buf_allocate_memory(  	}  use_alloc_page: -	start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; -	end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) +	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; +	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)  								>> PAGE_SHIFT;  	page_count = end - start;  	error = _xfs_buf_get_pages(bp, page_count, flags); @@ -487,6 +487,7 @@ _xfs_buf_find(  	struct rb_node		*parent;  	xfs_buf_t		*bp;  	xfs_daddr_t		blkno = map[0].bm_bn; +	xfs_daddr_t		eofs;  	int			numblks = 0;  	int			i; @@ -498,6 +499,23 @@ _xfs_buf_find(  	ASSERT(!(numbytes < (1 << btp->bt_sshift)));  	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); +	/* +	 * Corrupted block numbers can get through to here, unfortunately, so we +	 * have to check that the buffer falls within the filesystem bounds. +	 */ +	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); +	if (blkno >= eofs) { +		/* +		 * XXX (dgc): we should really be returning EFSCORRUPTED here, +		 * but none of the higher level infrastructure supports +		 * returning a specific error on buffer lookup failures. 
+		 */ +		xfs_alert(btp->bt_mount, +			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", +			  __func__, blkno, eofs); +		return NULL; +	} +  	/* get tree root */  	pag = xfs_perag_get(btp->bt_mount,  				xfs_daddr_to_agno(btp->bt_mount, blkno)); @@ -640,7 +658,7 @@ _xfs_buf_read(  	xfs_buf_flags_t		flags)  {  	ASSERT(!(flags & XBF_WRITE)); -	ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); +	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);  	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);  	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -933,8 +951,6 @@ xfs_buf_trylock(  	locked = down_trylock(&bp->b_sema) == 0;  	if (locked)  		XB_SET_OWNER(bp); -	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) -		xfs_log_force(bp->b_target->bt_mount, 0);  	trace_xfs_buf_trylock(bp, _RET_IP_);  	return locked; @@ -1487,6 +1503,8 @@ restart:  	while (!list_empty(&btp->bt_lru)) {  		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);  		if (atomic_read(&bp->b_hold) > 1) { +			trace_xfs_buf_wait_buftarg(bp, _RET_IP_); +			list_move_tail(&bp->b_lru, &btp->bt_lru);  			spin_unlock(&btp->bt_lru_lock);  			delay(100);  			goto restart; @@ -1709,7 +1727,7 @@ xfs_buf_cmp(  	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);  	xfs_daddr_t		diff; -	diff = ap->b_map.bm_bn - bp->b_map.bm_bn; +	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;  	if (diff < 0)  		return -1;  	if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 23f5642480b..433a12ed7b1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf {  	struct page		**b_pages;	/* array of page pointers */  	struct page		*b_page_array[XB_PAGES]; /* inline pages */  	struct xfs_buf_map	*b_maps;	/* compound buffer map */ -	struct xfs_buf_map	b_map;		/* inline compound buffer map */ +	struct xfs_buf_map	__b_map;	/* inline compound buffer map */  	int			b_map_count;  	int			b_io_length;	/* IO size in BBs */  	atomic_t		b_pin_count;	/* pin count */ @@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp);   * In future, uncached buffers will pass the block number directly to the io   * request function and hence these macros will go away at that point.   */ -#define XFS_BUF_ADDR(bp)		((bp)->b_map.bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno)	((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_ADDR(bp)		((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno)	((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))  static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)  { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index becf4a97efc..cf263476d6b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)  	return container_of(lip, struct xfs_buf_log_item, bli_item);  } - -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged.  This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. 
- */ -STATIC void -xfs_buf_item_log_debug( -	xfs_buf_log_item_t	*bip, -	uint			first, -	uint			last) -{ -	uint	x; -	uint	byte; -	uint	nbytes; -	uint	chunk_num; -	uint	word_num; -	uint	bit_num; -	uint	bit_set; -	uint	*wordp; - -	ASSERT(bip->bli_logged != NULL); -	byte = first; -	nbytes = last - first + 1; -	bfset(bip->bli_logged, first, nbytes); -	for (x = 0; x < nbytes; x++) { -		chunk_num = byte >> XFS_BLF_SHIFT; -		word_num = chunk_num >> BIT_TO_WORD_SHIFT; -		bit_num = chunk_num & (NBWORD - 1); -		wordp = &(bip->bli_format.blf_data_map[word_num]); -		bit_set = *wordp & (1 << bit_num); -		ASSERT(bit_set); -		byte++; -	} -} - -/* - * This function is called when we flush something into a buffer without - * logging it.  This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( -	xfs_buf_t	*bp, -	uint		first, -	uint		last) -{ -	xfs_buf_log_item_t	*bip = bp->b_fspriv; -	uint			nbytes; - -	if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) -		return; - -	ASSERT(bip->bli_logged != NULL); -	nbytes = last - first + 1; -	bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( -	xfs_buf_log_item_t	*bip) -{ -	char		*orig; -	char		*buffer; -	int		x; -	xfs_buf_t	*bp; - -	ASSERT(bip->bli_orig != NULL); -	ASSERT(bip->bli_logged != NULL); - -	bp = bip->bli_buf; -	ASSERT(bp->b_length > 0); -	ASSERT(bp->b_addr != NULL); -	orig = bip->bli_orig; -	buffer = bp->b_addr; -	for (x = 0; x < BBTOB(bp->b_length); x++) { -		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { -			xfs_emerg(bp->b_mount, -				"%s: bip %x buffer %x orig %x index %d", -				__func__, bip, bp, orig, x); -			ASSERT(0); -		} -	} -} -#else -#define		xfs_buf_item_log_debug(x,y,z) -#define		xfs_buf_item_log_check(x) -#endif -  STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);  /* @@ -237,7 +134,7 @@ xfs_buf_item_size(  		 * cancel flag in it.  		 */  		trace_xfs_buf_item_size_stale(bip); -		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); +		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);  		return bip->bli_format_count;  	} @@ -278,7 +175,7 @@ xfs_buf_item_format_segment(  	uint		buffer_offset;  	/* copy the flags across from the base format item */ -	blfp->blf_flags = bip->bli_format.blf_flags; +	blfp->blf_flags = bip->__bli_format.blf_flags;  	/*  	 * Base size is the actual size of the ondisk structure - it reflects @@ -287,6 +184,17 @@ xfs_buf_item_format_segment(  	 */  	base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +  			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + +	nvecs = 0; +	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); +	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { +		/* +		 * If the map is not be dirty in the transaction, mark +		 * the size as zero and do not advance the vector pointer. 
+		 */ +		goto out; +	} +  	vecp->i_addr = blfp;  	vecp->i_len = base_size;  	vecp->i_type = XLOG_REG_TYPE_BFORMAT; @@ -301,15 +209,13 @@ xfs_buf_item_format_segment(  		 */  		trace_xfs_buf_item_format_stale(bip);  		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); -		blfp->blf_size = nvecs; -		return vecp; +		goto out;  	}  	/*  	 * Fill in an iovec for each set of contiguous chunks.  	 */ -	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); -	ASSERT(first_bit != -1); +  	last_bit = first_bit;  	nbits = 1;  	for (;;) { @@ -371,7 +277,8 @@ xfs_buf_item_format_segment(  			nbits++;  		}  	} -	bip->bli_format.blf_size = nvecs; +out: +	blfp->blf_size = nvecs;  	return vecp;  } @@ -405,7 +312,7 @@ xfs_buf_item_format(  	if (bip->bli_flags & XFS_BLI_INODE_BUF) {  		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&  		      xfs_log_item_in_current_chkpt(lip))) -			bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; +			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;  		bip->bli_flags &= ~XFS_BLI_INODE_BUF;  	} @@ -419,7 +326,6 @@ xfs_buf_item_format(  	 * Check to make sure everything is consistent.  	 */  	trace_xfs_buf_item_format(bip); -	xfs_buf_item_log_check(bip);  }  /* @@ -485,7 +391,7 @@ xfs_buf_item_unpin(  		ASSERT(bip->bli_flags & XFS_BLI_STALE);  		ASSERT(xfs_buf_islocked(bp));  		ASSERT(XFS_BUF_ISSTALE(bp)); -		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); +		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);  		trace_xfs_buf_item_unpin_stale(bip); @@ -563,8 +469,18 @@ xfs_buf_item_push(  	if (xfs_buf_ispinned(bp))  		return XFS_ITEM_PINNED; -	if (!xfs_buf_trylock(bp)) +	if (!xfs_buf_trylock(bp)) { +		/* +		 * If we have just raced with a buffer being pinned and it has +		 * been marked stale, we could end up stalling until someone else +		 * issues a log force to unpin the stale buffer. Check for the +		 * race condition here so xfsaild recognizes the buffer is pinned +		 * and queues a log force to move it along. +		 */ +		if (xfs_buf_ispinned(bp)) +			return XFS_ITEM_PINNED;  		return XFS_ITEM_LOCKED; +	}  	ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -601,7 +517,7 @@ xfs_buf_item_unlock(  {  	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);  	struct xfs_buf		*bp = bip->bli_buf; -	int			aborted; +	int			aborted, clean, i;  	uint			hold;  	/* Clear the buffer's association with this transaction. */ @@ -631,7 +547,7 @@ xfs_buf_item_unlock(  	 */  	if (bip->bli_flags & XFS_BLI_STALE) {  		trace_xfs_buf_item_unlock_stale(bip); -		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); +		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);  		if (!aborted) {  			atomic_dec(&bip->bli_refcount);  			return; @@ -642,12 +558,27 @@ xfs_buf_item_unlock(  	/*  	 * If the buf item isn't tracking any data, free it, otherwise drop the -	 * reference we hold to it. +	 * reference we hold to it. If we are aborting the transaction, this may +	 * be the only reference to the buf item, so we free it anyway +	 * regardless of whether it is dirty or not. A dirty abort implies a +	 * shutdown, anyway.  	 
*/ -	if (xfs_bitmap_empty(bip->bli_format.blf_data_map, -			     bip->bli_format.blf_map_size)) +	clean = 1; +	for (i = 0; i < bip->bli_format_count; i++) { +		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, +			     bip->bli_formats[i].blf_map_size)) { +			clean = 0; +			break; +		} +	} +	if (clean)  		xfs_buf_item_relse(bp); -	else +	else if (aborted) { +		if (atomic_dec_and_test(&bip->bli_refcount)) { +			ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); +			xfs_buf_item_relse(bp); +		} +	} else  		atomic_dec(&bip->bli_refcount);  	if (!hold) @@ -716,7 +647,7 @@ xfs_buf_item_get_format(  	bip->bli_format_count = count;  	if (count == 1) { -		bip->bli_formats = &bip->bli_format; +		bip->bli_formats = &bip->__bli_format;  		return 0;  	} @@ -731,7 +662,7 @@ STATIC void  xfs_buf_item_free_format(  	struct xfs_buf_log_item	*bip)  { -	if (bip->bli_formats != &bip->bli_format) { +	if (bip->bli_formats != &bip->__bli_format) {  		kmem_free(bip->bli_formats);  		bip->bli_formats = NULL;  	} @@ -898,8 +829,6 @@ xfs_buf_item_log_segment(  		mask = (1 << end_bit) - 1;  		*wordp |= mask;  	} - -	xfs_buf_item_log_debug(bip, first, last);  }  /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 6850f49f4af..ee36c88ecfd 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -98,13 +98,9 @@ typedef struct xfs_buf_log_item {  	unsigned int		bli_flags;	/* misc flags */  	unsigned int		bli_recur;	/* lock recursion count */  	atomic_t		bli_refcount;	/* cnt of tp refs */ -#ifdef XFS_TRANS_DEBUG -	char			*bli_orig;	/* original buffer copy */ -	char			*bli_logged;	/* bytes logged (bitmap) */ -#endif  	int			bli_format_count;	/* count of headers */  	struct xfs_buf_log_format *bli_formats;	/* array of in-log header ptrs */ -	struct xfs_buf_log_format bli_format;	/* embedded in-log header */ +	struct xfs_buf_log_format __bli_format;	/* embedded in-log header */  } xfs_buf_log_item_t;  void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); @@ -117,16 +113,6 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,  void	xfs_buf_iodone_callbacks(struct xfs_buf *);  void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( -	struct xfs_buf *bp, -	uint	first, -	uint	last); -#else -#define	xfs_buf_item_flush_log_debug(bp, first, last) -#endif -  #endif	/* __KERNEL__ */  #endif	/* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d9..f852b082a08 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -78,14 +78,14 @@ xfs_swapext(  		goto out_put_tmp_file;  	} -	if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || -	    IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { +	if (IS_SWAPFILE(file_inode(f.file)) || +	    IS_SWAPFILE(file_inode(tmp.file))) {  		error = XFS_ERROR(EINVAL);  		goto out_put_tmp_file;  	} -	ip = XFS_I(f.file->f_path.dentry->d_inode); -	tip = XFS_I(tmp.file->f_path.dentry->d_inode); +	ip = XFS_I(file_inode(f.file)); +	tip = XFS_I(file_inode(tmp.file));  	if (ip->i_mount != tip->i_mount) {  		error = XFS_ERROR(EINVAL); @@ -246,10 +246,10 @@ xfs_swap_extents(  		goto out_unlock;  	} -	error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); +	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);  	if (error)  		goto out_unlock; -	truncate_pagecache_range(VFS_I(ip), 0, -1); +	truncate_pagecache_range(VFS_I(tip), 0, -1);  	/* Verify O_DIRECT for ftmp */  	if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 
7536faaa61e..12afe07a91d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -355,10 +355,12 @@ xfs_dir2_block_addname(  	/*  	 * If need to compact the leaf entries, do it now.  	 */ -	if (compact) +	if (compact) {  		xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,  				      &lfloghigh, &lfloglow); -	else if (btp->stale) { +		/* recalculate blp post-compaction */ +		blp = xfs_dir2_block_leaf_p(btp); +	} else if (btp->stale) {  		/*  		 * Set leaf logging boundaries to impossible state.  		 * For the no-stale case they're set explicitly. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf5294c9..8025eb23ad7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -612,15 +612,9 @@ xfs_qm_dqread(  	if (flags & XFS_QMOPT_DQALLOC) {  		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);  		error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), -				XFS_WRITE_LOG_RES(mp) + -				/* -				 * Round the chunklen up to the next multiple -				 * of 128 (buf log item chunk size)). -				 */ -				BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, -				0, -				XFS_TRANS_PERM_LOG_RES, -				XFS_WRITE_LOG_COUNT); +					  XFS_QM_DQALLOC_LOG_RES(mp), 0, +					  XFS_TRANS_PERM_LOG_RES, +					  XFS_WRITE_LOG_COUNT);  		if (error)  			goto error1;  		cancelflags = XFS_TRANS_RELEASE_LOG_RES; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a83611849ce..c585bc64639 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -48,7 +48,7 @@ static int xfs_fileid_length(int fileid_type)  	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:  		return 6;  	} -	return 255; /* invalid */ +	return FILEID_INVALID;  }  STATIC int @@ -90,7 +90,7 @@ xfs_fs_encode_fh(  	len = xfs_fileid_length(fileid_type);  	if (*max_len < len) {  		*max_len = len; -		return 255; +		return FILEID_INVALID;  	}  	*max_len = len; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 67284edb84d..f03bf1a456f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -811,7 +811,7 @@ xfs_file_fallocate(  	loff_t		offset,  	loff_t		len)  { -	struct inode	*inode = file->f_path.dentry->d_inode; +	struct inode	*inode = file_inode(file);  	long		error;  	loff_t		new_size = 0;  	xfs_flock64_t	bf; @@ -912,7 +912,7 @@ xfs_file_readdir(  	void		*dirent,  	filldir_t	filldir)  { -	struct inode	*inode = filp->f_path.dentry->d_inode; +	struct inode	*inode = file_inode(filp);  	xfs_inode_t	*ip = XFS_I(inode);  	int		error;  	size_t		bufsize; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeedc549..2866b8c78b7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -709,8 +709,8 @@ xfs_fs_log_dummy(  	int		error;  	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); -	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, -					XFS_DEFAULT_LOG_COUNT); +	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, +				  XFS_DEFAULT_LOG_COUNT);  	if (error) {  		xfs_trans_cancel(tp, 0);  		return error; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412eab8..515bf71ce01 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc(  		  (args.agbno < be32_to_cpu(agi->agi_length)))) {  		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);  		args.type = XFS_ALLOCTYPE_THIS_BNO; -		args.mod = args.total = args.wasdel = args.isfl = -			args.userdata = args.minalignslop = 0;  		args.prod = 1;  		/* @@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc(  		 * Allocate a fixed-size extent of inodes.  		 
*/  		args.type = XFS_ALLOCTYPE_NEAR_BNO; -		args.mod = args.total = args.wasdel = args.isfl = -			args.userdata = args.minalignslop = 0;  		args.prod = 1;  		/*  		 * Allow space for the inode btree to split. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dcb821..4f201656d2d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2379,9 +2379,6 @@ xfs_iflush_fork(  	char			*cp;  	xfs_ifork_t		*ifp;  	xfs_mount_t		*mp; -#ifdef XFS_TRANS_DEBUG -	int			first; -#endif  	static const short	brootflag[2] =  		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };  	static const short	dataflag[2] = @@ -2724,9 +2721,6 @@ xfs_iflush_int(  	xfs_inode_log_item_t	*iip;  	xfs_dinode_t		*dip;  	xfs_mount_t		*mp; -#ifdef XFS_TRANS_DEBUG -	int			first; -#endif  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));  	ASSERT(xfs_isiflocked(ip)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6ea4fa..237e7f6f2ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip)  static inline void xfs_ifunlock(struct xfs_inode *ip)  {  	xfs_iflags_clear(ip, XFS_IFLOCK); +	smp_mb();  	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);  } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47d9d8..f034bd1652f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -269,17 +269,6 @@ xfs_inode_item_format(  		} else {  			ASSERT(!(iip->ili_fields &  				 XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG -			if (iip->ili_root_size > 0) { -				ASSERT(iip->ili_root_size == -				       ip->i_df.if_broot_bytes); -				ASSERT(memcmp(iip->ili_orig_root, -					    ip->i_df.if_broot, -					    iip->ili_root_size) == 0); -			} else { -				ASSERT(ip->i_df.if_broot_bytes == 0); -			} -#endif  			iip->ili_fields &= ~XFS_ILOG_DBROOT;  		}  		break; @@ -678,11 +667,6 @@ void  xfs_inode_item_destroy(  	xfs_inode_t	*ip)  { -#ifdef XFS_TRANS_DEBUG -	if (ip->i_itemp->ili_root_size != 0) { -		kmem_free(ip->i_itemp->ili_orig_root); -	} -#endif  	kmem_zone_free(xfs_ili_zone, ip->i_itemp);  } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0b263..779812fb3d8 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item {  						      data exts */  	struct xfs_bmbt_rec	*ili_aextents_buf; /* array of logged  						      attr exts */ -#ifdef XFS_TRANS_DEBUG -	int			ili_root_size; -	char			*ili_orig_root; -#endif  	xfs_inode_log_format_t	ili_format;	   /* logged structure */  } xfs_inode_log_item_t; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1c3ef88a26..d681e34c295 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -80,7 +80,7 @@ xfs_find_handle(  		f = fdget(hreq->fd);  		if (!f.file)  			return -EBADF; -		inode = f.file->f_path.dentry->d_inode; +		inode = file_inode(f.file);  	} else {  		error = user_lpath((const char __user *)hreq->path, &path);  		if (error) @@ -168,7 +168,7 @@ xfs_handle_to_dentry(  	/*  	 * Only allow handle opens under a directory.  	 
*/ -	if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) +	if (!S_ISDIR(file_inode(parfilp)->i_mode))  		return ERR_PTR(-ENOTDIR);  	if (hlen != sizeof(xfs_handle_t)) @@ -1334,7 +1334,7 @@ xfs_file_ioctl(  	unsigned int		cmd,  	unsigned long		p)  { -	struct inode		*inode = filp->f_path.dentry->d_inode; +	struct inode		*inode = file_inode(filp);  	struct xfs_inode	*ip = XFS_I(inode);  	struct xfs_mount	*mp = ip->i_mount;  	void			__user *arg = (void __user *)p; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1244274a567..63b8fc43215 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -530,7 +530,7 @@ xfs_file_compat_ioctl(  	unsigned		cmd,  	unsigned long		p)  { -	struct inode		*inode = filp->f_path.dentry->d_inode; +	struct inode		*inode = file_inode(filp);  	struct xfs_inode	*ip = XFS_I(inode);  	struct xfs_mount	*mp = ip->i_mount;  	void			__user *arg = (void __user *)p; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a6..912d83d8860 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate(  }  /* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size.  Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC int +xfs_iomap_eof_prealloc_initial_size( +	struct xfs_mount	*mp, +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	xfs_bmbt_irec_t		*imap, +	int			nimaps) +{ +	xfs_fileoff_t   start_fsb; +	int		imaps = 1; +	int		error; + +	ASSERT(nimaps >= imaps); + +	/* if we are using a specific prealloc size, return now */ +	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) +		return 0; + +	/* +	 * As we write multiple pages, the offset will always align to the +	 * start of a page and hence point to a hole at EOF. i.e. if the size is +	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) +	 * will return FSB 1. Hence if there are blocks in the file, we want to +	 * point to the block prior to the EOF block and not the hole that maps +	 * directly at @offset. +	 */ +	start_fsb = XFS_B_TO_FSB(mp, offset); +	if (start_fsb) +		start_fsb--; +	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); +	if (error) +		return 0; + +	ASSERT(imaps == 1); +	if (imap[0].br_startblock == HOLESTARTBLOCK) +		return 0; +	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) +		return imap[0].br_blockcount; +	return XFS_B_TO_FSB(mp, offset); +} + +/*   * If we don't have a user specified preallocation size, dynamically increase   * the preallocation size as the size of the file grows. Cap the maximum size   * at a single extent or less if the filesystem is near full. 
The closer the filesystem is to full, the smaller the maximum prealloc. */ @@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate(  STATIC xfs_fsblock_t  xfs_iomap_prealloc_size(  	struct xfs_mount	*mp, -	struct xfs_inode	*ip) +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	struct xfs_bmbt_irec	*imap, +	int			nimaps)  {  	xfs_fsblock_t		alloc_blocks = 0; -	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { +	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, +							   imap, nimaps); +	if (alloc_blocks > 0) {  		int shift = 0;  		int64_t freesp; -		/* -		 * rounddown_pow_of_two() returns an undefined result -		 * if we pass in alloc_blocks = 0. Hence the "+ 1" to -		 * ensure we always pass in a non-zero value. -		 */ -		alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;  		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,  					rounddown_pow_of_two(alloc_blocks)); @@ -351,6 +406,15 @@ xfs_iomap_prealloc_size(  		}  		if (shift)  			alloc_blocks >>= shift; + +		/* +		 * If we are still trying to allocate more space than is +		 * available, squash the prealloc hard. This can happen if we +		 * have a large file on a small filesystem and the above +		 * lowspace thresholds are smaller than MAXEXTLEN. +		 */ +		while (alloc_blocks >= freesp) +			alloc_blocks >>= 4;  	}  	if (alloc_blocks < mp->m_writeio_blocks) @@ -390,7 +454,6 @@ xfs_iomap_write_delay(  	extsz = xfs_get_extsz_hint(ip);  	offset_fsb = XFS_B_TO_FSBT(mp, offset); -  	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,  				imap, XFS_WRITE_IMAPS, &prealloc);  	if (error) @@ -398,7 +461,10 @@ xfs_iomap_write_delay(  retry:  	if (prealloc) { -		xfs_fsblock_t	alloc_blocks = xfs_iomap_prealloc_size(mp, ip); +		xfs_fsblock_t	alloc_blocks; + +		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, +						       XFS_WRITE_IMAPS);  		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));  		ioalign = XFS_B_TO_FSBT(mp, aligned_offset); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d52ab5..eec226f78a4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -120,7 +120,7 @@ xlog_verify_iclog(  	struct xlog		*log,  	struct xlog_in_core	*iclog,  	int			count, -	boolean_t		syncing); +	bool                    syncing);  STATIC void  xlog_verify_tail_lsn(  	struct xlog		*log, @@ -1737,7 +1737,7 @@ xlog_sync(  	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);  	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); -	xlog_verify_iclog(log, iclog, count, B_TRUE); +	xlog_verify_iclog(log, iclog, count, true);  	/* account for log which doesn't start at block #0 */  	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -3611,7 +3611,7 @@ xlog_verify_iclog(  	struct xlog		*log,  	struct xlog_in_core	*iclog,  	int			count, -	boolean_t		syncing) +	bool                    syncing)  {  	xlog_op_header_t	*ophead;  	xlog_in_core_t		*icptr; @@ -3659,7 +3659,7 @@ xlog_verify_iclog(  		/* clientid is only 1 byte */  		field_offset = (__psint_t)  			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); -		if (syncing == B_FALSE || (field_offset & 0x1ff)) { +		if (!syncing || (field_offset & 0x1ff)) {  			clientid = ophead->oh_clientid;  		} else {  			idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3682,7 +3682,7 @@ xlog_verify_iclog(  		/* check length */  		field_offset = (__psint_t)  			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); -		if (syncing == B_FALSE || (field_offset & 0x1ff)) { +		if (!syncing || (field_offset & 0x1ff)) {  			op_len = be32_to_cpu(ophead->oh_len);  		} else {  			idx = BTOBBT((__psint_t)&ophead->oh_len - (__psint_t)iclog->ic_datap);
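The xfs_iomap.c hunks above implement the dynamic EOF preallocation heuristic: take the extent preceding the write offset as the basis (or disable preallocation over a hole), round it down to a power of two capped at MAXEXTLEN, shift it down as free space runs low, and finally squash it in powers of four while it still exceeds the free block count. Below is a minimal, self-contained userspace sketch of that arithmetic, not the kernel code: all names are illustrative, and the single "lowspace" threshold stands in for the m_low_space[] ladder the real xfs_iomap_prealloc_size() consults.

#include <stdint.h>
#include <stdio.h>

/* rounddown_pow_of_two() equivalent for nonzero x */
static uint64_t rounddown_pow2(uint64_t x)
{
	uint64_t r = 1;

	while (r <= x / 2)
		r <<= 1;
	return r;
}

/*
 * Model of the sizing logic above: "base" stands in for the result of
 * xfs_iomap_eof_prealloc_initial_size() (0 over a hole), "max_extent"
 * for MAXEXTLEN, and "lowspace" for the point where throttling starts.
 */
static uint64_t model_prealloc_size(uint64_t base, uint64_t max_extent,
				    uint64_t freesp, uint64_t lowspace)
{
	uint64_t alloc_blocks;
	int shift = 0;

	if (base == 0)
		return 0;	/* hole at EOF: preallocation disabled */

	alloc_blocks = rounddown_pow2(base);
	if (alloc_blocks > max_extent)
		alloc_blocks = max_extent;

	/* shift the prealloc down as free space approaches the limit */
	while (shift < 4 && freesp < (lowspace >> shift))
		shift++;
	alloc_blocks >>= shift;

	/* squash hard if we still want more than is actually free */
	while (alloc_blocks && alloc_blocks >= freesp)
		alloc_blocks >>= 4;

	return alloc_blocks;
}

int main(void)
{
	/* 300000-block extent behind EOF, 1M blocks free: prints 262144 */
	printf("%llu\n", (unsigned long long)
	       model_prealloc_size(300000, (1ULL << 21) - 1, 1000000, 4096));
	return 0;
}

Throttling by right shifts keeps the preallocation a power-of-two number of blocks and lets it decay to nothing as the filesystem approaches full, mirroring the behaviour the squash loop in the hunk above guarantees.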
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96fcbb85ff8..d1dba7ce75a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1442,9 +1442,8 @@ xlog_recover_find_tid(  	xlog_tid_t		tid)  {  	xlog_recover_t		*trans; -	struct hlist_node	*n; -	hlist_for_each_entry(trans, n, head, r_list) { +	hlist_for_each_entry(trans, head, r_list) {  		if (trans->r_log_tid == tid)  			return trans;  	} diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff1..3806088a8f7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify(  		return;  	}  	/* quietly fail */ -	xfs_buf_ioerror(bp, EFSCORRUPTED); +	xfs_buf_ioerror(bp, EWRONGFS);  }  static void @@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags(  		return 0;  	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); -	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, -				      XFS_DEFAULT_LOG_COUNT); +	error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), +				  0, 0, XFS_DEFAULT_LOG_COUNT);  	if (error) {  		xfs_trans_cancel(tp, 0);  		xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp)  		return 0;  	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); -	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, -					XFS_DEFAULT_LOG_COUNT); +	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, +				  XFS_DEFAULT_LOG_COUNT);  	if (error) {  		xfs_trans_cancel(tp, 0);  		return error; @@ -1945,8 +1945,8 @@ xfs_mount_log_sb(  			 XFS_SB_VERSIONNUM));  	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); -	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, -				XFS_DEFAULT_LOG_COUNT); +	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, +				  XFS_DEFAULT_LOG_COUNT);  	if (error) {  		xfs_trans_cancel(tp, 0);  		return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314507e..bc907061d39 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations {  	uint	tr_addafork;	/* cvt inode to attributed trans */  	uint	tr_writeid;	/* write setuid/setgid file */  	uint	tr_attrinval;	/* attr fork buffer invalidation */ -	uint	tr_attrset;	/* set/create an attribute */ +	uint	tr_attrsetm;	/* set/create an attribute at mount time */ +	uint	tr_attrsetrt;	/* set/create an attribute at runtime */  	uint	tr_attrrm;	/* remove an attribute */  	uint	tr_clearagi;	/* clear bad agi unlinked ino bucket */  	uint	tr_growrtalloc;	/* grow realtime allocations */  	uint	tr_growrtzero;	/* grow realtime zeroing */  	uint	tr_growrtfree;	/* grow realtime freeing */ +	uint	tr_qm_sbchange;	/* change quota flags */ +	uint	tr_qm_setqlim;	/* adjust quota limits */ +	uint	tr_qm_dqalloc;	/* allocate quota on disk */ +	uint	tr_qm_quotaoff;	/* turn quota off */ +	uint	tr_qm_equotaoff;/* end of turn quota off */ +	uint	tr_sb;		/* modify superblock */  } xfs_trans_reservations_t;  #ifndef __KERNEL__ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff476315..e5b5cf97378 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes(  	int		error;  	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); -	if ((error = xfs_trans_reserve(tp, 0, -				      mp->m_sb.sb_sectsize + 128, 0, -				      0, -				      XFS_DEFAULT_LOG_COUNT))) { +	error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), +				  0, 0, XFS_DEFAULT_LOG_COUNT); +	if (error) {  		xfs_trans_cancel(tp, 0);  		
return error;  	} diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115bf14..2d02eac1c9a 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -146,7 +146,7 @@ xfs_qm_newmount(  			 * inode goes inactive and wants to free blocks,  			 * or via xfs_log_mount_finish.  			 */ -			*needquotamount = B_TRUE; +			*needquotamount = true;  			*quotaflags = mp->m_qflags;  			mp->m_qflags = 0;  		} diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5f53e75409b..cf9a34051e0 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -408,10 +408,10 @@ xfs_qm_scall_getqstat(  {  	struct xfs_quotainfo	*q = mp->m_quotainfo;  	struct xfs_inode	*uip, *gip; -	boolean_t		tempuqip, tempgqip; +	bool                    tempuqip, tempgqip;  	uip = gip = NULL; -	tempuqip = tempgqip = B_FALSE; +	tempuqip = tempgqip = false;  	memset(out, 0, sizeof(fs_quota_stat_t));  	out->qs_version = FS_QSTAT_VERSION; @@ -434,12 +434,12 @@ xfs_qm_scall_getqstat(  	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {  		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,  					0, 0, &uip) == 0) -			tempuqip = B_TRUE; +			tempuqip = true;  	}  	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {  		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,  					0, 0, &gip) == 0) -			tempgqip = B_TRUE; +			tempgqip = true;  	}  	if (uip) {  		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; @@ -490,8 +490,9 @@ xfs_qm_scall_setqlim(  		return 0;  	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); -	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, -				      0, 0, XFS_DEFAULT_LOG_COUNT))) { +	error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), +				  0, 0, XFS_DEFAULT_LOG_COUNT); +	if (error) {  		xfs_trans_cancel(tp, 0);  		return (error);  	} @@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end(  	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); -	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, -				      0, 0, XFS_DEFAULT_LOG_COUNT))) { +	error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), +				  0, 0, XFS_DEFAULT_LOG_COUNT); +	if (error) {  		xfs_trans_cancel(tp, 0);  		return (error);  	} @@ -671,14 +673,10 @@ xfs_qm_log_quotaoff(  	uint			oldsbqflag=0;  	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); -	if ((error = xfs_trans_reserve(tp, 0, -				      sizeof(xfs_qoff_logitem_t) * 2 + -				      mp->m_sb.sb_sectsize + 128, -				      0, -				      0, -				      XFS_DEFAULT_LOG_COUNT))) { +	error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), +				  0, 0, XFS_DEFAULT_LOG_COUNT); +	if (error)  		goto error0; -	}  	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);  	xfs_trans_log_quotaoff_item(tp, qoffi); @@ -784,11 +782,11 @@ xfs_qm_scall_getquota(  	     (XFS_IS_OQUOTA_ENFORCED(mp) &&  			(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&  	    dst->d_id != 0) { -		if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && +		if ((dst->d_bcount > dst->d_blk_softlimit) &&  		    (dst->d_blk_softlimit > 0)) {  			ASSERT(dst->d_btimer != 0);  		} -		if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && +		if ((dst->d_icount > dst->d_ino_softlimit) &&  		    (dst->d_ino_softlimit > 0)) {  			ASSERT(dst->d_itimer != 0);  		} diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b2627..c407121873b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -139,9 +139,9 @@ static const match_table_t tokens = {  STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) 
+suffix_kstrtoint(char *s, unsigned int base, int *res)  { -	int	last, shift_left_factor = 0; +	int	last, shift_left_factor = 0, _res;  	char	*value = s;  	last = strlen(value) - 1; @@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base)  		value[last] = '\0';  	} -	return simple_strtoul((const char *)s, endp, base) << shift_left_factor; +	if (kstrtoint(s, base, &_res)) +		return -EINVAL; +	*res = _res << shift_left_factor; +	return 0;  }  /* @@ -174,7 +177,7 @@ xfs_parseargs(  	char			*options)  {  	struct super_block	*sb = mp->m_super; -	char			*this_char, *value, *eov; +	char			*this_char, *value;  	int			dsunit = 0;  	int			dswidth = 0;  	int			iosize = 0; @@ -230,14 +233,16 @@ xfs_parseargs(  					this_char);  				return EINVAL;  			} -			mp->m_logbufs = simple_strtoul(value, &eov, 10); +			if (kstrtoint(value, 10, &mp->m_logbufs)) +				return EINVAL;  		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {  			if (!value || !*value) {  				xfs_warn(mp, "%s option requires an argument",  					this_char);  				return EINVAL;  			} -			mp->m_logbsize = suffix_strtoul(value, &eov, 10); +			if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) +				return EINVAL;  		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {  			if (!value || !*value) {  				xfs_warn(mp, "%s option requires an argument", @@ -266,7 +271,8 @@ xfs_parseargs(  					this_char);  				return EINVAL;  			} -			iosize = simple_strtoul(value, &eov, 10); +			if (kstrtoint(value, 10, &iosize)) +				return EINVAL;  			iosizelog = ffs(iosize) - 1;  		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {  			if (!value || !*value) { @@ -274,7 +280,8 @@ xfs_parseargs(  					this_char);  				return EINVAL;  			} -			iosize = suffix_strtoul(value, &eov, 10); +			if (suffix_kstrtoint(value, 10, &iosize)) +				return EINVAL;  			iosizelog = ffs(iosize) - 1;  		} else if (!strcmp(this_char, MNTOPT_GRPID) ||  			   !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -296,14 +303,16 @@ xfs_parseargs(  					this_char);  				return EINVAL;  			} -			dsunit = simple_strtoul(value, &eov, 10); +			if (kstrtoint(value, 10, &dsunit)) +				return EINVAL;  		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {  			if (!value || !*value) {  				xfs_warn(mp, "%s option requires an argument",  					this_char);  				return EINVAL;  			} -			dswidth = simple_strtoul(value, &eov, 10); +			if (kstrtoint(value, 10, &dswidth)) +				return EINVAL;  		} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {  			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;  		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85a..16a812977ea 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);  DEFINE_BUF_EVENT(xfs_buf_item_iodone);  DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);  DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);  DEFINE_BUF_EVENT(xfs_trans_read_buf_io);  DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520a767..2fd7c1ff1d2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -37,14 +37,45 @@  #include "xfs_extent_busy.h"  #include "xfs_bmap.h"  #include "xfs_quota.h" +#include "xfs_qm.h"  #include "xfs_trans_priv.h"  #include "xfs_trans_space.h"  #include "xfs_inode_item.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h"  #include "xfs_trace.h"  kmem_zone_t	*xfs_trans_zone;  kmem_zone_t	*xfs_log_item_desc_zone; +/* + * A buffer has a format structure 
overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer.  Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ +	return round_up(sizeof(struct xlog_op_header) + +			sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate the transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction.  size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( +	uint		nbufs, +	uint		size) +{ +	return nbufs * (size + xfs_buf_log_overhead()); +}  /*   * Various log reservation values. @@ -85,18 +116,15 @@ xfs_calc_write_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + -		     2 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 2) + -		     128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + -			    XFS_ALLOCFREE_LOG_COUNT(mp, 2))), -		    (2 * mp->m_sb.sb_sectsize + -		     2 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 2) + -		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + -		     128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), -		    (4 * mp->m_sb.sb_sectsize + -		     4 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 4) + -		     128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + -		     128 * 5 + -		     XFS_ALLOCFREE_LOG_RES(mp, 1) + -		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + -			    XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), +				      XFS_FSB_TO_B(mp, 1)) + +		    xfs_calc_buf_res(5, 0) + +		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				     XFS_FSB_TO_B(mp, 1)) + +		    xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + +				     mp->m_in_maxlevels, 0)));  }  /* @@ -148,14 +175,12 @@ xfs_calc_rename_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((4 * mp->m_sb.sb_inodesize + -		     2 * XFS_DIROP_LOG_RES(mp) + -		     128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), -		    (3 * mp->m_sb.sb_sectsize + -		     3 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 3) + -		     128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); +		MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(2 * 
XFS_DIROP_LOG_COUNT(mp), +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -175,15 +200,12 @@ xfs_calc_link_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     mp->m_sb.sb_inodesize + -		     XFS_DIROP_LOG_RES(mp) + -		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))), -		    (mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 1) + -		     128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -203,15 +225,12 @@ xfs_calc_remove_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     mp->m_sb.sb_inodesize + -		     XFS_DIROP_LOG_RES(mp) + -		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))), -		    (2 * mp->m_sb.sb_sectsize + -		     2 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 2) + -		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -233,18 +252,18 @@ xfs_calc_symlink_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     mp->m_sb.sb_inodesize + -		     XFS_FSB_TO_B(mp, 1) + -		     XFS_DIROP_LOG_RES(mp) + -		     1024 + -		     128 * (4 + XFS_DIROP_LOG_COUNT(mp))), -		    (2 * mp->m_sb.sb_sectsize + -		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + -		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + -		     XFS_ALLOCFREE_LOG_RES(mp, 1) + -		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + -			    XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(1, 1024)), +		    (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(mp->m_in_maxlevels, +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -267,18 +286,19 @@ xfs_calc_create_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     mp->m_sb.sb_inodesize + +		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + +		     (uint)XFS_FSB_TO_B(mp, 1) + +		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), +				      XFS_FSB_TO_B(mp, 1))), +		    (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +  		     mp->m_sb.sb_sectsize + -		     XFS_FSB_TO_B(mp, 1) + -		     XFS_DIROP_LOG_RES(mp) + -		     128 * (3 + XFS_DIROP_LOG_COUNT(mp))), -		    (3 * mp->m_sb.sb_sectsize + -		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + -		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + -		     XFS_ALLOCFREE_LOG_RES(mp, 1) + -		     128 * 
(2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + -			    XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +		     xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(mp->m_in_maxlevels, +				      XFS_FSB_TO_B(mp, 1)) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -306,16 +326,16 @@ xfs_calc_ifree_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		mp->m_sb.sb_inodesize + -		mp->m_sb.sb_sectsize + -		mp->m_sb.sb_sectsize + -		XFS_FSB_TO_B(mp, 1) + +		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +  		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),  		    XFS_INODE_CLUSTER_SIZE(mp)) + -		128 * 5 + -		XFS_ALLOCFREE_LOG_RES(mp, 1) + -		128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + -		       XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +		xfs_calc_buf_res(1, 0) + +		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + +				 mp->m_in_maxlevels, 0) + +		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				 XFS_FSB_TO_B(mp, 1));  }  /* @@ -343,9 +363,9 @@ STATIC uint  xfs_calc_growdata_reservation(  	struct xfs_mount	*mp)  { -	return mp->m_sb.sb_sectsize * 3 + -		XFS_ALLOCFREE_LOG_RES(mp, 1) + -		128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				 XFS_FSB_TO_B(mp, 1));  }  /* @@ -362,12 +382,12 @@ STATIC uint  xfs_calc_growrtalloc_reservation(  	struct xfs_mount	*mp)  { -	return 2 * mp->m_sb.sb_sectsize + -		XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + -		mp->m_sb.sb_inodesize + -		XFS_ALLOCFREE_LOG_RES(mp, 1) + -		128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + -		       XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), +				 XFS_FSB_TO_B(mp, 1)) + +		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				 XFS_FSB_TO_B(mp, 1));  }  /* @@ -379,7 +399,7 @@ STATIC uint  xfs_calc_growrtzero_reservation(  	struct xfs_mount	*mp)  { -	return mp->m_sb.sb_blocksize + 128; +	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);  }  /* @@ -396,11 +416,10 @@ STATIC uint  xfs_calc_growrtfree_reservation(  	struct xfs_mount	*mp)  { -	return mp->m_sb.sb_sectsize + -		2 * mp->m_sb.sb_inodesize + -		mp->m_sb.sb_blocksize + -		mp->m_rsumsize + -		128 * 5; +	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + +		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + +		xfs_calc_buf_res(1, mp->m_rsumsize);  }  /* @@ -411,7 +430,7 @@ STATIC uint  xfs_calc_swrite_reservation(  	struct xfs_mount	*mp)  { -	return mp->m_sb.sb_inodesize + 128; +	return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);  }  /* @@ -421,7 +440,7 @@ xfs_calc_swrite_reservation(  STATIC uint  xfs_calc_writeid_reservation(xfs_mount_t *mp)  { -	return mp->m_sb.sb_inodesize + 128; +	return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);  }  /* @@ -437,13 +456,13 @@ xfs_calc_addafork_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		mp->m_sb.sb_inodesize + -		mp->m_sb.sb_sectsize * 2 + -		mp->m_dirblksize + -		XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + -		XFS_ALLOCFREE_LOG_RES(mp, 1) + -		128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + -		       XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		
xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(1, mp->m_dirblksize) + +		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, +				 XFS_FSB_TO_B(mp, 1)) + +		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), +				 XFS_FSB_TO_B(mp, 1));  }  /* @@ -461,35 +480,51 @@ STATIC uint  xfs_calc_attrinval_reservation(  	struct xfs_mount	*mp)  { -	return MAX((mp->m_sb.sb_inodesize + -		    XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + -		    128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), -		   (4 * mp->m_sb.sb_sectsize + -		    4 * mp->m_sb.sb_sectsize + -		    mp->m_sb.sb_sectsize + -		    XFS_ALLOCFREE_LOG_RES(mp, 4) + -		    128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); +	return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), +				     XFS_FSB_TO_B(mp, 1))), +		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + +		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), +				     XFS_FSB_TO_B(mp, 1))));  }  /* - * Setting an attribute. + * Setting an attribute at mount time.   *	the inode getting the attribute   *	the superblock for allocations   *	the agfs extents are allocated from   *	the attribute btree * max depth   *	the inode allocation btree   * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. + * the calculation is done partially at mount time and partially at runtime (see + * below).  */  STATIC uint -xfs_calc_attrset_reservation( +xfs_calc_attrsetm_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		mp->m_sb.sb_inodesize + -		mp->m_sb.sb_sectsize + -		XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + -		128 * (2 + XFS_DA_NODE_MAXDEPTH); +		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * 	the superblock for allocations: sector size + *	the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate the space unit for + * one block so that the caller could figure out the total space according + * to the attribute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). 
+ */ +STATIC uint +xfs_calc_attrsetrt_reservation( +	struct xfs_mount	*mp) +{ +	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + +		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), +				 XFS_FSB_TO_B(mp, 1));  }  /* @@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation(  	struct xfs_mount	*mp)  {  	return XFS_DQUOT_LOGRES(mp) + -		MAX((mp->m_sb.sb_inodesize + -		     XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + -		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + -		     128 * (1 + XFS_DA_NODE_MAXDEPTH + -			    XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), -		    (2 * mp->m_sb.sb_sectsize + -		     2 * mp->m_sb.sb_sectsize + -		     mp->m_sb.sb_sectsize + -		     XFS_ALLOCFREE_LOG_RES(mp, 2) + -		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + +		     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, +				      XFS_FSB_TO_B(mp, 1)) + +		     (uint)XFS_FSB_TO_B(mp, +					XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + +		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), +		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + +		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), +				      XFS_FSB_TO_B(mp, 1))));  }  /* @@ -527,7 +561,78 @@ STATIC uint  xfs_calc_clear_agi_bucket_reservation(  	struct xfs_mount	*mp)  { -	return mp->m_sb.sb_sectsize + 128; +	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + *	the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( +	struct xfs_mount	*mp) +{ +	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( +	struct xfs_mount	*mp) +{ +	return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + *	the write transaction log space: XFS_WRITE_LOG_RES(mp) + *	the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( +	struct xfs_mount	*mp) +{ +	return XFS_WRITE_LOG_RES(mp) + +		xfs_calc_buf_res(1, +			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + *    the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( +	struct xfs_mount	*mp) +{ +	return sizeof(struct xfs_qoff_logitem) * 2 + +		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( +	struct xfs_mount	*mp) +{ +	return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. 
+ *     the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( +	struct xfs_mount	*mp) +{ +	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);  }  /* @@ -555,12 +660,19 @@ xfs_trans_init(  	resp->tr_writeid = xfs_calc_writeid_reservation(mp);  	resp->tr_addafork = xfs_calc_addafork_reservation(mp);  	resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); -	resp->tr_attrset = xfs_calc_attrset_reservation(mp); +	resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); +	resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);  	resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);  	resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);  	resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);  	resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);  	resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); +	resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); +	resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); +	resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); +	resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); +	resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); +	resp->tr_sb = xfs_calc_sb_reservation(mp);  }  /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601abd7..cd29f617102 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -252,17 +252,19 @@ struct xfs_log_item_desc {   * as long as SWRITE logs the entire inode core   */  #define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite) -#define	XFS_WRITEID_LOG_RES(mp)	((mp)->m_reservations.tr_swrite) +#define	XFS_WRITEID_LOG_RES(mp)		((mp)->m_reservations.tr_swrite)  #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)  #define	XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval) -#define	XFS_ATTRSET_LOG_RES(mp, ext)	\ -	((mp)->m_reservations.tr_attrset + \ -	 (ext * (mp)->m_sb.sb_sectsize) + \ -	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ -	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) -#define	XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm) +#define	XFS_ATTRSETM_LOG_RES(mp)	((mp)->m_reservations.tr_attrsetm) +#define XFS_ATTRSETRT_LOG_RES(mp)	((mp)->m_reservations.tr_attrsetrt) +#define	XFS_ATTRRM_LOG_RES(mp)		((mp)->m_reservations.tr_attrrm)  #define	XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi) - +#define XFS_QM_SBCHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_qm_sbchange) +#define XFS_QM_SETQLIM_LOG_RES(mp)	((mp)->m_reservations.tr_qm_setqlim) +#define XFS_QM_DQALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_qm_dqalloc) +#define XFS_QM_QUOTAOFF_LOG_RES(mp)	((mp)->m_reservations.tr_qm_quotaoff) +#define XFS_QM_QUOTAOFF_END_LOG_RES(mp)	((mp)->m_reservations.tr_qm_equotaoff) +#define XFS_SB_LOG_RES(mp)		((mp)->m_reservations.tr_sb)  /*   * Various log count values. diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee66133..0eda7254305 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -55,20 +55,6 @@ xfs_ail_check(  		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); -#ifdef XFS_TRANS_DEBUG -	/* -	 * Walk the list checking lsn ordering, and that every entry has the -	 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it -	 * when specifically debugging the transaction subsystem. 
-	 */ -	prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); -	list_for_each_entry(lip, &ailp->xa_ail, li_ail) { -		if (&prev_lip->li_ail != &ailp->xa_ail) -			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); -		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); -		prev_lip = lip; -	} -#endif /* XFS_TRANS_DEBUG */  }  #else /* !DEBUG */  #define	xfs_ail_check(a,l) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4fc17d479d4..3edf5dbee00 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -93,7 +93,7 @@ _xfs_trans_bjoin(  	xfs_buf_item_init(bp, tp->t_mountp);  	bip = bp->b_fspriv;  	ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); -	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); +	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));  	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));  	if (reset_recur)  		bip->bli_recur = 0; @@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,  	bip = bp->b_fspriv;  	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);  	ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); -	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); +	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));  	ASSERT(atomic_read(&bip->bli_refcount) > 0);  	trace_xfs_trans_brelse(bip); @@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t	*tp,  	ASSERT(bp->b_transp == tp);  	ASSERT(bip != NULL);  	ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); -	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); +	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));  	ASSERT(atomic_read(&bip->bli_refcount) > 0);  	bip->bli_flags |= XFS_BLI_HOLD; @@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t	*tp,  	ASSERT(bp->b_transp == tp);  	ASSERT(bip != NULL);  	ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); -	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); +	ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));  	ASSERT(atomic_read(&bip->bli_refcount) > 0);  	ASSERT(bip->bli_flags & XFS_BLI_HOLD); @@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,  		bip->bli_flags &= ~XFS_BLI_STALE;  		ASSERT(XFS_BUF_ISSTALE(bp));  		XFS_BUF_UNSTALE(bp); -		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; +		bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;  	}  	tp->t_flags |= XFS_TRANS_DIRTY; @@ -643,6 +643,7 @@ xfs_trans_binval(  	xfs_buf_t	*bp)  {  	xfs_buf_log_item_t	*bip = bp->b_fspriv; +	int			i;  	ASSERT(bp->b_transp == tp);  	ASSERT(bip != NULL); @@ -657,8 +658,8 @@ xfs_trans_binval(  		 */  		ASSERT(XFS_BUF_ISSTALE(bp));  		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); -		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); -		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); +		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); +		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);  		ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);  		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);  		return; @@ -668,10 +669,12 @@ xfs_trans_binval(  	bip->bli_flags |= XFS_BLI_STALE;  	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); -	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; -	bip->bli_format.blf_flags |= XFS_BLF_CANCEL; -	memset((char *)(bip->bli_format.blf_data_map), 0, -	      (bip->bli_format.blf_map_size * sizeof(uint))); +	bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; +	bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; +	for (i = 0; i < bip->bli_format_count; i++) { +		memset(bip->bli_formats[i].blf_data_map, 0, +		       (bip->bli_formats[i].blf_map_size * sizeof(uint))); +	}  	bip->bli_item.li_desc->lid_flags |= 
XFS_LID_DIRTY;  	tp->t_flags |= XFS_TRANS_DIRTY;  } @@ -775,5 +778,5 @@ xfs_trans_dquot_buf(  	       type == XFS_BLF_GDQUOT_BUF);  	ASSERT(atomic_read(&bip->bli_refcount) > 0); -	bip->bli_format.blf_flags |= type; +	bip->__bli_format.blf_flags |= type;  } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54f309..642c2d6e1db 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots(  	int			i, j;  	xfs_dquot_t		*dqp;  	xfs_dqtrx_t		*qtrx, *qa; -	boolean_t		locked; +	bool                    locked;  	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))  		return; @@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots(  			 * about the number of blocks used field, or deltas.  			 * Also we don't bother to zero the fields.  			 */ -			locked = B_FALSE; +			locked = false;  			if (qtrx->qt_blk_res) {  				xfs_dqlock(dqp); -				locked = B_TRUE; +				locked = true;  				dqp->q_res_bcount -=  					(xfs_qcnt_t)qtrx->qt_blk_res;  			}  			if (qtrx->qt_ino_res) {  				if (!locked) {  					xfs_dqlock(dqp); -					locked = B_TRUE; +					locked = true;  				}  				dqp->q_res_icount -=  					(xfs_qcnt_t)qtrx->qt_ino_res; @@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots(  			if (qtrx->qt_rtblk_res) {  				if (!locked) {  					xfs_dqlock(dqp); -					locked = B_TRUE; +					locked = true;  				}  				dqp->q_res_rtbcount -=  					(xfs_qcnt_t)qtrx->qt_rtblk_res; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20d5f5..ac6d567704d 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -33,14 +33,6 @@  #include "xfs_inode_item.h"  #include "xfs_trace.h" -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( -	xfs_inode_t	*ip); -#else -#define	xfs_trans_inode_broot_debug(ip) -#endif -  /*   * Add a locked inode to the transaction.   * @@ -67,8 +59,6 @@ xfs_trans_ijoin(  	 * Get a log_item_desc to point at the new item.  	 */  	xfs_trans_add_item(tp, &iip->ili_item); - -	xfs_trans_inode_broot_debug(ip);  }  /* @@ -135,34 +125,3 @@ xfs_trans_log_inode(  	flags |= ip->i_itemp->ili_last_fields;  	ip->i_itemp->ili_fields |= flags;  } - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. 
- */ -STATIC void -xfs_trans_inode_broot_debug( -	xfs_inode_t	*ip) -{ -	xfs_inode_log_item_t	*iip; - -	ASSERT(ip->i_itemp != NULL); -	iip = ip->i_itemp; -	if (iip->ili_root_size != 0) { -		ASSERT(iip->ili_orig_root != NULL); -		kmem_free(iip->ili_orig_root); -		iip->ili_root_size = 0; -		iip->ili_orig_root = NULL; -	} -	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { -		ASSERT((ip->i_df.if_broot != NULL) && -		       (ip->i_df.if_broot_bytes > 0)); -		iip->ili_root_size = ip->i_df.if_broot_bytes; -		iip->ili_orig_root = -			(char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); -		memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), -		      iip->ili_root_size); -	} -} -#endif diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874f4c2..61ba1cfa974 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -32,7 +32,6 @@ typedef unsigned int		__uint32_t;  typedef signed long long int	__int64_t;  typedef unsigned long long int	__uint64_t; -typedef enum { B_FALSE,B_TRUE }	boolean_t;  typedef __uint32_t		prid_t;		/* project ID */  typedef __uint32_t		inst_t;		/* an instruction */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index d95f565a390..77ad74834ba 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -725,7 +725,7 @@ xfs_create(  	int			error;  	xfs_bmap_free_t		free_list;  	xfs_fsblock_t		first_block; -	boolean_t		unlock_dp_on_error = B_FALSE; +	bool                    unlock_dp_on_error = false;  	uint			cancel_flags;  	int			committed;  	prid_t			prid; @@ -794,7 +794,7 @@ xfs_create(  	}  	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); -	unlock_dp_on_error = B_TRUE; +	unlock_dp_on_error = true;  	xfs_bmap_init(&free_list, &first_block); @@ -830,7 +830,7 @@ xfs_create(  	 * error path.  	 */  	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); -	unlock_dp_on_error = B_FALSE; +	unlock_dp_on_error = false;  	error = xfs_dir_createname(tp, dp, name, ip->i_ino,  					&first_block, &free_list, resblks ? @@ -1367,7 +1367,7 @@ xfs_symlink(  	int			pathlen;  	xfs_bmap_free_t		free_list;  	xfs_fsblock_t		first_block; -	boolean_t		unlock_dp_on_error = B_FALSE; +	bool                    unlock_dp_on_error = false;  	uint			cancel_flags;  	int			committed;  	xfs_fileoff_t		first_fsb; @@ -1438,7 +1438,7 @@ xfs_symlink(  	}  	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); -	unlock_dp_on_error = B_TRUE; +	unlock_dp_on_error = true;  	/*  	 * Check whether the directory allows new symlinks or not. @@ -1484,7 +1484,7 @@ xfs_symlink(  	 * error path.  	 */  	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); -	unlock_dp_on_error = B_FALSE; +	unlock_dp_on_error = false;  	/*  	 * Also attach the dquot(s) to it, if applicable.
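One pattern runs through all of the reservation hunks above: every open-coded "size + 128 * count" term becomes a call to xfs_calc_buf_res(), which charges each logged buffer its format/header overhead rounded up to a 128-byte multiple. The following is a minimal standalone sketch of that shape; the 40-byte overhead is an assumed placeholder for sizeof(struct xlog_op_header) + sizeof(struct xfs_buf_log_format), whose real values depend on the kernel build.

#include <stdio.h>

/* assumed stand-in for the op-header + buf-log-format struct sizes */
#define ASSUMED_FMT_OVERHEAD	40u

/* round_up(x, 128), preserving the historical "+128 per item" term */
static unsigned int round_up128(unsigned int x)
{
	return (x + 127u) & ~127u;
}

/* shape of xfs_calc_buf_res(): nbufs logged items of "size" bytes each */
static unsigned int calc_buf_res(unsigned int nbufs, unsigned int size)
{
	return nbufs * (size + round_up128(ASSUMED_FMT_OVERHEAD));
}

int main(void)
{
	/* e.g. three sector-sized buffers at 512 bytes: 3 * (512 + 128) */
	printf("%u\n", calc_buf_res(3, 512));	/* prints 1920 */
	return 0;
}

Summing calc_buf_res()-style terms per transaction type, as the write, truncate, rename and new quota reservation formulas above do, keeps the per-buffer overhead in one helper instead of scattering magic 128s through every formula.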