Diffstat (limited to 'fs')
144 files changed, 1487 insertions, 539 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 91dad63e5a2..2756dcd5de6 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -365,3 +365,4 @@ struct file_system_type v9fs_fs_type = {
 	.owner = THIS_MODULE,
 	.fs_flags = FS_RENAME_DOES_D_MOVE,
 };
+MODULE_ALIAS_FS("9p");
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index d5712293579..0ff4bae2c2a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -524,6 +524,7 @@ static struct file_system_type adfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("adfs");
 
 static int __init init_adfs_fs(void)
 {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b84dc735250..45161a832bb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -622,6 +622,7 @@ static struct file_system_type affs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("affs");
 
 static int __init init_affs_fs(void)
 {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7c31ec39957..c4861557e38 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -45,6 +45,7 @@ struct file_system_type afs_fs_type = {
 	.kill_sb	= afs_kill_super,
 	.fs_flags	= 0,
 };
+MODULE_ALIAS_FS("afs");
 
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index cddc74b9cdb..b3db517e89e 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -26,6 +26,7 @@ static struct file_system_type autofs_fs_type = {
 	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
+MODULE_ALIAS_FS("autofs");
 
 static int __init init_autofs4_fs(void)
 {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c8f4e25eb9e..8615ee89ab5 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -951,6 +951,7 @@ static struct file_system_type befs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,	
 };
+MODULE_ALIAS_FS("befs");
 
 static int __init
 init_befs_fs(void)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 737aaa3f709..5e376bb9341 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -473,6 +473,7 @@ static struct file_system_type bfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("bfs");
 
 static int __init init_bfs_fs(void)
 {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fecbbf3f8ff..751df5e4f61 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -720,6 +720,7 @@ static struct file_system_type bm_fs_type = {
 	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
 };
+MODULE_ALIAS_FS("binfmt_misc");
 
 static int __init init_misc_binfmt(void)
 {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aea605c98ba..aae187a7f94 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -551,6 +551,7 @@ struct block_device *bdgrab(struct block_device *bdev)
 	ihold(bdev->bd_inode);
 	return bdev;
 }
+EXPORT_SYMBOL(bdgrab);
 
 long nr_blockdev_pages(void)
 {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ecd25a1b4e5..ca9d8f1a3bb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return 0;
 
+	__tree_mod_log_free_eb(fs_info, old_root);
+
 	ret = tree_mod_alloc(fs_info, flags, &tm);
 	if (ret < 0)
 		goto out;
@@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 static noinline void
 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 		     struct extent_buffer *src, unsigned long dst_offset,
-		     unsigned long src_offset, int nr_items)
+		     unsigned long src_offset, int nr_items, int log_removal)
 {
 	int ret;
 	int i;
@@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	}
 
 	for (i = 0; i < nr_items; i++) {
-		ret = tree_mod_log_insert_key_locked(fs_info, src,
-						     i + src_offset,
-						     MOD_LOG_KEY_REMOVE);
-		BUG_ON(ret < 0);
+		if (log_removal) {
+			ret = tree_mod_log_insert_key_locked(fs_info, src,
+							i + src_offset,
+							MOD_LOG_KEY_REMOVE);
+			BUG_ON(ret < 0);
+		}
 		ret = tree_mod_log_insert_key_locked(fs_info, dst,
 						     i + dst_offset,
 						     MOD_LOG_KEY_ADD);
@@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		tree_mod_log_free_eb(root->fs_info, buf);
 		clean_tree_block(trans, root, buf);
 		*last_ref = 1;
 	}
@@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
+		tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
@@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			goto enospc;
 		}
 
-		tree_mod_log_free_eb(root->fs_info, root->node);
 		tree_mod_log_set_root_pointer(root, child);
 		rcu_assign_pointer(root->node, child);
 
@@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 		push_items = min(src_nritems - 8, push_items);
 
 	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
-			     push_items);
+			     push_items, 1);
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
 			   btrfs_node_key_ptr_offset(0),
@@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 				      sizeof(struct btrfs_key_ptr));
 
 	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
-			     src_nritems - push_items, push_items);
+			     src_nritems - push_items, push_items, 1);
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	int mid;
 	int ret;
 	u32 c_nritems;
+	int tree_mod_log_removal = 1;
 
 	c = path->nodes[level];
 	WARN_ON(btrfs_header_generation(c) != trans->transid);
 	if (c == root->node) {
 		/* trying to split the root, lets make a new one */
 		ret = insert_new_root(trans, root, path, level + 1);
+		/*
+		 * removal of root nodes has been logged by
+		 * tree_mod_log_set_root_pointer due to locking
+		 */
+		tree_mod_log_removal = 0;
 		if (ret)
 			return ret;
 	} else {
@@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
 
-	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
+	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid,
+			     tree_mod_log_removal);
 	copy_extent_buffer(split, c,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(mid),
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0b278b117cb..14fce27b478 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -22,8 +22,9 @@
 #include "disk-io.h"
 #include "transaction.h" -#define BTRFS_DELAYED_WRITEBACK		400 -#define BTRFS_DELAYED_BACKGROUND	100 +#define BTRFS_DELAYED_WRITEBACK		512 +#define BTRFS_DELAYED_BACKGROUND	128 +#define BTRFS_DELAYED_BATCH		16  static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,  					BTRFS_DELAYED_DELETION_ITEM);  } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ +	int seq = atomic_inc_return(&delayed_root->items_seq); +	if ((atomic_dec_return(&delayed_root->items) < +	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && +	    waitqueue_active(&delayed_root->wait)) +		wake_up(&delayed_root->wait); +} +  static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)  {  	struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)  	rb_erase(&delayed_item->rb_node, root);  	delayed_item->delayed_node->count--; -	if (atomic_dec_return(&delayed_root->items) < -	    BTRFS_DELAYED_BACKGROUND && -	    waitqueue_active(&delayed_root->wait)) -		wake_up(&delayed_root->wait); + +	finish_one_item(delayed_root);  }  static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1064,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)  		delayed_node->count--;  		delayed_root = delayed_node->root->fs_info->delayed_root; -		if (atomic_dec_return(&delayed_root->items) < -		    BTRFS_DELAYED_BACKGROUND && -		    waitqueue_active(&delayed_root->wait)) -			wake_up(&delayed_root->wait); +		finish_one_item(delayed_root);  	}  } @@ -1304,35 +1309,44 @@ void btrfs_remove_delayed_node(struct inode *inode)  	btrfs_release_delayed_node(delayed_node);  } -struct btrfs_async_delayed_node { -	struct btrfs_root *root; -	struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { +	struct btrfs_delayed_root *delayed_root; +	int nr;  	struct btrfs_work work;  }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work)  { -	struct btrfs_async_delayed_node *async_node; +	struct btrfs_async_delayed_work *async_work; +	struct btrfs_delayed_root *delayed_root;  	struct btrfs_trans_handle *trans;  	struct btrfs_path *path;  	struct btrfs_delayed_node *delayed_node = NULL;  	struct btrfs_root *root;  	struct btrfs_block_rsv *block_rsv; -	int need_requeue = 0; +	int total_done = 0; -	async_node = container_of(work, struct btrfs_async_delayed_node, work); +	async_work = container_of(work, struct btrfs_async_delayed_work, work); +	delayed_root = async_work->delayed_root;  	path = btrfs_alloc_path();  	if (!path)  		goto out; -	path->leave_spinning = 1; -	delayed_node = async_node->delayed_node; +again: +	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) +		goto free_path; + +	delayed_node = btrfs_first_prepared_delayed_node(delayed_root); +	if (!delayed_node) +		goto free_path; + +	path->leave_spinning = 1;  	root = delayed_node->root;  	trans = btrfs_join_transaction(root);  	if (IS_ERR(trans)) -		goto free_path; +		goto release_path;  	block_rsv = trans->block_rsv;  	trans->block_rsv = &root->fs_info->delayed_block_rsv; @@ -1363,57 +1377,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  	 * Task1 will sleep until the transaction is commited.  	 
*/  	mutex_lock(&delayed_node->mutex); -	if (delayed_node->count) -		need_requeue = 1; -	else -		btrfs_dequeue_delayed_node(root->fs_info->delayed_root, -					   delayed_node); +	btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);  	mutex_unlock(&delayed_node->mutex);  	trans->block_rsv = block_rsv;  	btrfs_end_transaction_dmeta(trans, root);  	btrfs_btree_balance_dirty_nodelay(root); + +release_path: +	btrfs_release_path(path); +	total_done++; + +	btrfs_release_prepared_delayed_node(delayed_node); +	if (async_work->nr == 0 || total_done < async_work->nr) +		goto again; +  free_path:  	btrfs_free_path(path);  out: -	if (need_requeue) -		btrfs_requeue_work(&async_node->work); -	else { -		btrfs_release_prepared_delayed_node(delayed_node); -		kfree(async_node); -	} +	wake_up(&delayed_root->wait); +	kfree(async_work);  } +  static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, -				     struct btrfs_root *root, int all) +				     struct btrfs_root *root, int nr)  { -	struct btrfs_async_delayed_node *async_node; -	struct btrfs_delayed_node *curr; -	int count = 0; +	struct btrfs_async_delayed_work *async_work; -again: -	curr = btrfs_first_prepared_delayed_node(delayed_root); -	if (!curr) +	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)  		return 0; -	async_node = kmalloc(sizeof(*async_node), GFP_NOFS); -	if (!async_node) { -		btrfs_release_prepared_delayed_node(curr); +	async_work = kmalloc(sizeof(*async_work), GFP_NOFS); +	if (!async_work)  		return -ENOMEM; -	} -	async_node->root = root; -	async_node->delayed_node = curr; - -	async_node->work.func = btrfs_async_run_delayed_node_done; -	async_node->work.flags = 0; - -	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); -	count++; - -	if (all || count < 4) -		goto again; +	async_work->delayed_root = delayed_root; +	async_work->work.func = btrfs_async_run_delayed_root; +	async_work->work.flags = 0; +	async_work->nr = nr; +	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);  	return 0;  } @@ -1424,30 +1428,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)  	WARN_ON(btrfs_first_delayed_node(delayed_root));  } +static int refs_newer(struct btrfs_delayed_root *delayed_root, +		      int seq, int count) +{ +	int val = atomic_read(&delayed_root->items_seq); + +	if (val < seq || val >= seq + count) +		return 1; +	return 0; +} +  void btrfs_balance_delayed_items(struct btrfs_root *root)  {  	struct btrfs_delayed_root *delayed_root; +	int seq;  	delayed_root = btrfs_get_delayed_root(root);  	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)  		return; +	seq = atomic_read(&delayed_root->items_seq); +  	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {  		int ret; -		ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); +		DEFINE_WAIT(__wait); + +		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);  		if (ret)  			return; -		wait_event_interruptible_timeout( -				delayed_root->wait, -				(atomic_read(&delayed_root->items) < -				 BTRFS_DELAYED_BACKGROUND), -				HZ); -		return; +		while (1) { +			prepare_to_wait(&delayed_root->wait, &__wait, +					TASK_INTERRUPTIBLE); + +			if (refs_newer(delayed_root, seq, +				       BTRFS_DELAYED_BATCH) || +			    atomic_read(&delayed_root->items) < +			    BTRFS_DELAYED_BACKGROUND) { +				break; +			} +			if (!signal_pending(current)) +				schedule(); +			else +				break; +		} +		finish_wait(&delayed_root->wait, &__wait);  	} -	
btrfs_wq_run_delayed_node(delayed_root, root, 0); +	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);  }  /* Will return 0 or -ENOMEM */ diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 78b6ad0fc66..1d5c5f7abe3 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -43,6 +43,7 @@ struct btrfs_delayed_root {  	 */  	struct list_head prepare_list;  	atomic_t items;		/* for delayed items */ +	atomic_t items_seq;	/* for delayed items */  	int nodes;		/* for delayed nodes */  	wait_queue_head_t wait;  }; @@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root(  				struct btrfs_delayed_root *delayed_root)  {  	atomic_set(&delayed_root->items, 0); +	atomic_set(&delayed_root->items_seq, 0);  	delayed_root->nodes = 0;  	spin_lock_init(&delayed_root->lock);  	init_waitqueue_head(&delayed_root->wait); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3c162..6d19a0a554a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,  static void btrfs_destroy_ordered_extents(struct btrfs_root *root);  static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  				      struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);  static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);  static int btrfs_destroy_marked_extents(struct btrfs_root *root,  					struct extent_io_tree *dirty_pages, @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  				      0, objectid, NULL, 0, 0, 0);  	if (IS_ERR(leaf)) {  		ret = PTR_ERR(leaf); +		leaf = NULL;  		goto fail;  	} @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  	btrfs_tree_unlock(leaf); +	return root; +  fail: -	if (ret) -		return ERR_PTR(ret); +	if (leaf) { +		btrfs_tree_unlock(leaf); +		free_extent_buffer(leaf); +	} +	kfree(root); -	return root; +	return ERR_PTR(ret);  }  static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, @@ -3253,7 +3259,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)  	if (btrfs_root_refs(&root->root_item) == 0)  		synchronize_srcu(&fs_info->subvol_srcu); -	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		btrfs_free_log(NULL, root);  		btrfs_free_log_root_tree(NULL, fs_info);  	} @@ -3687,7 +3693,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  	return ret;  } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)  {  	struct btrfs_pending_snapshot *snapshot;  	struct list_head splice; @@ -3700,10 +3706,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)  		snapshot = list_entry(splice.next,  				      struct btrfs_pending_snapshot,  				      list); - +		snapshot->error = -ECANCELED;  		list_del_init(&snapshot->list); - -		kfree(snapshot);  	}  } @@ -3840,6 +3844,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,  	cur_trans->blocked = 1;  	wake_up(&root->fs_info->transaction_blocked_wait); +	btrfs_evict_pending_snapshots(cur_trans); +  	cur_trans->blocked = 0;  	wake_up(&root->fs_info->transaction_wait); @@ -3849,8 +3855,6 @@ void btrfs_cleanup_one_transaction(struct 
btrfs_transaction *cur_trans,  	btrfs_destroy_delayed_inodes(root);  	btrfs_assert_delayed_root_empty(root); -	btrfs_destroy_pending_snapshots(cur_trans); -  	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,  				     EXTENT_DIRTY);  	btrfs_destroy_pinned_extent(root, @@ -3894,6 +3898,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))  			wake_up(&root->fs_info->transaction_blocked_wait); +		btrfs_evict_pending_snapshots(t); +  		t->blocked = 0;  		smp_mb();  		if (waitqueue_active(&root->fs_info->transaction_wait)) @@ -3907,8 +3913,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  		btrfs_destroy_delayed_inodes(root);  		btrfs_assert_delayed_root_empty(root); -		btrfs_destroy_pending_snapshots(t); -  		btrfs_destroy_delalloc_inodes(root);  		spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e074dab2d5..3d551231cab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root,  		cache->bytes_super += stripe_len;  		ret = add_excluded_extent(root, cache->key.objectid,  					  stripe_len); -		BUG_ON(ret); /* -ENOMEM */ +		if (ret) +			return ret;  	}  	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root,  		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,  				       cache->key.objectid, bytenr,  				       0, &logical, &nr, &stripe_len); -		BUG_ON(ret); /* -ENOMEM */ +		if (ret) +			return ret;  		while (nr--) {  			cache->bytes_super += stripe_len;  			ret = add_excluded_extent(root, logical[nr],  						  stripe_len); -			BUG_ON(ret); /* -ENOMEM */ +			if (ret) { +				kfree(logical); +				return ret; +			}  		}  		kfree(logical); @@ -1467,8 +1472,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,  	if (ret && !insert) {  		err = -ENOENT;  		goto out; +	} else if (ret) { +		err = -EIO; +		WARN_ON(1); +		goto out;  	} -	BUG_ON(ret); /* Corruption */  	leaf = path->nodes[0];  	item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -4435,7 +4443,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  	spin_lock(&sinfo->lock);  	spin_lock(&block_rsv->lock); -	block_rsv->size = num_bytes; +	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);  	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +  		    sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -4790,14 +4798,49 @@ out_fail:  	 * If the inodes csum_bytes is the same as the original  	 * csum_bytes then we know we haven't raced with any free()ers  	 * so we can just reduce our inodes csum bytes and carry on. -	 * Otherwise we have to do the normal free thing to account for -	 * the case that the free side didn't free up its reserve -	 * because of this outstanding reservation.  	 
*/ -	if (BTRFS_I(inode)->csum_bytes == csum_bytes) +	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {  		calc_csum_metadata_size(inode, num_bytes, 0); -	else -		to_free = calc_csum_metadata_size(inode, num_bytes, 0); +	} else { +		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; +		u64 bytes; + +		/* +		 * This is tricky, but first we need to figure out how much we +		 * free'd from any free-ers that occured during this +		 * reservation, so we reset ->csum_bytes to the csum_bytes +		 * before we dropped our lock, and then call the free for the +		 * number of bytes that were freed while we were trying our +		 * reservation. +		 */ +		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; +		BTRFS_I(inode)->csum_bytes = csum_bytes; +		to_free = calc_csum_metadata_size(inode, bytes, 0); + + +		/* +		 * Now we need to see how much we would have freed had we not +		 * been making this reservation and our ->csum_bytes were not +		 * artificially inflated. +		 */ +		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; +		bytes = csum_bytes - orig_csum_bytes; +		bytes = calc_csum_metadata_size(inode, bytes, 0); + +		/* +		 * Now reset ->csum_bytes to what it should be.  If bytes is +		 * more than to_free then we would have free'd more space had we +		 * not had an artificially high ->csum_bytes, so we need to free +		 * the remainder.  If bytes is the same or less then we don't +		 * need to do anything, the other free-ers did the correct +		 * thing. +		 */ +		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; +		if (bytes > to_free) +			to_free = bytes - to_free; +		else +			to_free = 0; +	}  	spin_unlock(&BTRFS_I(inode)->lock);  	if (dropped)  		to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -7944,7 +7987,17 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		 * info has super bytes accounted for, otherwise we'll think  		 * we have more space than we actually do.  		 */ -		exclude_super_stripes(root, cache); +		ret = exclude_super_stripes(root, cache); +		if (ret) { +			/* +			 * We may have excluded something, so call this just in +			 * case. +			 */ +			free_excluded_extents(root, cache); +			kfree(cache->free_space_ctl); +			kfree(cache); +			goto error; +		}  		/*  		 * check for two cases, either we are full, and therefore @@ -8086,7 +8139,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	cache->last_byte_to_unpin = (u64)-1;  	cache->cached = BTRFS_CACHE_FINISHED; -	exclude_super_stripes(root, cache); +	ret = exclude_super_stripes(root, cache); +	if (ret) { +		/* +		 * We may have excluded something, so call this just in +		 * case. 
+		 */ +		free_excluded_extents(root, cache); +		kfree(cache->free_space_ctl); +		kfree(cache); +		return ret; +	}  	add_new_free_space(cache, root->fs_info, chunk_offset,  			   chunk_offset + size); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af646..cdee391fc7b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)  				GFP_NOFS);  } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ +	unsigned long index = start >> PAGE_CACHE_SHIFT; +	unsigned long end_index = end >> PAGE_CACHE_SHIFT; +	struct page *page; + +	while (index <= end_index) { +		page = find_get_page(inode->i_mapping, index); +		BUG_ON(!page); /* Pages should be in the extent_io_tree */ +		clear_page_dirty_for_io(page); +		page_cache_release(page); +		index++; +	} +	return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ +	unsigned long index = start >> PAGE_CACHE_SHIFT; +	unsigned long end_index = end >> PAGE_CACHE_SHIFT; +	struct page *page; + +	while (index <= end_index) { +		page = find_get_page(inode->i_mapping, index); +		BUG_ON(!page); /* Pages should be in the extent_io_tree */ +		account_page_redirty(page); +		__set_page_dirty_nobuffers(page); +		page_cache_release(page); +		index++; +	} +	return 0; +} +  /*   * helper function to set both pages and extents in the tree writeback   */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a198556..258c9215685 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,  		      unsigned long *map_len);  int extent_range_uptodate(struct extent_io_tree *tree,  			  u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);  int extent_clear_unlock_delalloc(struct inode *inode,  				struct extent_io_tree *tree,  				u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3..c4628a201cb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,  		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);  		csums_in_item /= csum_size; -		if (csum_offset >= csums_in_item) { +		if (csum_offset == csums_in_item) {  			ret = -EFBIG;  			goto fail; +		} else if (csum_offset > csums_in_item) { +			goto fail;  		}  	}  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -728,7 +730,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	sector_sum = sums->sums; -	trans->adding_csums = 1;  again:  	next_offset = (u64)-1;  	found_next = 0; @@ -899,7 +900,6 @@ next_sector:  		goto again;  	}  out: -	trans->adding_csums = 0;  	btrfs_free_path(path);  	return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af1d0605a5c..ade03e6f7bd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -591,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  		}  		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		clear_bit(EXTENT_FLAG_PINNED, &em->flags); +		clear_bit(EXTENT_FLAG_LOGGING, &flags);  		remove_extent_mapping(em_tree, em);  		if (no_splits)  			goto next; @@ -2141,6 +2142,7 @@ static long btrfs_fallocate(struct file 
*file, int mode,  {  	struct inode *inode = file_inode(file);  	struct extent_state *cached_state = NULL; +	struct btrfs_root *root = BTRFS_I(inode)->root;  	u64 cur_offset;  	u64 last_byte;  	u64 alloc_start; @@ -2168,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode,  	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);  	if (ret)  		return ret; +	if (root->fs_info->quota_enabled) { +		ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); +		if (ret) +			goto out_reserve_fail; +	}  	/*  	 * wait for ordered IO before we have any locks.  We'll loop again @@ -2271,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode,  			     &cached_state, GFP_NOFS);  out:  	mutex_unlock(&inode->i_mutex); +	if (root->fs_info->quota_enabled) +		btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail:  	/* Let go of our reservation. */  	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);  	return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c226daefd65..09c58a35b42 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode,  	int i;  	int will_compress;  	int compress_type = root->fs_info->compress_type; +	int redirty = 0;  	/* if this is a small write inside eof, kick off a defrag */  	if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again:  		if (BTRFS_I(inode)->force_compress)  			compress_type = BTRFS_I(inode)->force_compress; +		/* +		 * we need to call clear_page_dirty_for_io on each +		 * page in the range.  Otherwise applications with the file +		 * mmap'd can wander in and change the page contents while +		 * we are compressing them. +		 * +		 * If the compression fails for any reason, we set the pages +		 * dirty again later on. 
+		 */ +		extent_range_clear_dirty_for_io(inode, start, end); +		redirty = 1;  		ret = btrfs_compress_pages(compress_type,  					   inode->i_mapping, start,  					   total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed:  			__set_page_dirty_nobuffers(locked_page);  			/* unlocked later on in the async handlers */  		} +		if (redirty) +			extent_range_redirty_for_io(inode, start, end);  		add_async_extent(async_cow, start, end - start + 1,  				 0, NULL, 0, BTRFS_COMPRESS_NONE);  		*num_added += 1; @@ -1743,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,  	struct btrfs_ordered_sum *sum;  	list_for_each_entry(sum, list, list) { +		trans->adding_csums = 1;  		btrfs_csum_file_blocks(trans,  		       BTRFS_I(inode)->root->fs_info->csum_root, sum); +		trans->adding_csums = 0;  	}  	return 0;  } @@ -2312,6 +2328,7 @@ again:  	key.type = BTRFS_EXTENT_DATA_KEY;  	key.offset = start; +	path->leave_spinning = 1;  	if (merge) {  		struct btrfs_file_extent_item *fi;  		u64 extent_len; @@ -2368,6 +2385,7 @@ again:  	btrfs_mark_buffer_dirty(leaf);  	inode_add_bytes(inode, len); +	btrfs_release_path(path);  	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,  			new->disk_len, 0, @@ -2381,6 +2399,7 @@ again:  	ret = 1;  out_free_path:  	btrfs_release_path(path); +	path->leave_spinning = 0;  	btrfs_end_transaction(trans, root);  out_unlock:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, @@ -3676,11 +3695,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  	 * 1 for the dir item  	 * 1 for the dir index  	 * 1 for the inode ref -	 * 1 for the inode ref in the tree log -	 * 2 for the dir entries in the log  	 * 1 for the inode  	 */ -	trans = btrfs_start_transaction(root, 8); +	trans = btrfs_start_transaction(root, 5);  	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)  		return trans; @@ -8124,7 +8141,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items  	 * should cover the worst case number of items we'll modify.  	 
*/ -	trans = btrfs_start_transaction(root, 20); +	trans = btrfs_start_transaction(root, 11);  	if (IS_ERR(trans)) {                  ret = PTR_ERR(trans);                  goto out_notrans; @@ -8502,6 +8519,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  	struct btrfs_key ins;  	u64 cur_offset = start;  	u64 i_size; +	u64 cur_bytes;  	int ret = 0;  	bool own_trans = true; @@ -8516,8 +8534,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  			}  		} -		ret = btrfs_reserve_extent(trans, root, -					   min(num_bytes, 256ULL * 1024 * 1024), +		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); +		cur_bytes = max(cur_bytes, min_size); +		ret = btrfs_reserve_extent(trans, root, cur_bytes,  					   min_size, 0, *alloc_hint, &ins, 1);  		if (ret) {  			if (own_trans) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c83086fdda0..2c02310ff2d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,8 @@ fail:  	if (async_transid) {  		*async_transid = trans->transid;  		err = btrfs_commit_transaction_async(trans, root, 1); +		if (err) +			err = btrfs_commit_transaction(trans, root);  	} else {  		err = btrfs_commit_transaction(trans, root);  	} @@ -592,16 +594,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  		*async_transid = trans->transid;  		ret = btrfs_commit_transaction_async(trans,  				     root->fs_info->extent_root, 1); +		if (ret) +			ret = btrfs_commit_transaction(trans, root);  	} else {  		ret = btrfs_commit_transaction(trans,  					       root->fs_info->extent_root);  	} -	if (ret) { -		/* cleanup_transaction has freed this for us */ -		if (trans->aborted) -			pending_snapshot = NULL; +	if (ret)  		goto fail; -	}  	ret = pending_snapshot->error;  	if (ret) @@ -2245,13 +2245,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  	if (ret)  		return ret; -	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, -			1)) { -		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); -		mnt_drop_write_file(file); -		return -EINVAL; -	} -  	if (btrfs_root_readonly(root)) {  		ret = -EROFS;  		goto out; @@ -2306,7 +2299,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  		ret = -EINVAL;  	}  out: -	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);  	mnt_drop_write_file(file);  	return ret;  } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ca52681e5f4..b81e0e9a489 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,7 +26,6 @@  void btrfs_tree_lock(struct extent_buffer *eb);  void btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb);  void btrfs_tree_read_lock(struct extent_buffer *eb);  void btrfs_tree_read_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717..005c45db699 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)  	INIT_LIST_HEAD(&splice);  	INIT_LIST_HEAD(&works); +	mutex_lock(&root->fs_info->ordered_operations_mutex);  	spin_lock(&root->fs_info->ordered_extent_lock);  	list_splice_init(&root->fs_info->ordered_extents, &splice);  	while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)  		cond_resched();  	} +	mutex_unlock(&root->fs_info->ordered_operations_mutex);  }  /* diff --git 
a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aee4b1cc3d9..b44124dd237 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,  				   sgn > 0 ? node->seq - 1 : node->seq, &roots);  	if (ret < 0) -		goto out; +		return ret;  	spin_lock(&fs_info->qgroup_lock);  	quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  	ret = 0;  unlock:  	spin_unlock(&fs_info->qgroup_lock); -out:  	ulist_free(roots);  	ulist_free(tmp); @@ -1525,21 +1524,23 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&  		    qg->reserved + qg->rfer + num_bytes > -		    qg->max_rfer) +		    qg->max_rfer) {  			ret = -EDQUOT; +			goto out; +		}  		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&  		    qg->reserved + qg->excl + num_bytes > -		    qg->max_excl) +		    qg->max_excl) {  			ret = -EDQUOT; +			goto out; +		}  		list_for_each_entry(glist, &qg->groups, next_group) {  			ulist_add(ulist, glist->group->qgroupid,  				  (uintptr_t)glist->group, GFP_ATOMIC);  		}  	} -	if (ret) -		goto out;  	/*  	 * no limits exceeded, now record the reservation into all qgroups diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 50695dc5e2a..b67171e6d68 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1269,6 +1269,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)  	}  	spin_unlock(&rc->reloc_root_tree.lock); +	if (!node) +		return 0;  	BUG_ON((struct btrfs_root *)node->data != root);  	if (!del) { @@ -2238,13 +2240,28 @@ again:  }  static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ +	struct btrfs_root *reloc_root; + +	while (!list_empty(list)) { +		reloc_root = list_entry(list->next, struct btrfs_root, +					root_list); +		__update_reloc_root(reloc_root, 1); +		free_extent_buffer(reloc_root->node); +		free_extent_buffer(reloc_root->commit_root); +		kfree(reloc_root); +	} +} + +static noinline_for_stack  int merge_reloc_roots(struct reloc_control *rc)  {  	struct btrfs_root *root;  	struct btrfs_root *reloc_root;  	LIST_HEAD(reloc_roots);  	int found = 0; -	int ret; +	int ret = 0;  again:  	root = rc->extent_root; @@ -2270,20 +2287,33 @@ again:  			BUG_ON(root->reloc_root != reloc_root);  			ret = merge_reloc_root(rc, root); -			BUG_ON(ret); +			if (ret) +				goto out;  		} else {  			list_del_init(&reloc_root->root_list);  		}  		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); -		BUG_ON(ret < 0); +		if (ret < 0) { +			if (list_empty(&reloc_root->root_list)) +				list_add_tail(&reloc_root->root_list, +					      &reloc_roots); +			goto out; +		}  	}  	if (found) {  		found = 0;  		goto again;  	} +out: +	if (ret) { +		btrfs_std_error(root->fs_info, ret); +		if (!list_empty(&reloc_roots)) +			free_reloc_roots(&reloc_roots); +	} +  	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); -	return 0; +	return ret;  }  static void free_block_list(struct rb_root *blocks) @@ -2818,8 +2848,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,  	int err = 0;  	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; +	if (!path) { +		err = -ENOMEM; +		goto out_path; +	}  	rb_node = rb_first(blocks);  	while (rb_node) { @@ -2858,10 +2890,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,  		rb_node = rb_next(rb_node);  	}  out: -	
free_block_list(blocks);  	err = finish_pending_nodes(trans, rc, path, err);  	btrfs_free_path(path); +out_path: +	free_block_list(blocks);  	return err;  } @@ -3698,7 +3731,15 @@ int prepare_to_relocate(struct reloc_control *rc)  	set_reloc_control(rc);  	trans = btrfs_join_transaction(rc->extent_root); -	BUG_ON(IS_ERR(trans)); +	if (IS_ERR(trans)) { +		unset_reloc_control(rc); +		/* +		 * extent tree is not a ref_cow tree and has no reloc_root to +		 * cleanup.  And callers are responsible to free the above +		 * block rsv. +		 */ +		return PTR_ERR(trans); +	}  	btrfs_commit_transaction(trans, rc->extent_root);  	return 0;  } @@ -3730,7 +3771,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)  	while (1) {  		progress++;  		trans = btrfs_start_transaction(rc->extent_root, 0); -		BUG_ON(IS_ERR(trans)); +		if (IS_ERR(trans)) { +			err = PTR_ERR(trans); +			trans = NULL; +			break; +		}  restart:  		if (update_backref_cache(trans, &rc->backref_cache)) {  			btrfs_end_transaction(trans, rc->extent_root); @@ -4264,14 +4309,9 @@ int btrfs_recover_relocation(struct btrfs_root *root)  out_free:  	kfree(rc);  out: -	while (!list_empty(&reloc_roots)) { -		reloc_root = list_entry(reloc_roots.next, -					struct btrfs_root, root_list); -		list_del(&reloc_root->root_list); -		free_extent_buffer(reloc_root->node); -		free_extent_buffer(reloc_root->commit_root); -		kfree(reloc_root); -	} +	if (!list_empty(&reloc_roots)) +		free_reloc_roots(&reloc_roots); +  	btrfs_free_path(path);  	if (err == 0) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4c..85e072b956d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	eb = path->nodes[0];  	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);  	item_size = btrfs_item_size_nr(eb, path->slots[0]); -	btrfs_release_path(path);  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  				ret < 0 ? -1 : ref_level,  				ret < 0 ? 
-1 : ref_root);  		} while (ret != 1); +		btrfs_release_path(path);  	} else { +		btrfs_release_path(path);  		swarn.path = path;  		swarn.dev = dev;  		iterate_extent_inodes(fs_info, found_key.objectid, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7a8b861058..c85e7c6b459 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx,  		    found_key.type != key.type) {  			key.offset += right_len;  			break; -		} else { -			if (found_key.offset != key.offset + right_len) { -				/* Should really not happen */ -				ret = -EIO; -				goto out; -			} +		} +		if (found_key.offset != key.offset + right_len) { +			ret = 0; +			goto out;  		}  		key = found_key;  	} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68a29a1ea06..f6b88595f85 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1558,6 +1558,7 @@ static struct file_system_type btrfs_fs_type = {  	.kill_sb	= btrfs_kill_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("btrfs");  /*   * used by btrfsctl to scan devices when no FS is mounted diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e52da6fb116..50767bbaad6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -625,14 +625,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; -	/* -	 * the same root has to be passed to start_transaction and -	 * end_transaction. Subvolume quota depends on this. -	 */ -	WARN_ON(trans->root != root);  	if (trans->qgroup_reserved) { -		btrfs_qgroup_free(root, trans->qgroup_reserved); +		/* +		 * the same root has to be passed here between start_transaction +		 * and end_transaction. Subvolume quota depends on this. +		 */ +		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);  		trans->qgroup_reserved = 0;  	} @@ -1052,7 +1051,12 @@ int btrfs_defrag_root(struct btrfs_root *root)  /*   * new snapshots need to be created at a very specific time in the - * transaction commit.  This does the actual creation + * transaction commit.  This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0.   
*/  static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  				   struct btrfs_fs_info *fs_info, @@ -1071,7 +1075,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	struct extent_buffer *tmp;  	struct extent_buffer *old;  	struct timespec cur_time = CURRENT_TIME; -	int ret; +	int ret = 0;  	u64 to_reserve = 0;  	u64 index = 0;  	u64 objectid; @@ -1080,40 +1084,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	path = btrfs_alloc_path();  	if (!path) { -		ret = pending->error = -ENOMEM; -		return ret; +		pending->error = -ENOMEM; +		return 0;  	}  	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);  	if (!new_root_item) { -		ret = pending->error = -ENOMEM; +		pending->error = -ENOMEM;  		goto root_item_alloc_fail;  	} -	ret = btrfs_find_free_objectid(tree_root, &objectid); -	if (ret) { -		pending->error = ret; +	pending->error = btrfs_find_free_objectid(tree_root, &objectid); +	if (pending->error)  		goto no_free_objectid; -	}  	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);  	if (to_reserve > 0) { -		ret = btrfs_block_rsv_add(root, &pending->block_rsv, -					  to_reserve, -					  BTRFS_RESERVE_NO_FLUSH); -		if (ret) { -			pending->error = ret; +		pending->error = btrfs_block_rsv_add(root, +						     &pending->block_rsv, +						     to_reserve, +						     BTRFS_RESERVE_NO_FLUSH); +		if (pending->error)  			goto no_free_objectid; -		}  	} -	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, -				   objectid, pending->inherit); -	if (ret) { -		pending->error = ret; +	pending->error = btrfs_qgroup_inherit(trans, fs_info, +					      root->root_key.objectid, +					      objectid, pending->inherit); +	if (pending->error)  		goto no_free_objectid; -	}  	key.objectid = objectid;  	key.offset = (u64)-1; @@ -1141,7 +1141,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  					 dentry->d_name.len, 0);  	if (dir_item != NULL && !IS_ERR(dir_item)) {  		pending->error = -EEXIST; -		goto fail; +		goto dir_item_existed;  	} else if (IS_ERR(dir_item)) {  		ret = PTR_ERR(dir_item);  		btrfs_abort_transaction(trans, root, ret); @@ -1272,6 +1272,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	if (ret)  		btrfs_abort_transaction(trans, root, ret);  fail: +	pending->error = ret; +dir_item_existed:  	trans->block_rsv = rsv;  	trans->bytes_reserved = 0;  no_free_objectid: @@ -1287,12 +1289,17 @@ root_item_alloc_fail:  static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,  					     struct btrfs_fs_info *fs_info)  { -	struct btrfs_pending_snapshot *pending; +	struct btrfs_pending_snapshot *pending, *next;  	struct list_head *head = &trans->transaction->pending_snapshots; +	int ret = 0; -	list_for_each_entry(pending, head, list) -		create_pending_snapshot(trans, fs_info, pending); -	return 0; +	list_for_each_entry_safe(pending, next, head, list) { +		list_del(&pending->list); +		ret = create_pending_snapshot(trans, fs_info, pending); +		if (ret) +			break; +	} +	return ret;  }  static void update_super_roots(struct btrfs_root *root) @@ -1448,6 +1455,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,  	btrfs_abort_transaction(trans, root, err);  	spin_lock(&root->fs_info->trans_lock); + +	if (list_empty(&cur_trans->list)) { +		spin_unlock(&root->fs_info->trans_lock); +		btrfs_end_transaction(trans, root); +		return; +	} +  	list_del_init(&cur_trans->list);  	if (cur_trans == 
root->fs_info->running_transaction) {  		root->fs_info->trans_no_join = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c7ef569eb22..451fad96ecd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1382,7 +1382,10 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,  	btrfs_release_path(path);  	if (ret == 0) { -		btrfs_inc_nlink(inode); +		if (!inode->i_nlink) +			set_nlink(inode, 1); +		else +			btrfs_inc_nlink(inode);  		ret = btrfs_update_inode(trans, root, inode);  	} else if (ret == -EEXIST) {  		ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 35bb2d4ed29..2854c824ab6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -684,6 +684,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  		__btrfs_close_devices(fs_devices);  		free_fs_devices(fs_devices);  	} +	/* +	 * Wait for rcu kworkers under __btrfs_close_devices +	 * to finish all blkdev_puts so device is really +	 * free when umount is done. +	 */ +	rcu_barrier();  	return ret;  } @@ -2379,7 +2385,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,  		return ret;  	trans = btrfs_start_transaction(root, 0); -	BUG_ON(IS_ERR(trans)); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		btrfs_std_error(root->fs_info, ret); +		return ret; +	}  	lock_chunks(root); @@ -3050,7 +3060,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)  	unset_balance_control(fs_info);  	ret = del_balance_item(fs_info->tree_root); -	BUG_ON(ret); +	if (ret) +		btrfs_std_error(fs_info, ret);  	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);  } @@ -3230,6 +3241,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		update_ioctl_balance_args(fs_info, 0, bargs);  	} +	if ((ret && ret != -ECANCELED && ret != -ENOSPC) || +	    balance_need_close(fs_info)) { +		__cancel_balance(fs_info); +	} +  	wake_up(&fs_info->balance_wait_q);  	return ret; @@ -4919,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	em = lookup_extent_mapping(em_tree, chunk_start, 1);  	read_unlock(&em_tree->lock); -	BUG_ON(!em || em->start != chunk_start); +	if (!em) { +		printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", +		       chunk_start); +		return -EIO; +	} + +	if (em->start != chunk_start) { +		printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", +		       em->start, chunk_start); +		free_extent_map(em); +		return -EIO; +	}  	map = (struct map_lookup *)em->bdev;  	length = em->len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9fe17c6c287..6ddc0bca56b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -952,6 +952,7 @@ static struct file_system_type ceph_fs_type = {  	.kill_sb	= ceph_kill_sb,  	.fs_flags	= FS_RENAME_DOES_D_MOVE,  }; +MODULE_ALIAS_FS("ceph");  #define _STRINGIFY(x) #x  #define STRINGIFY(x) _STRINGIFY(x) diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0b..1d36db11477 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length,  		}  	} -	/* mechlistMIC */ -	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { -		/* Check if we have reached the end of the blob, but with -		   no mechListMic (e.g. 
NTLMSSP instead of KRB5) */ -		if (ctx.error == ASN1_ERR_DEC_EMPTY) -			goto decode_negtoken_exit; -		cFYI(1, "Error decoding last part negTokenInit exit3"); -		return 0; -	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { -		/* tag = 3 indicating mechListMIC */ -		cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", -			cls, con, tag, end, *end); -		return 0; -	} - -	/* sequence */ -	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { -		cFYI(1, "Error decoding last part negTokenInit exit5"); -		return 0; -	} else if ((cls != ASN1_UNI) || (con != ASN1_CON) -		   || (tag != ASN1_SEQ)) { -		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", -			cls, con, tag, end, *end); -	} - -	/* sequence of */ -	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { -		cFYI(1, "Error decoding last part negTokenInit exit 7"); -		return 0; -	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { -		cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", -			cls, con, tag, end, *end); -		return 0; -	} - -	/* general string */ -	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { -		cFYI(1, "Error decoding last part negTokenInit exit9"); -		return 0; -	} else if ((cls != ASN1_UNI) || (con != ASN1_PRI) -		   || (tag != ASN1_GENSTR)) { -		cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", -			cls, con, tag, end, *end); -		return 0; -	} -	cFYI(1, "Need to call asn1_octets_decode() function for %s", -		ctx.pointer);	/* is this UTF-8 or ASCII? */ -decode_negtoken_exit: +	/* +	 * We currently ignore anything at the end of the SPNEGO blob after +	 * the mechTypes have been parsed, since none of that info is +	 * used at the moment. +	 */  	return 1;  } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8..345fc89c428 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct	*cifsiod_wq;  __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];  #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. 
+ */ +void +cifs_sb_active(struct super_block *sb) +{ +	struct cifs_sb_info *server = CIFS_SB(sb); + +	if (atomic_inc_return(&server->active) == 1) +		atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ +	struct cifs_sb_info *server = CIFS_SB(sb); + +	if (atomic_dec_and_test(&server->active)) +		deactivate_super(sb); +} +  static int  cifs_read_super(struct super_block *sb)  { @@ -777,6 +801,7 @@ struct file_system_type cifs_fs_type = {  	.kill_sb = cifs_kill_sb,  	/*  .fs_flags */  }; +MODULE_ALIAS_FS("cifs");  const struct inode_operations cifs_dir_inode_ops = {  	.create = cifs_create,  	.atomic_open = cifs_atomic_open, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd..0e32c3446ce 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type;  extern const struct address_space_operations cifs_addr_ops;  extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); +  /* Functions related to inodes */  extern const struct inode_operations cifs_dir_inode_ops;  extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7353bc5d73d..8e2e799e7a2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1909,12 +1909,12 @@ cifs_writev_requeue(struct cifs_writedata *wdata)  	} while (rc == -EAGAIN);  	for (i = 0; i < wdata->nr_pages; i++) { +		unlock_page(wdata->pages[i]);  		if (rc != 0) {  			SetPageError(wdata->pages[i]);  			end_page_writeback(wdata->pages[i]);  			page_cache_release(wdata->pages[i]);  		} -		unlock_page(wdata->pages[i]);  	}  	mapping_set_error(inode->i_mapping, rc); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 54125e04fd0..991c63c6bdd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,7 +97,7 @@ enum {  	Opt_user, Opt_pass, Opt_ip,  	Opt_unc, Opt_domain,  	Opt_srcaddr, Opt_prefixpath, -	Opt_iocharset, Opt_sockopt, +	Opt_iocharset,  	Opt_netbiosname, Opt_servern,  	Opt_ver, Opt_vers, Opt_sec, Opt_cache, @@ -202,7 +202,6 @@ static const match_table_t cifs_mount_option_tokens = {  	{ Opt_srcaddr, "srcaddr=%s" },  	{ Opt_prefixpath, "prefixpath=%s" },  	{ Opt_iocharset, "iocharset=%s" }, -	{ Opt_sockopt, "sockopt=%s" },  	{ Opt_netbiosname, "netbiosname=%s" },  	{ Opt_servern, "servern=%s" },  	{ Opt_ver, "ver=%s" }, @@ -1752,19 +1751,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,  			 */  			cFYI(1, "iocharset set to %s", string);  			break; -		case Opt_sockopt: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; - -			if (strnicmp(string, "TCP_NODELAY", 11) == 0) { -				printk(KERN_WARNING "CIFS: the " -					"sockopt=TCP_NODELAY option has been " -					"deprecated and will be removed " -					"in 3.9\n"); -				vol->sockopt_tcp_nodelay = 1; -			} -			break;  		case Opt_netbiosname:  			string = match_strdup(args);  			if (string == NULL) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d8557731..7a0dd99e450 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,  	INIT_WORK(&cfile->oplock_break, cifs_oplock_break);  	mutex_init(&cfile->fh_mutex); +	cifs_sb_active(inode->i_sb); +  	/*  	 * If the server returned a read oplock and we have mandatory brlocks,  	 * set oplock level to None. 
@@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)  	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);  	struct TCP_Server_Info *server = tcon->ses->server;  	struct cifsInodeInfo *cifsi = CIFS_I(inode); -	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); +	struct super_block *sb = inode->i_sb; +	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);  	struct cifsLockInfo *li, *tmp;  	struct cifs_fid fid;  	struct cifs_pending_open open; @@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)  	cifs_put_tlink(cifs_file->tlink);  	dput(cifs_file->dentry); +	cifs_sb_deactive(sb);  	kfree(cifs_file);  } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 83f2606c76d..20887bf6312 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -995,6 +995,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,  		return PTR_ERR(tlink);  	tcon = tlink_tcon(tlink); +	/* +	 * We cannot rename the file if the server doesn't support +	 * CAP_INFOLEVEL_PASSTHRU +	 */ +	if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) { +		rc = -EBUSY; +		goto out; +	} +  	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,  			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,  			 &netfid, &oplock, NULL, cifs_sb->local_nls, @@ -1023,7 +1032,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,  					current->tgid);  		/* although we would like to mark the file hidden   		   if that fails we will still try to rename it */ -		if (rc != 0) +		if (!rc)  			cifsInode->cifsAttrs = dosattr;  		else  			dosattr = origattr; /* since not able to change them */ @@ -1034,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,  				   cifs_sb->mnt_cifs_flags &  					    CIFS_MOUNT_MAP_SPECIAL_CHR);  	if (rc != 0) { -		rc = -ETXTBSY; +		rc = -EBUSY;  		goto undo_setattr;  	} @@ -1053,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,  		if (rc == -ENOENT)  			rc = 0;  		else if (rc != 0) { -			rc = -ETXTBSY; +			rc = -EBUSY;  			goto undo_rename;  		}  		cifsInode->delete_pending = true; @@ -1160,15 +1169,13 @@ psx_del_no_retry:  			cifs_drop_nlink(inode);  	} else if (rc == -ENOENT) {  		d_drop(dentry); -	} else if (rc == -ETXTBSY) { +	} else if (rc == -EBUSY) {  		if (server->ops->rename_pending_delete) {  			rc = server->ops->rename_pending_delete(full_path,  								dentry, xid);  			if (rc == 0)  				cifs_drop_nlink(inode);  		} -		if (rc == -ETXTBSY) -			rc = -EBUSY;  	} else if ((rc == -EACCES) && (dosattr == 0) && inode) {  		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);  		if (attrs == NULL) { @@ -1509,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,  	 * source. Note that cross directory moves do not work with  	 * rename by filehandle to various Windows servers.  	 
*/ -	if (rc == 0 || rc != -ETXTBSY) +	if (rc == 0 || rc != -EBUSY)  		goto do_rename_exit;  	/* open-file renames don't work across directories */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc8..c0b25b28be6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {  	{ERRdiffdevice, -EXDEV},  	{ERRnofiles, -ENOENT},  	{ERRwriteprot, -EROFS}, -	{ERRbadshare, -ETXTBSY}, +	{ERRbadshare, -EBUSY},  	{ERRlock, -EACCES},  	{ERRunsup, -EINVAL},  	{ERRnosuchshare, -ENXIO}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c9c7aa7ed96..bceffe7b8f8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -744,4 +744,5 @@ struct smb_version_values smb30_values = {  	.cap_unix = 0,  	.cap_nt_find = SMB2_NT_FIND,  	.cap_large_files = SMB2_LARGE_FILES, +	.oplock_read = SMB2_OPLOCK_LEVEL_II,  }; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index dada9d0abed..4dcc0d81a7a 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -329,4 +329,5 @@ struct file_system_type coda_fs_type = {  	.kill_sb	= kill_anon_super,  	.fs_flags	= FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("coda"); diff --git a/fs/compat.c b/fs/compat.c index fe40fde2911..d487985dd0e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -558,6 +558,10 @@ ssize_t compat_rw_copy_check_uvector(int type,  	}  	*ret_pointer = iov; +	ret = -EFAULT; +	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) +		goto out; +  	/*  	 * Single unix specification:  	 * We should -EINVAL if an element length is not >= 0 and fitting an @@ -1080,17 +1084,12 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,  	if (!file->f_op)  		goto out; -	ret = -EFAULT; -	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) -		goto out; - -	tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, +	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,  					       UIO_FASTIOV, iovstack, &iov); -	if (tot_len == 0) { -		ret = 0; +	if (ret <= 0)  		goto out; -	} +	tot_len = ret;  	ret = rw_verify_area(type, file, pos, tot_len);  	if (ret < 0)  		goto out; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index aee0a7ebbd8..7f26c3cf75a 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -114,6 +114,7 @@ static struct file_system_type configfs_fs_type = {  	.mount		= configfs_do_mount,  	.kill_sb	= kill_litter_super,  }; +MODULE_ALIAS_FS("configfs");  struct dentry *configfs_pin_fs(void)  { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 3ceb9ec976e..35b1c7bd18b 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -573,6 +573,7 @@ static struct file_system_type cramfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("cramfs");  static int __init init_cramfs_fs(void)  { diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba4..e8bc3420d63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2542,7 +2542,6 @@ static int prepend_path(const struct path *path,  	bool slash = false;  	int error = 0; -	br_read_lock(&vfsmount_lock);  	while (dentry != root->dentry || vfsmnt != root->mnt) {  		struct dentry * parent; @@ -2572,8 +2571,6 @@ static int prepend_path(const struct path *path,  	if (!error && !slash)  		error = prepend(buffer, buflen, "/", 1); -out: -	br_read_unlock(&vfsmount_lock);  	return error;  global_root: @@ -2590,7 +2587,7 @@ global_root:  		error = prepend(buffer, buflen, "/", 1);  	if (!error)  		error = is_mounted(vfsmnt) ? 
1 : 2; -	goto out; +	return error;  }  /** @@ -2617,9 +2614,11 @@ char *__d_path(const struct path *path,  	int error;  	prepend(&res, &buflen, "\0", 1); +	br_read_lock(&vfsmount_lock);  	write_seqlock(&rename_lock);  	error = prepend_path(path, root, &res, &buflen);  	write_sequnlock(&rename_lock); +	br_read_unlock(&vfsmount_lock);  	if (error < 0)  		return ERR_PTR(error); @@ -2636,9 +2635,11 @@ char *d_absolute_path(const struct path *path,  	int error;  	prepend(&res, &buflen, "\0", 1); +	br_read_lock(&vfsmount_lock);  	write_seqlock(&rename_lock);  	error = prepend_path(path, &root, &res, &buflen);  	write_sequnlock(&rename_lock); +	br_read_unlock(&vfsmount_lock);  	if (error > 1)  		error = -EINVAL; @@ -2702,11 +2703,13 @@ char *d_path(const struct path *path, char *buf, int buflen)  		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);  	get_fs_root(current->fs, &root); +	br_read_lock(&vfsmount_lock);  	write_seqlock(&rename_lock);  	error = path_with_deleted(path, &root, &res, &buflen); +	write_sequnlock(&rename_lock); +	br_read_unlock(&vfsmount_lock);  	if (error < 0)  		res = ERR_PTR(error); -	write_sequnlock(&rename_lock);  	path_put(&root);  	return res;  } @@ -2830,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)  	get_fs_root_and_pwd(current->fs, &root, &pwd);  	error = -ENOENT; +	br_read_lock(&vfsmount_lock);  	write_seqlock(&rename_lock);  	if (!d_unlinked(pwd.dentry)) {  		unsigned long len; @@ -2839,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)  		prepend(&cwd, &buflen, "\0", 1);  		error = prepend_path(&pwd, &root, &cwd, &buflen);  		write_sequnlock(&rename_lock); +		br_read_unlock(&vfsmount_lock);  		if (error < 0)  			goto out; @@ -2859,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)  		}  	} else {  		write_sequnlock(&rename_lock); +		br_read_unlock(&vfsmount_lock);  	}  out: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0c4f80b447f..4888cb3fdef 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,6 +299,7 @@ static struct file_system_type debug_fs_type = {  	.mount =	debug_mount,  	.kill_sb =	kill_litter_super,  }; +MODULE_ALIAS_FS("debugfs");  static struct dentry *__create_file(const char *name, umode_t mode,  				    struct dentry *parent, void *data, diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index e15ef38c24f..434aa313f07 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -12,3 +12,11 @@ config ECRYPT_FS  	  To compile this file system support as a module, choose M here: the  	  module will be called ecryptfs. + +config ECRYPT_FS_MESSAGING +	bool "Enable notifications for userspace key wrap/unwrap" +	depends on ECRYPT_FS +	help +	  Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows +	  for userspace to wrap/unwrap file encryption keys by other +	  backends, like OpenSSL. 
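The dcache.c hunks stop taking vfsmount_lock inside prepend_path() and instead have every caller (__d_path, d_absolute_path, d_path and getcwd) take it outside rename_lock, so vfsmount_lock is now consistently nested outside the rename seqlock. The caller-side bracket, as it now appears at each of the four sites:

br_read_lock(&vfsmount_lock);		/* stabilise the mount tree */
write_seqlock(&rename_lock);		/* exclude concurrent renames */
error = prepend_path(path, root, &res, &buflen);
write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);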
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 2cc9ee4ad2e..49678a69947 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -1,7 +1,10 @@  # -# Makefile for the Linux 2.6 eCryptfs +# Makefile for the Linux eCryptfs  #  obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o +ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \ +	      crypto.o keystore.o kthread.o debug.o + +ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a7b0c2dfb3d..d5c25db4398 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,  	while (size > 0 && i < sg_size) {  		pg = virt_to_page(addr);  		offset = offset_in_page(addr); -		if (sg) -			sg_set_page(&sg[i], pg, 0, offset); +		sg_set_page(&sg[i], pg, 0, offset);  		remainder_of_page = PAGE_CACHE_SIZE - offset;  		if (size >= remainder_of_page) { -			if (sg) -				sg[i].length = remainder_of_page; +			sg[i].length = remainder_of_page;  			addr += remainder_of_page;  			size -= remainder_of_page;  		} else { -			if (sg) -				sg[i].length = size; +			sg[i].length = size;  			addr += size;  			size = 0;  		} diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1b5d9af937d..bf12ba5dd22 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,14 +45,12 @@  static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)  {  	struct dentry *lower_dentry; -	struct vfsmount *lower_mnt;  	int rc = 1;  	if (flags & LOOKUP_RCU)  		return -ECHILD;  	lower_dentry = ecryptfs_dentry_to_lower(dentry); -	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);  	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)  		goto out;  	rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 7e2c6f5d798..dd299b389d4 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -172,6 +172,19 @@ ecryptfs_get_key_payload_data(struct key *key)  #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24  #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) +#ifdef CONFIG_ECRYPT_FS_MESSAGING +# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ +					     | ECRYPTFS_VERSIONING_PUBKEY) +#else +# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 +#endif + +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ +				  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ +				  | ECRYPTFS_VERSIONING_XATTR \ +				  | ECRYPTFS_VERSIONING_MULTKEY \ +				  | ECRYPTFS_VERSIONING_MASK_MESSAGING \ +				  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)  struct ecryptfs_key_sig {  	struct list_head crypt_stat_list;  	char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; @@ -399,7 +412,9 @@ struct ecryptfs_daemon {  	struct hlist_node euid_chain;  }; +#ifdef CONFIG_ECRYPT_FS_MESSAGING  extern struct mutex ecryptfs_daemon_hash_mux; +#endif  static inline size_t  ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) @@ -610,6 +625,7 @@ int  ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,  		  size_t size, int flags);  int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +#ifdef CONFIG_ECRYPT_FS_MESSAGING  int ecryptfs_process_response(struct 
ecryptfs_daemon *daemon,  			      struct ecryptfs_message *msg, u32 seq);  int ecryptfs_send_message(char *data, int data_len, @@ -618,6 +634,24 @@ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,  			       struct ecryptfs_message **emsg);  int ecryptfs_init_messaging(void);  void ecryptfs_release_messaging(void); +#else +static inline int ecryptfs_init_messaging(void) +{ +	return 0; +} +static inline void ecryptfs_release_messaging(void) +{ } +static inline int ecryptfs_send_message(char *data, int data_len, +					struct ecryptfs_msg_ctx **msg_ctx) +{ +	return -ENOTCONN; +} +static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, +					     struct ecryptfs_message **emsg) +{ +	return -ENOMSG; +} +#endif  void  ecryptfs_write_header_metadata(char *virt, @@ -655,12 +689,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,  				     size_t offset_in_page, size_t size,  				     struct inode *ecryptfs_inode);  struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); -int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);  int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,  				 size_t *length_size);  int ecryptfs_write_packet_length(char *dest, size_t size,  				 size_t *packet_size_length); +#ifdef CONFIG_ECRYPT_FS_MESSAGING  int ecryptfs_init_ecryptfs_miscdev(void);  void ecryptfs_destroy_ecryptfs_miscdev(void);  int ecryptfs_send_miscdev(char *data, size_t data_size, @@ -669,6 +702,9 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,  void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);  int  ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); +#endif  int ecryptfs_init_kthread(void);  void ecryptfs_destroy_kthread(void);  int ecryptfs_privileged_open(struct file **lower_file, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 53acc9d0c13..63b1f54b6a1 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,7 +199,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)  	struct dentry *ecryptfs_dentry = file->f_path.dentry;  	/* Private value of ecryptfs_dentry allocated in  	 * ecryptfs_lookup() */ -	struct dentry *lower_dentry;  	struct ecryptfs_file_info *file_info;  	mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -222,7 +221,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)  		rc = -ENOMEM;  		goto out;  	} -	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);  	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;  	mutex_lock(&crypt_stat->cs_mutex);  	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e0f07fb6d56..5eab400e259 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -999,8 +999,8 @@ out:  	return rc;  } -int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, -			  struct kstat *stat) +static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, +				 struct kstat *stat)  {  	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;  	int rc = 0; @@ -1021,8 +1021,8 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,  	return rc;  } -int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, -		     struct kstat *stat) +static int 
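The ecryptfs_kernel.h changes follow the usual shape for an optional feature: real declarations when CONFIG_ECRYPT_FS_MESSAGING is enabled, and static inline stubs that fail gracefully when it is not, so callers such as the keystore code build unchanged either way. A generic sketch of the idiom; CONFIG_MYFS_FEATURE and the myfs_* names are placeholders, not real symbols:

#ifdef CONFIG_MYFS_FEATURE
int myfs_msg_init(void);
void myfs_msg_exit(void);
int myfs_msg_send(char *data, int data_len);
#else
static inline int myfs_msg_init(void)
{
	return 0;			/* nothing to set up */
}
static inline void myfs_msg_exit(void)
{ }
static inline int myfs_msg_send(char *data, int data_len)
{
	return -ENOTCONN;		/* feature compiled out */
}
#endif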
ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, +			    struct kstat *stat)  {  	struct kstat lower_stat;  	int rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2333203a120..7d52806c211 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,  	struct ecryptfs_message *msg = NULL;  	char *auth_tok_sig;  	char *payload; -	size_t payload_len; +	size_t payload_len = 0;  	int rc;  	rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -1168,7 +1168,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,  	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);  	if (rc) {  		ecryptfs_printk(KERN_ERR, "Error sending message to " -				"ecryptfsd\n"); +				"ecryptfsd: %d\n", rc);  		goto out;  	}  	rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1202,8 +1202,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,  				  crypt_stat->key_size);  	}  out: -	if (msg) -		kfree(msg); +	kfree(msg);  	return rc;  } @@ -1989,7 +1988,7 @@ pki_encrypt_session_key(struct key *auth_tok_key,  	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);  	if (rc) {  		ecryptfs_printk(KERN_ERR, "Error sending message to " -				"ecryptfsd\n"); +				"ecryptfsd: %d\n", rc);  		goto out;  	}  	rc = ecryptfs_wait_for_response(msg_ctx, &msg); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4e0886c9e5c..e924cf45aad 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -629,6 +629,7 @@ static struct file_system_type ecryptfs_fs_type = {  	.kill_sb = ecryptfs_kill_block_super,  	.fs_flags = 0  }; +MODULE_ALIAS_FS("ecryptfs");  /**   * inode_info_init_once diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 8d7a577ae49..49ff8ea08f1 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -97,8 +97,7 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx)  void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)  {  	list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); -	if (msg_ctx->msg) -		kfree(msg_ctx->msg); +	kfree(msg_ctx->msg);  	msg_ctx->msg = NULL;  	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE;  } @@ -283,7 +282,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,  	int rc;  	rc = ecryptfs_find_daemon_by_euid(&daemon); -	if (rc || !daemon) { +	if (rc) {  		rc = -ENOTCONN;  		goto out;  	} diff --git a/fs/efs/super.c b/fs/efs/super.c index 2002431ef9a..c6f57a74a55 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -33,6 +33,7 @@ static struct file_system_type efs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("efs");  static struct pt_types sgi_pt_types[] = {  	{0x00,		"SGI vh"}, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5e59280d42d..9d976332873 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -1010,6 +1010,7 @@ static struct file_system_type exofs_type = {  	.mount          = exofs_mount,  	.kill_sb        = generic_shutdown_super,  }; +MODULE_ALIAS_FS("exofs");  static int __init init_exofs(void)  { diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8f370e012e6..7cadd823bb3 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -118,7 +118,6 @@ void ext2_free_inode (struct inode * inode)  	 * as writing the quota to disk may need the lock as well.  	 
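The keystore.c and messaging.c hunks above also drop the NULL test in front of kfree(): kfree(NULL) is defined to be a no-op, so the guard is pure noise. For instance (need_reply is a stand-in condition):

struct ecryptfs_message *msg = NULL;

if (need_reply)
	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
/* ... */
kfree(msg);		/* safe whether or not the allocation ever happened */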
*/  	/* Quota is already initialized in iput() */ -	ext2_xattr_delete_inode(inode);  	dquot_free_inode(inode);  	dquot_drop(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c3881e56662..fe60cc1117d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,6 +34,7 @@  #include "ext2.h"  #include "acl.h"  #include "xip.h" +#include "xattr.h"  static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -88,6 +89,7 @@ void ext2_evict_inode(struct inode * inode)  		inode->i_size = 0;  		if (inode->i_blocks)  			ext2_truncate_blocks(inode, 0); +		ext2_xattr_delete_inode(inode);  	}  	invalidate_inode_buffers(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7f68c811402..288534920fe 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1536,6 +1536,7 @@ static struct file_system_type ext2_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ext2");  static int __init init_ext2_fs(void)  { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5546ca225ff..fb5120a5505 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -353,7 +353,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)  	return bdev;  fail: -	ext3_msg(sb, "error: failed to open journal device %s: %ld", +	ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",  		__bdevname(dev, b), PTR_ERR(bdev));  	return NULL; @@ -887,7 +887,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)  	/*todo: use simple_strtoll with >32bit ext3 */  	sb_block = simple_strtoul(options, &options, 0);  	if (*options && *options != ',') { -		ext3_msg(sb, "error: invalid sb specification: %s", +		ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",  		       (char *) *data);  		return 1;  	} @@ -3068,6 +3068,7 @@ static struct file_system_type ext3_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ext3");  static int __init init_ext3_fs(void)  { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba31526..3b83cd60479 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc   */  struct flex_groups { -	atomic_t free_inodes; -	atomic_t free_clusters; -	atomic_t used_dirs; +	atomic64_t	free_clusters; +	atomic_t	free_inodes; +	atomic_t	used_dirs;  };  #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */ @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,  extern int __init ext4_init_pageio(void);  extern void ext4_add_complete_io(ext4_io_end_t *io_end);  extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *);  extern void ext4_free_io_end(ext4_io_end_t *io);  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);  extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 28dd8eeea6a..9c6d06dcef8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,  	unsigned short ext1_ee_len, ext2_ee_len, max_len;  	/* -	 * Make sure that either both extents are uninitialized, or -	 * both are _not_. +	 * Make sure that both extents are initialized. We don't merge +	 * uninitialized extents so that we can be sure that end_io code has +	 * the extent that was written properly split out and conversion to +	 * initialized is trivial.  	 
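The struct flex_groups change widens free_clusters to atomic64_t: on very large filesystems with big flex-group sizes the per-flex free-cluster total can overflow a 32-bit atomic_t, so every updater later in this diff (ext4_fill_flex_info(), mballoc.c, resize.c) moves to the 64-bit accessors. The accessor change in isolation, where fg stands in for &sbi->s_flex_groups[flex_group] and count_clusters/allocated_clusters are illustrative:

struct flex_groups *fg = &sbi->s_flex_groups[flex_group];

atomic64_add(count_clusters, &fg->free_clusters);	/* clusters freed */
atomic64_sub(allocated_clusters, &fg->free_clusters);	/* clusters allocated */
long long free_now = atomic64_read(&fg->free_clusters);/* 64-bit snapshot */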
*/ -	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) +	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))  		return 0;  	if (ext4_ext_is_uninitialized(ex1)) @@ -2923,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle,  {  	ext4_fsblk_t newblock;  	ext4_lblk_t ee_block; -	struct ext4_extent *ex, newex, orig_ex; +	struct ext4_extent *ex, newex, orig_ex, zero_ex;  	struct ext4_extent *ex2 = NULL;  	unsigned int ee_len, depth;  	int err = 0; @@ -2943,6 +2945,10 @@ static int ext4_split_extent_at(handle_t *handle,  	newblock = split - ee_block + ext4_ext_pblock(ex);  	BUG_ON(split < ee_block || split >= (ee_block + ee_len)); +	BUG_ON(!ext4_ext_is_uninitialized(ex) && +	       split_flag & (EXT4_EXT_MAY_ZEROOUT | +			     EXT4_EXT_MARK_UNINIT1 | +			     EXT4_EXT_MARK_UNINIT2));  	err = ext4_ext_get_access(handle, inode, path + depth);  	if (err) @@ -2990,12 +2996,29 @@ static int ext4_split_extent_at(handle_t *handle,  	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);  	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {  		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { -			if (split_flag & EXT4_EXT_DATA_VALID1) +			if (split_flag & EXT4_EXT_DATA_VALID1) {  				err = ext4_ext_zeroout(inode, ex2); -			else +				zero_ex.ee_block = ex2->ee_block; +				zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(ex2)); +				ext4_ext_store_pblock(&zero_ex, +						      ext4_ext_pblock(ex2)); +			} else {  				err = ext4_ext_zeroout(inode, ex); -		} else +				zero_ex.ee_block = ex->ee_block; +				zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(ex)); +				ext4_ext_store_pblock(&zero_ex, +						      ext4_ext_pblock(ex)); +			} +		} else {  			err = ext4_ext_zeroout(inode, &orig_ex); +			zero_ex.ee_block = orig_ex.ee_block; +			zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(&orig_ex)); +			ext4_ext_store_pblock(&zero_ex, +					      ext4_ext_pblock(&orig_ex)); +		}  		if (err)  			goto fix_extent_len; @@ -3003,6 +3026,12 @@ static int ext4_split_extent_at(handle_t *handle,  		ex->ee_len = cpu_to_le16(ee_len);  		ext4_ext_try_to_merge(handle, inode, path, ex);  		err = ext4_ext_dirty(handle, inode, path + path->p_depth); +		if (err) +			goto fix_extent_len; + +		/* update extent status tree */ +		err = ext4_es_zeroout(inode, &zero_ex); +  		goto out;  	} else if (err)  		goto fix_extent_len; @@ -3041,6 +3070,7 @@ static int ext4_split_extent(handle_t *handle,  	int err = 0;  	int uninitialized;  	int split_flag1, flags1; +	int allocated = map->m_len;  	depth = ext_depth(inode);  	ex = path[depth].p_ext; @@ -3060,20 +3090,29 @@ static int ext4_split_extent(handle_t *handle,  				map->m_lblk + map->m_len, split_flag1, flags1);  		if (err)  			goto out; +	} else { +		allocated = ee_len - (map->m_lblk - ee_block);  	} - +	/* +	 * Update path is required because previous ext4_split_extent_at() may +	 * result in split of original leaf or extent zeroout. 
+	 */  	ext4_ext_drop_refs(path);  	path = ext4_ext_find_extent(inode, map->m_lblk, path);  	if (IS_ERR(path))  		return PTR_ERR(path); +	depth = ext_depth(inode); +	ex = path[depth].p_ext; +	uninitialized = ext4_ext_is_uninitialized(ex); +	split_flag1 = 0;  	if (map->m_lblk >= ee_block) { -		split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | -					    EXT4_EXT_DATA_VALID2); -		if (uninitialized) +		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; +		if (uninitialized) {  			split_flag1 |= EXT4_EXT_MARK_UNINIT1; -		if (split_flag & EXT4_EXT_MARK_UNINIT2) -			split_flag1 |= EXT4_EXT_MARK_UNINIT2; +			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | +						     EXT4_EXT_MARK_UNINIT2); +		}  		err = ext4_split_extent_at(handle, inode, path,  				map->m_lblk, split_flag1, flags);  		if (err) @@ -3082,7 +3121,7 @@ static int ext4_split_extent(handle_t *handle,  	ext4_ext_show_leaf(inode, path);  out: -	return err ? err : map->m_len; +	return err ? err : allocated;  }  /* @@ -3137,6 +3176,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,  	ee_block = le32_to_cpu(ex->ee_block);  	ee_len = ext4_ext_get_actual_len(ex);  	allocated = ee_len - (map->m_lblk - ee_block); +	zero_ex.ee_len = 0;  	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3227,13 +3267,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,  	if (EXT4_EXT_MAY_ZEROOUT & split_flag)  		max_zeroout = sbi->s_extent_max_zeroout_kb >> -			inode->i_sb->s_blocksize_bits; +			(inode->i_sb->s_blocksize_bits - 10);  	/* If extent is less than s_max_zeroout_kb, zeroout directly */  	if (max_zeroout && (ee_len <= max_zeroout)) {  		err = ext4_ext_zeroout(inode, ex);  		if (err)  			goto out; +		zero_ex.ee_block = ex->ee_block; +		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); +		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));  		err = ext4_ext_get_access(handle, inode, path + depth);  		if (err) @@ -3292,6 +3335,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,  		err = allocated;  out: +	/* If we have gotten a failure, don't zero out status tree */ +	if (!err) +		err = ext4_es_zeroout(inode, &zero_ex);  	return err ? err : allocated;  } @@ -3374,8 +3420,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,  		"block %llu, max_blocks %u\n", inode->i_ino,  		  (unsigned long long)ee_block, ee_len); -	/* If extent is larger than requested then split is required */ +	/* If extent is larger than requested it is a clear sign that we still +	 * have some extent state machine issues left. So extent_split is still +	 * required. +	 * TODO: Once all related issues will be fixed this situation should be +	 * illegal. 
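The convert-to-initialized hunk fixes a unit error: s_extent_max_zeroout_kb is a limit in KiB, so turning it into a block count means shifting right by (s_blocksize_bits - 10), not by s_blocksize_bits. Worked through for a filesystem with 4 KiB blocks and, say, a 32 KiB limit:

/*
 * block size 4096	=>  s_blocksize_bits = 12
 * limit 32 KiB		=>  s_extent_max_zeroout_kb = 32
 *
 * fixed:	max_zeroout = 32 >> (12 - 10) = 8 blocks
 * before:	max_zeroout = 32 >> 12        = 0, so the zeroout
 *		shortcut never triggered at all
 */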
+	 */  	if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG +		ext4_warning("Inode (%ld) finished: extent logical block %llu," +			     " len %u; IO logical block %llu, len %u\n", +			     inode->i_ino, (unsigned long long)ee_block, ee_len, +			     (unsigned long long)map->m_lblk, map->m_len); +#endif  		err = ext4_split_unwritten_extents(handle, inode, map, path,  						   EXT4_GET_BLOCKS_CONVERT);  		if (err < 0) @@ -3626,6 +3683,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  						 path, map->m_len);  		} else  			err = ret; +		map->m_flags |= EXT4_MAP_MAPPED; +		if (allocated > map->m_len) +			allocated = map->m_len; +		map->m_len = allocated;  		goto out2;  	}  	/* buffered IO case */ @@ -3675,6 +3736,7 @@ out:  					allocated - map->m_len);  		allocated = map->m_len;  	} +	map->m_len = allocated;  	/*  	 * If we have done fallocate with the offset that is already @@ -4106,9 +4168,6 @@ got_allocated_blocks:  			}  		} else {  			BUG_ON(allocated_clusters < reserved_clusters); -			/* We will claim quota for all newly allocated blocks.*/ -			ext4_da_update_reserve_space(inode, allocated_clusters, -							1);  			if (reserved_clusters < allocated_clusters) {  				struct ext4_inode_info *ei = EXT4_I(inode);  				int reservation = allocated_clusters - @@ -4159,6 +4218,15 @@ got_allocated_blocks:  				ei->i_reserved_data_blocks += reservation;  				spin_unlock(&ei->i_block_reservation_lock);  			} +			/* +			 * We will claim quota for all newly allocated blocks. +			 * We're updating the reserved space *after* the +			 * correction above so we do not accidentally free +			 * all the metadata reservation because we might +			 * actually need it later on. +			 */ +			ext4_da_update_reserve_space(inode, allocated_clusters, +							1);  		}  	} @@ -4368,8 +4436,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  	if (len <= EXT_UNINIT_MAX_LEN << blkbits)  		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -	/* Prevent race condition between unwritten */ -	ext4_flush_unwritten_io(inode);  retry:  	while (ret >= 0 && ret < max_blocks) {  		map.m_lblk = map.m_lblk + ret; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b752..fe3337a85ed 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)  static int ext4_es_can_be_merged(struct extent_status *es1,  				 struct extent_status *es2)  { -	if (es1->es_lblk + es1->es_len != es2->es_lblk) +	if (ext4_es_status(es1) != ext4_es_status(es2))  		return 0; -	if (ext4_es_status(es1) != ext4_es_status(es2)) +	if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)  		return 0; -	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && -	    (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) +	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)  		return 0; -	return 1; +	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && +	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) +		return 1; + +	if (ext4_es_is_hole(es1)) +		return 1; + +	/* we need to check delayed extent is without unwritten status */ +	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) +		return 1; + +	return 0;  }  static struct extent_status * @@ -389,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)  	return es;  } +#ifdef ES_AGGRESSIVE_TEST +static void 
ext4_es_insert_extent_ext_check(struct inode *inode, +					    struct extent_status *es) +{ +	struct ext4_ext_path *path = NULL; +	struct ext4_extent *ex; +	ext4_lblk_t ee_block; +	ext4_fsblk_t ee_start; +	unsigned short ee_len; +	int depth, ee_status, es_status; + +	path = ext4_ext_find_extent(inode, es->es_lblk, NULL); +	if (IS_ERR(path)) +		return; + +	depth = ext_depth(inode); +	ex = path[depth].p_ext; + +	if (ex) { + +		ee_block = le32_to_cpu(ex->ee_block); +		ee_start = ext4_ext_pblock(ex); +		ee_len = ext4_ext_get_actual_len(ex); + +		ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; +		es_status = ext4_es_is_unwritten(es) ? 1 : 0; + +		/* +		 * Make sure ex and es are not overlap when we try to insert +		 * a delayed/hole extent. +		 */ +		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { +			if (in_range(es->es_lblk, ee_block, ee_len)) { +				pr_warn("ES insert assertation failed for " +					"inode: %lu we can find an extent " +					"at block [%d/%d/%llu/%c], but we " +					"want to add an delayed/hole extent " +					"[%d/%d/%llu/%llx]\n", +					inode->i_ino, ee_block, ee_len, +					ee_start, ee_status ? 'u' : 'w', +					es->es_lblk, es->es_len, +					ext4_es_pblock(es), ext4_es_status(es)); +			} +			goto out; +		} + +		/* +		 * We don't check ee_block == es->es_lblk, etc. because es +		 * might be a part of whole extent, vice versa. +		 */ +		if (es->es_lblk < ee_block || +		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { +			pr_warn("ES insert assertation failed for inode: %lu " +				"ex_status [%d/%d/%llu/%c] != " +				"es_status [%d/%d/%llu/%c]\n", inode->i_ino, +				ee_block, ee_len, ee_start, +				ee_status ? 'u' : 'w', es->es_lblk, es->es_len, +				ext4_es_pblock(es), es_status ? 'u' : 'w'); +			goto out; +		} + +		if (ee_status ^ es_status) { +			pr_warn("ES insert assertation failed for inode: %lu " +				"ex_status [%d/%d/%llu/%c] != " +				"es_status [%d/%d/%llu/%c]\n", inode->i_ino, +				ee_block, ee_len, ee_start, +				ee_status ? 'u' : 'w', es->es_lblk, es->es_len, +				ext4_es_pblock(es), es_status ? 'u' : 'w'); +		} +	} else { +		/* +		 * We can't find an extent on disk.  So we need to make sure +		 * that we don't want to add an written/unwritten extent. +		 */ +		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { +			pr_warn("ES insert assertation failed for inode: %lu " +				"can't find an extent at block %d but we want " +				"to add an written/unwritten extent " +				"[%d/%d/%llu/%llx]\n", inode->i_ino, +				es->es_lblk, es->es_lblk, es->es_len, +				ext4_es_pblock(es), ext4_es_status(es)); +		} +	} +out: +	if (path) { +		ext4_ext_drop_refs(path); +		kfree(path); +	} +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, +					    struct extent_status *es) +{ +	struct ext4_map_blocks map; +	int retval; + +	/* +	 * Here we call ext4_ind_map_blocks to lookup a block mapping because +	 * 'Indirect' structure is defined in indirect.c.  So we couldn't +	 * access direct/indirect tree from outside.  It is too dirty to define +	 * this function in indirect.c file. +	 */ + +	map.m_lblk = es->es_lblk; +	map.m_len = es->es_len; + +	retval = ext4_ind_map_blocks(NULL, inode, &map, 0); +	if (retval > 0) { +		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { +			/* +			 * We want to add a delayed/hole extent but this +			 * block has been allocated. 
+			 */ +			pr_warn("ES insert assertation failed for inode: %lu " +				"We can find blocks but we want to add a " +				"delayed/hole extent [%d/%d/%llu/%llx]\n", +				inode->i_ino, es->es_lblk, es->es_len, +				ext4_es_pblock(es), ext4_es_status(es)); +			return; +		} else if (ext4_es_is_written(es)) { +			if (retval != es->es_len) { +				pr_warn("ES insert assertation failed for " +					"inode: %lu retval %d != es_len %d\n", +					inode->i_ino, retval, es->es_len); +				return; +			} +			if (map.m_pblk != ext4_es_pblock(es)) { +				pr_warn("ES insert assertation failed for " +					"inode: %lu m_pblk %llu != " +					"es_pblk %llu\n", +					inode->i_ino, map.m_pblk, +					ext4_es_pblock(es)); +				return; +			} +		} else { +			/* +			 * We don't need to check unwritten extent because +			 * indirect-based file doesn't have it. +			 */ +			BUG_ON(1); +		} +	} else if (retval == 0) { +		if (ext4_es_is_written(es)) { +			pr_warn("ES insert assertation failed for inode: %lu " +				"We can't find the block but we want to add " +				"an written extent [%d/%d/%llu/%llx]\n", +				inode->i_ino, es->es_lblk, es->es_len, +				ext4_es_pblock(es), ext4_es_status(es)); +			return; +		} +	} +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, +					       struct extent_status *es) +{ +	/* +	 * We don't need to worry about the race condition because +	 * caller takes i_data_sem locking. +	 */ +	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) +		ext4_es_insert_extent_ext_check(inode, es); +	else +		ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, +					       struct extent_status *es) +{ +} +#endif +  static int __es_insert_extent(struct inode *inode, struct extent_status *newes)  {  	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -471,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,  	ext4_es_store_status(&newes, status);  	trace_ext4_es_insert_extent(inode, &newes); +	ext4_es_insert_extent_check(inode, &newes); +  	write_lock(&EXT4_I(inode)->i_es_lock);  	err = __es_remove_extent(inode, lblk, end);  	if (err != 0) @@ -669,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,  	return err;  } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ +	ext4_lblk_t  ee_block; +	ext4_fsblk_t ee_pblock; +	unsigned int ee_len; + +	ee_block  = le32_to_cpu(ex->ee_block); +	ee_len    = ext4_ext_get_actual_len(ex); +	ee_pblock = ext4_ext_pblock(ex); + +	if (ee_len == 0) +		return 0; + +	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, +				     EXTENT_STATUS_WRITTEN); +} +  static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)  {  	struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969d..d8e2d4dc311 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -21,6 +21,12 @@  #endif  /* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
+ */ +#define ES_AGGRESSIVE_TEST__ + +/*   * These flags live in the high bits of extent_status.es_pblk   */  #define EXTENT_STATUS_WRITTEN	(1ULL << 63) @@ -33,6 +39,8 @@  				 EXTENT_STATUS_DELAYED | \  				 EXTENT_STATUS_HOLE) +struct ext4_extent; +  struct extent_status {  	struct rb_node rb_node;  	ext4_lblk_t es_lblk;	/* first logical block extent covers */ @@ -58,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,  					struct extent_status *es);  extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,  				 struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);  static inline int ext4_es_is_written(struct extent_status *es)  { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075d..6c5bb8d993f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return:  }  struct orlov_stats { +	__u64 free_clusters;  	__u32 free_inodes; -	__u32 free_clusters;  	__u32 used_dirs;  }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,  	if (flex_size > 1) {  		stats->free_inodes = atomic_read(&flex_group[g].free_inodes); -		stats->free_clusters = atomic_read(&flex_group[g].free_clusters); +		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);  		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);  		return;  	} diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b505a145a59..a04183127ef 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1539,9 +1539,9 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,  		blk = *i_data;  		if (level > 0) {  			ext4_lblk_t first2; -			bh = sb_bread(inode->i_sb, blk); +			bh = sb_bread(inode->i_sb, le32_to_cpu(blk));  			if (!bh) { -				EXT4_ERROR_INODE_BLOCK(inode, blk, +				EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),  						       "Read failure");  				return -EIO;  			} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9ea0cde3fa9..b3a5213bc73 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode)  	trace_ext4_evict_inode(inode); -	ext4_ioend_wait(inode); -  	if (inode->i_nlink) {  		/*  		 * When journalling data dirty buffers are tracked only in the @@ -207,7 +205,8 @@ void ext4_evict_inode(struct inode *inode)  		 * don't use page cache.  		 
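Two details in the hunks above are easy to miss. First, the guard is spelled ES_AGGRESSIVE_TEST__ on purpose: the trailing underscores keep ES_AGGRESSIVE_TEST itself undefined, so the expensive self-checks in extents_status.c and inode.c stay compiled out until a developer renames the macro. Second, the indirect.c change converts the on-disk block number, which is stored little-endian, before handing it to sb_bread(); on a big-endian host the raw __le32 value would address the wrong block. An illustrative helper showing that fix (read_indirect_block is a made-up name):

static int read_indirect_block(struct inode *inode, __le32 *i_data,
			       struct buffer_head **bhp)
{
	ext4_fsblk_t blk = le32_to_cpu(*i_data);	/* on-disk value is little-endian */
	struct buffer_head *bh = sb_bread(inode->i_sb, blk);

	if (!bh)
		return -EIO;
	*bhp = bh;
	return 0;
}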
*/  		if (ext4_should_journal_data(inode) && -		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { +		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && +		    inode->i_ino != EXT4_JOURNAL_INO) {  			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;  			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -216,6 +215,7 @@ void ext4_evict_inode(struct inode *inode)  			filemap_write_and_wait(&inode->i_data);  		}  		truncate_inode_pages(&inode->i_data, 0); +		ext4_ioend_shutdown(inode);  		goto no_delete;  	} @@ -225,6 +225,7 @@ void ext4_evict_inode(struct inode *inode)  	if (ext4_should_order_data(inode))  		ext4_begin_ordered_truncate(inode, 0);  	truncate_inode_pages(&inode->i_data, 0); +	ext4_ioend_shutdown(inode);  	if (is_bad_inode(inode))  		goto no_delete; @@ -482,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,  	return num;  } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, +				       struct inode *inode, +				       struct ext4_map_blocks *es_map, +				       struct ext4_map_blocks *map, +				       int flags) +{ +	int retval; + +	map->m_flags = 0; +	/* +	 * There is a race window that the result is not the same. +	 * e.g. xfstests #223 when dioread_nolock enables.  The reason +	 * is that we lookup a block mapping in extent status tree with +	 * out taking i_data_sem.  So at the time the unwritten extent +	 * could be converted. +	 */ +	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) +		down_read((&EXT4_I(inode)->i_data_sem)); +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { +		retval = ext4_ext_map_blocks(handle, inode, map, flags & +					     EXT4_GET_BLOCKS_KEEP_SIZE); +	} else { +		retval = ext4_ind_map_blocks(handle, inode, map, flags & +					     EXT4_GET_BLOCKS_KEEP_SIZE); +	} +	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) +		up_read((&EXT4_I(inode)->i_data_sem)); +	/* +	 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag +	 * because it shouldn't be marked in es_map->m_flags. +	 */ +	map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + +	/* +	 * We don't check m_len because extent will be collpased in status +	 * tree.  So the m_len might not equal. +	 */ +	if (es_map->m_lblk != map->m_lblk || +	    es_map->m_flags != map->m_flags || +	    es_map->m_pblk != map->m_pblk) { +		printk("ES cache assertation failed for inode: %lu " +		       "es_cached ex [%d/%d/%llu/%x] != " +		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n", +		       inode->i_ino, es_map->m_lblk, es_map->m_len, +		       es_map->m_pblk, es_map->m_flags, map->m_lblk, +		       map->m_len, map->m_pblk, map->m_flags, +		       retval, flags); +	} +} +#endif /* ES_AGGRESSIVE_TEST */ +  /*   * The ext4_map_blocks() function tries to look up the requested blocks,   * and returns if the blocks are already mapped. 
@@ -509,6 +562,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  {  	struct extent_status es;  	int retval; +#ifdef ES_AGGRESSIVE_TEST +	struct ext4_map_blocks orig_map; + +	memcpy(&orig_map, map, sizeof(*map)); +#endif  	map->m_flags = 0;  	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +589,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		} else {  			BUG_ON(1);  		} +#ifdef ES_AGGRESSIVE_TEST +		ext4_map_blocks_es_recheck(handle, inode, map, +					   &orig_map, flags); +#endif  		goto found;  	} @@ -551,6 +613,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		int ret;  		unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST +		if (retval != map->m_len) { +			printk("ES len assertation failed for inode: %lu " +			       "retval %d != map->m_len %d " +			       "in %s (lookup)\n", inode->i_ino, retval, +			       map->m_len, __func__); +		} +#endif +  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;  		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +714,24 @@ found:  		int ret;  		unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST +		if (retval != map->m_len) { +			printk("ES len assertation failed for inode: %lu " +			       "retval %d != map->m_len %d " +			       "in %s (allocation)\n", inode->i_ino, retval, +			       map->m_len, __func__); +		} +#endif + +		/* +		 * If the extent has been zeroed out, we don't need to update +		 * extent status tree. +		 */ +		if ((flags & EXT4_GET_BLOCKS_PRE_IO) && +		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) { +			if (ext4_es_is_written(&es)) +				goto has_zeroout; +		}  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;  		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -655,6 +744,7 @@ found:  			retval = ret;  	} +has_zeroout:  	up_write((&EXT4_I(inode)->i_data_sem));  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {  		int ret = check_block_validity(inode, map); @@ -1216,6 +1306,55 @@ static int ext4_journalled_write_end(struct file *file,  }  /* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ +	int retries = 0; +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	struct ext4_inode_info *ei = EXT4_I(inode); +	unsigned int md_needed; +	ext4_lblk_t save_last_lblock; +	int save_len; + +	/* +	 * recalculate the amount of metadata blocks to reserve +	 * in order to allocate nrblocks +	 * worse case is one extent per block +	 */ +repeat: +	spin_lock(&ei->i_block_reservation_lock); +	/* +	 * ext4_calc_metadata_amount() has side effects, which we have +	 * to be prepared undo if we fail to claim space. +	 */ +	save_len = ei->i_da_metadata_calc_len; +	save_last_lblock = ei->i_da_metadata_calc_last_lblock; +	md_needed = EXT4_NUM_B2C(sbi, +				 ext4_calc_metadata_amount(inode, lblock)); +	trace_ext4_da_reserve_space(inode, md_needed); + +	/* +	 * We do still charge estimated metadata to the sb though; +	 * we cannot afford to run out of free blocks. 
+	 */ +	if (ext4_claim_free_clusters(sbi, md_needed, 0)) { +		ei->i_da_metadata_calc_len = save_len; +		ei->i_da_metadata_calc_last_lblock = save_last_lblock; +		spin_unlock(&ei->i_block_reservation_lock); +		if (ext4_should_retry_alloc(inode->i_sb, &retries)) { +			cond_resched(); +			goto repeat; +		} +		return -ENOSPC; +	} +	ei->i_reserved_meta_blocks += md_needed; +	spin_unlock(&ei->i_block_reservation_lock); + +	return 0;       /* success */ +} + +/*   * Reserve a single cluster located at lblock   */  static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1263,7 +1402,7 @@ repeat:  		ei->i_da_metadata_calc_last_lblock = save_last_lblock;  		spin_unlock(&ei->i_block_reservation_lock);  		if (ext4_should_retry_alloc(inode->i_sb, &retries)) { -			yield(); +			cond_resched();  			goto repeat;  		}  		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); @@ -1768,6 +1907,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  	struct extent_status es;  	int retval;  	sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST +	struct ext4_map_blocks orig_map; + +	memcpy(&orig_map, map, sizeof(*map)); +#endif  	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))  		invalid_block = ~0; @@ -1809,6 +1953,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  		else  			BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST +		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif  		return retval;  	} @@ -1843,8 +1990,11 @@ add_delayed:  		 * XXX: __block_prepare_write() unmaps passed block,  		 * is it OK?  		 */ -		/* If the block was allocated from previously allocated cluster, -		 * then we dont need to reserve it again. */ +		/* +		 * If the block was allocated from previously allocated cluster, +		 * then we don't need to reserve it again. However we still need +		 * to reserve metadata for every block we're going to write. +		 */  		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {  			ret = ext4_da_reserve_space(inode, iblock);  			if (ret) { @@ -1852,6 +2002,13 @@ add_delayed:  				retval = ret;  				goto out_unlock;  			} +		} else { +			ret = ext4_da_reserve_metadata(inode, iblock); +			if (ret) { +				/* not enough space to reserve */ +				retval = ret; +				goto out_unlock; +			}  		}  		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -1873,6 +2030,15 @@ add_delayed:  		int ret;  		unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST +		if (retval != map->m_len) { +			printk("ES len assertation failed for inode: %lu " +			       "retval %d != map->m_len %d " +			       "in %s (lookup)\n", inode->i_ino, retval, +			       map->m_len, __func__); +		} +#endif +  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;  		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2908,8 +3074,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)  	trace_ext4_releasepage(page); -	WARN_ON(PageChecked(page)); -	if (!page_has_buffers(page)) +	/* Page has dirty journalled data -> cannot release */ +	if (PageChecked(page))  		return 0;  	if (journal)  		return jbd2_journal_try_to_free_buffers(journal, page, wait); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe..ee6614bdb63 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,  	if (sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group = ext4_flex_group(sbi,  							  ac->ac_b_ex.fe_group); -		atomic_sub(ac->ac_b_ex.fe_len, -			   &sbi->s_flex_groups[flex_group].free_clusters); +		atomic64_sub(ac->ac_b_ex.fe_len, +			     &sbi->s_flex_groups[flex_group].free_clusters);  	}  	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3692,11 +3692,7 @@ repeat:  	if (free < needed && busy) {  		busy = 0;  		ext4_unlock_group(sb, group); -		/* -		 * Yield the CPU here so that we don't get soft lockup -		 * in non preempt case. -		 */ -		yield(); +		cond_resched();  		goto repeat;  	} @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,  			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {  			/* let others to free the space */ -			yield(); +			cond_resched();  			ar->len = ar->len >> 1;  		}  		if (!ar->len) { @@ -4464,7 +4460,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,  	struct buffer_head *bitmap_bh = NULL;  	struct super_block *sb = inode->i_sb;  	struct ext4_group_desc *gdp; -	unsigned long freed = 0;  	unsigned int overflow;  	ext4_grpblk_t bit;  	struct buffer_head *gd_bh; @@ -4666,14 +4661,12 @@ do_more:  	if (sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group = ext4_flex_group(sbi, block_group); -		atomic_add(count_clusters, -			   &sbi->s_flex_groups[flex_group].free_clusters); +		atomic64_add(count_clusters, +			     &sbi->s_flex_groups[flex_group].free_clusters);  	}  	ext4_mb_unload_buddy(&e4b); -	freed += count; -  	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))  		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); @@ -4811,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,  	if (sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group = ext4_flex_group(sbi, block_group); -		atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), -			   &sbi->s_flex_groups[flex_group].free_clusters); +		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), +			     &sbi->s_flex_groups[flex_group].free_clusters);  	}  	ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4e81d47aa8c..33e1c086858 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@   */  static inline int  get_ext_path(struct inode *inode, ext4_lblk_t lblock, -		struct ext4_ext_path **path) +		struct ext4_ext_path **orig_path)  {  	int ret = 0; +	struct ext4_ext_path *path; -	*path = ext4_ext_find_extent(inode, lblock, *path); -	if (IS_ERR(*path)) { -		ret = PTR_ERR(*path); -		*path = NULL; -	} else if ((*path)[ext_depth(inode)].p_ext == NULL) +	path = ext4_ext_find_extent(inode, lblock, *orig_path); +	if (IS_ERR(path)) +		ret = PTR_ERR(path); +	else if (path[ext_depth(inode)].p_ext == NULL)  		ret = -ENODATA; +	else +		*orig_path = path;  	return ret;  } @@ -611,24 +613,25 @@ 
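Three retry loops in the hunks above (ext4_da_reserve_space() and two sites in mballoc.c) swap yield() for cond_resched(). cond_resched() reschedules only when the scheduler actually needs the CPU back, which is the polite way to wait inside a retry loop and avoids the soft-lockup reports the old yield() calls were there to work around. The resulting loop shape, with try_to_claim_space standing in for the real claim helper:

int retries = 0;

while (try_to_claim_space(sbi, needed) != 0) {
	if (!ext4_should_retry_alloc(sb, &retries))
		return -ENOSPC;
	cond_resched();		/* give up the CPU only if something is waiting */
}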
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,  {  	struct ext4_ext_path *path = NULL;  	struct ext4_extent *ext; +	int ret = 0;  	ext4_lblk_t last = from + count;  	while (from < last) {  		*err = get_ext_path(inode, from, &path);  		if (*err) -			return 0; +			goto out;  		ext = path[ext_depth(inode)].p_ext; -		if (!ext) { -			ext4_ext_drop_refs(path); -			return 0; -		} -		if (uninit != ext4_ext_is_uninitialized(ext)) { -			ext4_ext_drop_refs(path); -			return 0; -		} +		if (uninit != ext4_ext_is_uninitialized(ext)) +			goto out;  		from += ext4_ext_get_actual_len(ext);  		ext4_ext_drop_refs(path);  	} -	return 1; +	ret = 1; +out: +	if (path) { +		ext4_ext_drop_refs(path); +		kfree(path); +	} +	return ret;  }  /** @@ -666,6 +669,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,  	int replaced_count = 0;  	int dext_alen; +	*err = ext4_es_remove_extent(orig_inode, from, count); +	if (*err) +		goto out; + +	*err = ext4_es_remove_extent(donor_inode, from, count); +	if (*err) +		goto out; +  	/* Get the original extent for the block "orig_off" */  	*err = get_ext_path(orig_inode, orig_off, &orig_path);  	if (*err) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ec..047a6de04a0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void)  	kmem_cache_destroy(io_page_cachep);  } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode)  {  	wait_queue_head_t *wq = ext4_ioend_wq(inode);  	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +	/* +	 * We need to make sure the work structure is finished being +	 * used before we let the inode get destroyed. 
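The mext_check_coverage() rework above also plugs a leak: ext4_ext_find_extent() hands back a kmalloc'ed path array whose entries hold buffer_head references, and the old early returns dropped the references but never freed the array itself. The canonical cleanup, now done once at the out label, looks like this in isolation:

struct ext4_ext_path *path;

path = ext4_ext_find_extent(inode, lblk, NULL);
if (IS_ERR(path))
	return PTR_ERR(path);

/* ... inspect path[ext_depth(inode)].p_ext ... */

ext4_ext_drop_refs(path);	/* release the buffer_head references */
kfree(path);			/* free the array allocated by find_extent */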
+	 */ +	if (work_pending(&EXT4_I(inode)->i_unwritten_work)) +		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);  }  static void put_io_page(struct ext4_io_page *io_page) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb9..c169477a62c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb,  	    sbi->s_log_groups_per_flex) {  		ext4_group_t flex_group;  		flex_group = ext4_flex_group(sbi, group_data[0].group); -		atomic_add(EXT4_NUM_B2C(sbi, free_blocks), -			   &sbi->s_flex_groups[flex_group].free_clusters); +		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), +			     &sbi->s_flex_groups[flex_group].free_clusters);  		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,  			   &sbi->s_flex_groups[flex_group].free_inodes);  	} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e6c8783619..5d6d5357812 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -90,6 +90,8 @@ static struct file_system_type ext2_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2");  #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)  #else  #define IS_EXT2_SB(sb) (0) @@ -104,6 +106,8 @@ static struct file_system_type ext3_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3");  #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)  #else  #define IS_EXT3_SB(sb) (0) @@ -1923,8 +1927,8 @@ static int ext4_fill_flex_info(struct super_block *sb)  		flex_group = ext4_flex_group(sbi, i);  		atomic_add(ext4_free_inodes_count(sb, gdp),  			   &sbi->s_flex_groups[flex_group].free_inodes); -		atomic_add(ext4_free_group_clusters(sb, gdp), -			   &sbi->s_flex_groups[flex_group].free_clusters); +		atomic64_add(ext4_free_group_clusters(sb, gdp), +			     &sbi->s_flex_groups[flex_group].free_clusters);  		atomic_add(ext4_used_dirs_count(sb, gdp),  			   &sbi->s_flex_groups[flex_group].used_dirs);  	} @@ -5152,7 +5156,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb)  		return 0;  	return 1;  } -MODULE_ALIAS("ext2");  #else  static inline void register_as_ext2(void) { }  static inline void unregister_as_ext2(void) { } @@ -5185,7 +5188,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb)  		return 0;  	return 1;  } -MODULE_ALIAS("ext3");  #else  static inline void register_as_ext3(void) { }  static inline void unregister_as_ext3(void) { } @@ -5199,6 +5201,7 @@ static struct file_system_type ext4_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ext4");  static int __init ext4_init_feat_adverts(void)  { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c117649a03..fea6e582a2e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -687,6 +687,7 @@ static struct file_system_type f2fs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("f2fs");  static int __init init_inodecache(void)  { diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index e2cfda94a28..081b759cff8 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -668,6 +668,7 @@ static struct file_system_type msdos_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("msdos");  static int __init init_msdos_fs(void)  { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index ac959d655e7..2da952036a3 100644 --- a/fs/fat/namei_vfat.c +++ 
b/fs/fat/namei_vfat.c @@ -1073,6 +1073,7 @@ static struct file_system_type vfat_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("vfat");  static int __init init_vfat_fs(void)  { diff --git a/fs/filesystems.c b/fs/filesystems.c index da165f6adcb..92567d95ba6 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -273,7 +273,7 @@ struct file_system_type *get_fs_type(const char *name)  	int len = dot ? dot - name : strlen(name);  	fs = __get_fs_type(name, len); -	if (!fs && (request_module("%.*s", len, name) == 0)) +	if (!fs && (request_module("fs-%.*s", len, name) == 0))  		fs = __get_fs_type(name, len);  	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index fed2c8afb3a..e37eb274e49 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -52,7 +52,6 @@ MODULE_AUTHOR("Christoph Hellwig");  MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver");  MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */  static void		vxfs_put_super(struct super_block *); @@ -258,6 +257,8 @@ static struct file_system_type vxfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ +MODULE_ALIAS("vxfs");  static int __init  vxfs_init(void) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b7978b9f75e..a0b0855d00a 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -341,6 +341,7 @@ static struct file_system_type fuse_ctl_fs_type = {  	.mount		= fuse_ctl_mount,  	.kill_sb	= fuse_ctl_kill_sb,  }; +MODULE_ALIAS_FS("fusectl");  int __init fuse_ctl_init(void)  { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index df00993ed10..137185c3884 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1117,6 +1117,7 @@ static struct file_system_type fuse_fs_type = {  	.mount		= fuse_mount,  	.kill_sb	= fuse_kill_sb_anon,  }; +MODULE_ALIAS_FS("fuse");  #ifdef CONFIG_BLOCK  static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, @@ -1146,6 +1147,7 @@ static struct file_system_type fuseblk_fs_type = {  	.kill_sb	= fuse_kill_sb_blk,  	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,  }; +MODULE_ALIAS_FS("fuseblk");  static inline int register_fuseblk(void)  { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 019f45e4509..d79c2dadc53 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -923,8 +923,11 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)  		cmd = F_SETLK;  		fl->fl_type = F_UNLCK;  	} -	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) +	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { +		if (fl->fl_type == F_UNLCK) +			posix_lock_file_wait(file, fl);  		return -EIO; +	}  	if (IS_GETLK(cmd))  		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);  	else if (fl->fl_type == F_UNLCK) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 156e42ec84e..5c29216e9cc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -588,6 +588,7 @@ struct lm_lockstruct {  	struct dlm_lksb ls_control_lksb; /* control_lock */  	char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */  	struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ +	char *ls_lvb_bits;  	spinlock_t ls_recover_spin; /* protects following fields */  	unsigned long ls_recover_flags; /* DFL_ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9802de0f85e..c8423d6de6c 100644 --- a/fs/gfs2/lock_dlm.c +++ 
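The fs/filesystems.c change above is what all of the MODULE_ALIAS_FS() additions throughout this diff pair up with: get_fs_type() now requests the module under the alias "fs-<name>" instead of the bare name, so a mount attempt can only auto-load modules that explicitly declare themselves as filesystems. MODULE_ALIAS_FS("foo") simply expands to MODULE_ALIAS("fs-foo"). A hypothetical filesystem module would therefore register itself like this (examplefs and examplefs_mount are made-up names):

static struct file_system_type example_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("examplefs");	/* emits the modalias "fs-examplefs" */

Where a bare alias already existed (vxfs here, and the ext2/ext3 names provided by ext4's super.c) it is kept alongside the new fs- prefixed one for compatibility.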
b/fs/gfs2/lock_dlm.c @@ -483,12 +483,8 @@ static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,  static int all_jid_bits_clear(char *lvb)  { -	int i; -	for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { -		if (lvb[i]) -			return 0; -	} -	return 1; +	return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, +			GDLM_LVB_SIZE - JID_BITMAP_OFFSET);  }  static void sync_wait_cb(void *arg) @@ -580,7 +576,6 @@ static void gfs2_control_func(struct work_struct *work)  {  	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);  	struct lm_lockstruct *ls = &sdp->sd_lockstruct; -	char lvb_bits[GDLM_LVB_SIZE];  	uint32_t block_gen, start_gen, lvb_gen, flags;  	int recover_set = 0;  	int write_lvb = 0; @@ -634,7 +629,7 @@ static void gfs2_control_func(struct work_struct *work)  		return;  	} -	control_lvb_read(ls, &lvb_gen, lvb_bits); +	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);  	spin_lock(&ls->ls_recover_spin);  	if (block_gen != ls->ls_recover_block || @@ -664,10 +659,10 @@ static void gfs2_control_func(struct work_struct *work)  			ls->ls_recover_result[i] = 0; -			if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) +			if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET))  				continue; -			__clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); +			__clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);  			write_lvb = 1;  		}  	} @@ -691,7 +686,7 @@ static void gfs2_control_func(struct work_struct *work)  				continue;  			if (ls->ls_recover_submit[i] < start_gen) {  				ls->ls_recover_submit[i] = 0; -				__set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); +				__set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);  			}  		}  		/* even if there are no bits to set, we need to write the @@ -705,7 +700,7 @@ static void gfs2_control_func(struct work_struct *work)  	spin_unlock(&ls->ls_recover_spin);  	if (write_lvb) { -		control_lvb_write(ls, start_gen, lvb_bits); +		control_lvb_write(ls, start_gen, ls->ls_lvb_bits);  		flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;  	} else {  		flags = DLM_LKF_CONVERT; @@ -725,7 +720,7 @@ static void gfs2_control_func(struct work_struct *work)  	 */  	for (i = 0; i < recover_size; i++) { -		if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { +		if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) {  			fs_info(sdp, "recover generation %u jid %d\n",  				start_gen, i);  			gfs2_recover_set(sdp, i); @@ -758,7 +753,6 @@ static void gfs2_control_func(struct work_struct *work)  static int control_mount(struct gfs2_sbd *sdp)  {  	struct lm_lockstruct *ls = &sdp->sd_lockstruct; -	char lvb_bits[GDLM_LVB_SIZE];  	uint32_t start_gen, block_gen, mount_gen, lvb_gen;  	int mounted_mode;  	int retries = 0; @@ -857,7 +851,7 @@ locks_done:  	 * lvb_gen will be non-zero.  	 */ -	control_lvb_read(ls, &lvb_gen, lvb_bits); +	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);  	if (lvb_gen == 0xFFFFFFFF) {  		/* special value to force mount attempts to fail */ @@ -887,7 +881,7 @@ locks_done:  	 * and all lvb bits to be clear (no pending journal recoveries.)  	 
*/ -	if (!all_jid_bits_clear(lvb_bits)) { +	if (!all_jid_bits_clear(ls->ls_lvb_bits)) {  		/* journals need recovery, wait until all are clear */  		fs_info(sdp, "control_mount wait for journal recovery\n");  		goto restart; @@ -949,7 +943,6 @@ static int dlm_recovery_wait(void *word)  static int control_first_done(struct gfs2_sbd *sdp)  {  	struct lm_lockstruct *ls = &sdp->sd_lockstruct; -	char lvb_bits[GDLM_LVB_SIZE];  	uint32_t start_gen, block_gen;  	int error; @@ -991,8 +984,8 @@ restart:  	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));  	spin_unlock(&ls->ls_recover_spin); -	memset(lvb_bits, 0, sizeof(lvb_bits)); -	control_lvb_write(ls, start_gen, lvb_bits); +	memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); +	control_lvb_write(ls, start_gen, ls->ls_lvb_bits);  	error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);  	if (error) @@ -1022,6 +1015,12 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,  	uint32_t old_size, new_size;  	int i, max_jid; +	if (!ls->ls_lvb_bits) { +		ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); +		if (!ls->ls_lvb_bits) +			return -ENOMEM; +	} +  	max_jid = 0;  	for (i = 0; i < num_slots; i++) {  		if (max_jid < slots[i].slot - 1) @@ -1057,6 +1056,7 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,  static void free_recover_size(struct lm_lockstruct *ls)  { +	kfree(ls->ls_lvb_bits);  	kfree(ls->ls_recover_submit);  	kfree(ls->ls_recover_result);  	ls->ls_recover_submit = NULL; @@ -1205,6 +1205,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)  	ls->ls_recover_size = 0;  	ls->ls_recover_submit = NULL;  	ls->ls_recover_result = NULL; +	ls->ls_lvb_bits = NULL;  	error = set_recover_size(sdp, NULL, 0);  	if (error) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1b612be4b87..60ede2a0f43 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -20,6 +20,7 @@  #include <linux/gfs2_ondisk.h>  #include <linux/quotaops.h>  #include <linux/lockdep.h> +#include <linux/module.h>  #include "gfs2.h"  #include "incore.h" @@ -1425,6 +1426,7 @@ struct file_system_type gfs2_fs_type = {  	.kill_sb = gfs2_kill_sb,  	.owner = THIS_MODULE,  }; +MODULE_ALIAS_FS("gfs2");  struct file_system_type gfs2meta_fs_type = {  	.name = "gfs2meta", @@ -1432,4 +1434,4 @@ struct file_system_type gfs2meta_fs_type = {  	.mount = gfs2_mount_meta,  	.owner = THIS_MODULE,  }; - +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index d1f51fd73f8..5a51265a434 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -576,7 +576,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)  	RB_CLEAR_NODE(&ip->i_res->rs_node);  out:  	up_write(&ip->i_rw_mutex); -	return 0; +	return error;  }  static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -1181,12 +1181,9 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,  			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)  {  	struct super_block *sb = sdp->sd_vfs; -	struct block_device *bdev = sb->s_bdev; -	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / -					   bdev_logical_block_size(sb->s_bdev);  	u64 blk;  	sector_t start = 0; -	sector_t nr_sects = 0; +	sector_t nr_blks = 0;  	int rv;  	unsigned int x;  	u32 trimmed = 0; @@ -1206,35 +1203,34 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,  		if (diff == 0)  			continue;  		blk = offset + ((bi->bi_start + x) * GFS2_NBBY); -		blk *= sects_per_blk; /* convert to sectors */  		while(diff) {  			if (diff & 1) { 
-				if (nr_sects == 0) +				if (nr_blks == 0)  					goto start_new_extent; -				if ((start + nr_sects) != blk) { -					if (nr_sects >= minlen) { -						rv = blkdev_issue_discard(bdev, -							start, nr_sects, +				if ((start + nr_blks) != blk) { +					if (nr_blks >= minlen) { +						rv = sb_issue_discard(sb, +							start, nr_blks,  							GFP_NOFS, 0);  						if (rv)  							goto fail; -						trimmed += nr_sects; +						trimmed += nr_blks;  					} -					nr_sects = 0; +					nr_blks = 0;  start_new_extent:  					start = blk;  				} -				nr_sects += sects_per_blk; +				nr_blks++;  			}  			diff >>= 2; -			blk += sects_per_blk; +			blk++;  		}  	} -	if (nr_sects >= minlen) { -		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); +	if (nr_blks >= minlen) { +		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);  		if (rv)  			goto fail; -		trimmed += nr_sects; +		trimmed += nr_blks;  	}  	if (ptrimmed)  		*ptrimmed = trimmed; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index e93ddaadfd1..bbaaa8a4ee6 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -466,6 +466,7 @@ static struct file_system_type hfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("hfs");  static void hfs_init_once(void *p)  { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 974c26f96fa..7b87284e46d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -654,6 +654,7 @@ static struct file_system_type hfsplus_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("hfsplus");  static void hfsplus_init_once(void *p)  { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fbabb906066..0f6e52d22b8 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -845,15 +845,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)  		return err;  	if ((attr->ia_valid & ATTR_SIZE) && -	    attr->ia_size != i_size_read(inode)) { -		int error; - -		error = inode_newsize_ok(inode, attr->ia_size); -		if (error) -			return error; - +	    attr->ia_size != i_size_read(inode))  		truncate_setsize(inode, attr->ia_size); -	}  	setattr_copy(inode, attr);  	mark_inode_dirty(inode); @@ -993,6 +986,7 @@ static struct file_system_type hostfs_type = {  	.kill_sb	= hostfs_kill_sb,  	.fs_flags 	= 0,  }; +MODULE_ALIAS_FS("hostfs");  static int __init init_hostfs(void)  { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a3076228523..a0617e70695 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -688,6 +688,7 @@ static struct file_system_type hpfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("hpfs");  static int __init init_hpfs_fs(void)  { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 74f55703be4..126d3c2e2de 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -748,6 +748,7 @@ static struct file_system_type hppfs_type = {  	.kill_sb	= kill_anon_super,  	.fs_flags 	= 0,  }; +MODULE_ALIAS_FS("hppfs");  static int __init init_hppfs(void)  { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7f94e0cbc69..84e3d856e91 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -896,6 +896,7 @@ static struct file_system_type hugetlbfs_fs_type = {  	.mount		= hugetlbfs_mount,  	.kill_sb	= kill_litter_super,  }; +MODULE_ALIAS_FS("hugetlbfs");  static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; diff --git a/fs/internal.h b/fs/internal.h index 507141fceb9..4be78237d89 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ 
-125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool);   * dcache.c   */  extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 67ce52507d7..d9b8aebdeb2 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1556,6 +1556,8 @@ static struct file_system_type iso9660_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("iso9660"); +MODULE_ALIAS("iso9660");  static int __init init_iso9660_fs(void)  { @@ -1593,5 +1595,3 @@ static void __exit exit_iso9660_fs(void)  module_init(init_iso9660_fs)  module_exit(exit_iso9660_fs)  MODULE_LICENSE("GPL"); -/* Actual filesystem name is iso9660, as requested in filesystems.c */ -MODULE_ALIAS("iso9660"); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b..325bc019ed8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ out:  void jbd2_journal_set_triggers(struct buffer_head *bh,  			       struct jbd2_buffer_trigger_type *type)  { -	struct journal_head *jh = bh2jh(bh); +	struct journal_head *jh = jbd2_journal_grab_journal_head(bh); +	if (WARN_ON(!jh)) +		return;  	jh->b_triggers = type; +	jbd2_journal_put_journal_head(jh);  }  void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)  {  	transaction_t *transaction = handle->h_transaction;  	journal_t *journal = transaction->t_journal; -	struct journal_head *jh = bh2jh(bh); +	struct journal_head *jh;  	int ret = 0; -	jbd_debug(5, "journal_head %p\n", jh); -	JBUFFER_TRACE(jh, "entry");  	if (is_handle_aborted(handle))  		goto out; -	if (!buffer_jbd(bh)) { +	jh = jbd2_journal_grab_journal_head(bh); +	if (!jh) {  		ret = -EUCLEAN;  		goto out;  	} +	jbd_debug(5, "journal_head %p\n", jh); +	JBUFFER_TRACE(jh, "entry");  	jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)  	spin_unlock(&journal->j_list_lock);  out_unlock_bh:  	jbd_unlock_bh_state(bh); +	jbd2_journal_put_journal_head(jh);  out:  	JBUFFER_TRACE(jh, "exit");  	WARN_ON(ret);	/* All errors are bugs, so dump the stack */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d3d8799e218..0defb1cc2a3 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -356,6 +356,7 @@ static struct file_system_type jffs2_fs_type = {  	.mount =	jffs2_mount,  	.kill_sb =	jffs2_kill_sb,  }; +MODULE_ALIAS_FS("jffs2");  static int __init init_jffs2_fs(void)  { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 060ba638bec..2003e830ed1 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -833,6 +833,7 @@ static struct file_system_type jfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("jfs");  static void init_once(void *foo)  { diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 345c24b8a6f..54360293bcb 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -608,6 +608,7 @@ static struct file_system_type logfs_fs_type = {  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("logfs");  static int __init logfs_init(void)  { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541cceb58..df122496f32 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -660,6 +660,7 @@ static struct file_system_type minix_fs_type = {  	.kill_sb	= 
kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("minix");  static int __init init_minix_fs(void)  { diff --git a/fs/namei.c b/fs/namei.c index 961bc126836..57ae9c8c66b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -689,8 +689,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path)  	nd->path = *path;  	nd->inode = nd->path.dentry->d_inode;  	nd->flags |= LOOKUP_JUMPED; - -	BUG_ON(nd->inode->i_op->follow_link);  }  static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..d581e45c0a9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,  	}  	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; +	/* Don't allow unprivileged users to change mount flags */ +	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) +		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; +  	atomic_inc(&sb->s_active);  	mnt->mnt.mnt_sb = sb;  	mnt->mnt.mnt_root = dget(root); @@ -1713,6 +1717,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)  	if (readonly_request == __mnt_is_readonly(mnt))  		return 0; +	if (mnt->mnt_flags & MNT_LOCK_READONLY) +		return -EPERM; +  	if (readonly_request)  		error = mnt_make_readonly(real_mount(mnt));  	else @@ -2339,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,  	/* First pass: copy the tree topology */  	copy_flags = CL_COPY_ALL | CL_EXPIRE;  	if (user_ns != mnt_ns->user_ns) -		copy_flags |= CL_SHARED_TO_SLAVE; +		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;  	new = copy_tree(old, old->mnt.mnt_root, copy_flags);  	if (IS_ERR(new)) {  		up_write(&namespace_sem); @@ -2732,6 +2739,51 @@ bool our_mnt(struct vfsmount *mnt)  	return check_mnt(real_mount(mnt));  } +bool current_chrooted(void) +{ +	/* Does the current process have a non-standard root */ +	struct path ns_root; +	struct path fs_root; +	bool chrooted; + +	/* Find the namespace root */ +	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; +	ns_root.dentry = ns_root.mnt->mnt_root; +	path_get(&ns_root); +	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) +		; + +	get_fs_root(current->fs, &fs_root); + +	chrooted = !path_equal(&fs_root, &ns_root); + +	path_put(&fs_root); +	path_put(&ns_root); + +	return chrooted; +} + +void update_mnt_policy(struct user_namespace *userns) +{ +	struct mnt_namespace *ns = current->nsproxy->mnt_ns; +	struct mount *mnt; + +	down_read(&namespace_sem); +	list_for_each_entry(mnt, &ns->list, mnt_list) { +		switch (mnt->mnt.mnt_sb->s_magic) { +		case SYSFS_MAGIC: +			userns->may_mount_sysfs = true; +			break; +		case PROC_SUPER_MAGIC: +			userns->may_mount_proc = true; +			break; +		} +		if (userns->may_mount_sysfs && userns->may_mount_proc) +			break; +	} +	up_read(&namespace_sem); +} +  static void *mntns_get(struct task_struct *task)  {  	struct mnt_namespace *ns = NULL; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 7dafd6899a6..26910c8154d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -1051,6 +1051,7 @@ static struct file_system_type ncp_fs_type = {  	.kill_sb	= kill_anon_super,  	.fs_flags	= FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("ncpfs");  static int __init init_ncp_fs(void)  { diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17..6fc7b5cae92 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 
@@ static void dev_remove(struct net *net, dev_t dev)  	bl_pipe_msg.bl_wq = &nn->bl_wq;  	memset(msg, 0, sizeof(*msg)); -	msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); +	msg->len = sizeof(bl_msg) + bl_msg.totallen; +	msg->data = kzalloc(msg->len, GFP_NOFS);  	if (!msg->data)  		goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev)  	memcpy(msg->data, &bl_msg, sizeof(bl_msg));  	dataptr = (uint8_t *) msg->data;  	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); -	msg->len = sizeof(bl_msg) + bl_msg.totallen;  	add_wait_queue(&nn->bl_wq, &wq);  	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa71..c516da5873f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1:  	return ret;  } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)  { -	return key_instantiate_and_link(key, data, strlen(data) + 1, +	return key_instantiate_and_link(key, data, datalen,  					id_resolver_cache->thread_keyring,  					authkey);  } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,  		struct key *key, struct key *authkey)  {  	char id_str[NFS_UINT_MAXLEN]; +	size_t len;  	int ret = -ENOKEY;  	/* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,  	case IDMAP_CONV_NAMETOID:  		if (strcmp(upcall->im_name, im->im_name) != 0)  			break; -		sprintf(id_str, "%d", im->im_id); -		ret = nfs_idmap_instantiate(key, authkey, id_str); +		/* Note: here we store the NUL terminator too */ +		len = sprintf(id_str, "%d", im->im_id) + 1; +		ret = nfs_idmap_instantiate(key, authkey, id_str, len);  		break;  	case IDMAP_CONV_IDTONAME:  		if (upcall->im_id != im->im_id)  			break; -		ret = nfs_idmap_instantiate(key, authkey, im->im_name); +		len = strlen(im->im_name); +		ret = nfs_idmap_instantiate(key, authkey, im->im_name, len);  		break;  	default:  		ret = -EINVAL; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c10..4fb234d3aef 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)  {  	if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))  		return; -	clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags);  	pnfs_return_layout(inode);  } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f90..26431cf62dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2632,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  	int status;  	if (pnfs_ld_layoutret_on_setattr(inode)) -		pnfs_return_layout(inode); +		pnfs_commit_and_return_layout(inode);  	nfs_fattr_init(fattr); @@ -6416,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)  static void nfs4_layoutcommit_release(void *calldata)  {  	struct nfs4_layoutcommit_data *data = calldata; -	struct pnfs_layout_segment *lseg, *tmp; -	unsigned long *bitlock = &NFS_I(data->args.inode)->flags;  	pnfs_cleanup_layoutcommit(data); -	/* Matched by references in pnfs_set_layoutcommit */ -	list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { -		list_del_init(&lseg->pls_lc_list); -		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, -				       &lseg->pls_flags)) -			pnfs_put_lseg(lseg); -	} - -	
clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); -	smp_mb__after_clear_bit(); -	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); -  	put_rpccred(data->cred);  	kfree(data);  } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad625..4bdffe0ba02 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range,  	       lo_seg_intersecting(lseg_range, recall_range);  } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, +		struct list_head *tmp_list) +{ +	if (!atomic_dec_and_test(&lseg->pls_refcount)) +		return false; +	pnfs_layout_remove_lseg(lseg->pls_layout, lseg); +	list_add(&lseg->pls_list, tmp_list); +	return true; +} +  /* Returns 1 if lseg is removed from list, 0 otherwise */  static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,  			     struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,  		 */  		dprintk("%s: lseg %p ref %d\n", __func__, lseg,  			atomic_read(&lseg->pls_refcount)); -		if (atomic_dec_and_test(&lseg->pls_refcount)) { -			pnfs_layout_remove_lseg(lseg->pls_layout, lseg); -			list_add(&lseg->pls_list, tmp_list); +		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))  			rv = 1; -		}  	}  	return rv;  } @@ -777,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo,  	return lseg;  } +static void pnfs_clear_layoutcommit(struct inode *inode, +		struct list_head *head) +{ +	struct nfs_inode *nfsi = NFS_I(inode); +	struct pnfs_layout_segment *lseg, *tmp; + +	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) +		return; +	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { +		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) +			continue; +		pnfs_lseg_dec_and_remove_zero(lseg, head); +	} +} +  /*   * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr   * when the layout segment list is empty. 
@@ -808,6 +830,7 @@ _pnfs_return_layout(struct inode *ino)  	/* Reference matched in nfs4_layoutreturn_release */  	pnfs_get_layout_hdr(lo);  	empty = list_empty(&lo->plh_segs); +	pnfs_clear_layoutcommit(ino, &tmp_list);  	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);  	/* Don't send a LAYOUTRETURN if list was initially empty */  	if (empty) { @@ -820,8 +843,6 @@ _pnfs_return_layout(struct inode *ino)  	spin_unlock(&ino->i_lock);  	pnfs_free_lseg_list(&tmp_list); -	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); -  	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);  	if (unlikely(lrp == NULL)) {  		status = -ENOMEM; @@ -845,6 +866,33 @@ out:  }  EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ +	struct pnfs_layout_hdr *lo; +	int ret; + +	spin_lock(&inode->i_lock); +	lo = NFS_I(inode)->layout; +	if (lo == NULL) { +		spin_unlock(&inode->i_lock); +		return 0; +	} +	pnfs_get_layout_hdr(lo); +	/* Block new layoutgets and read/write to ds */ +	lo->plh_block_lgets++; +	spin_unlock(&inode->i_lock); +	filemap_fdatawait(inode->i_mapping); +	ret = pnfs_layoutcommit_inode(inode, true); +	if (ret == 0) +		ret = _pnfs_return_layout(inode); +	spin_lock(&inode->i_lock); +	lo->plh_block_lgets--; +	spin_unlock(&inode->i_lock); +	pnfs_put_layout_hdr(lo); +	return ret; +} +  bool pnfs_roc(struct inode *ino)  {  	struct pnfs_layout_hdr *lo; @@ -1458,7 +1506,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)  	dprintk("pnfs write error = %d\n", hdr->pnfs_error);  	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &  	    PNFS_LAYOUTRET_ON_ERROR) { -		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);  		pnfs_return_layout(hdr->inode);  	}  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1660,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)  	dprintk("pnfs read error = %d\n", hdr->pnfs_error);  	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &  	    PNFS_LAYOUTRET_ON_ERROR) { -		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);  		pnfs_return_layout(hdr->inode);  	}  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1746,11 +1792,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)  	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {  		if (lseg->pls_range.iomode == IOMODE_RW && -		    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) +		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))  			list_add(&lseg->pls_lc_list, listp);  	}  } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ +	struct pnfs_layout_segment *lseg, *tmp; +	unsigned long *bitlock = &NFS_I(inode)->flags; + +	/* Matched by references in pnfs_set_layoutcommit */ +	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { +		list_del_init(&lseg->pls_lc_list); +		pnfs_put_lseg(lseg); +	} + +	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); +	smp_mb__after_clear_bit(); +	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} +  void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)  {  	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1857,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)  	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)  		nfss->pnfs_curr_ld->cleanup_layoutcommit(data); +	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);  }  /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba8041774..f5f8a470a64 100644 --- 
a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);  void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);  int pnfs_layoutcommit_inode(struct inode *inode, bool sync);  int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *);  void pnfs_ld_write_done(struct nfs_write_data *);  void pnfs_ld_read_done(struct nfs_read_data *);  struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino)  	return 0;  } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ +	return 0; +} +  static inline bool  pnfs_ld_layoutret_on_setattr(struct inode *inode)  { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 17b32b72245..2f8a29db0f1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -294,6 +294,7 @@ struct file_system_type nfs_fs_type = {  	.kill_sb	= nfs_kill_super,  	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("nfs");  EXPORT_SYMBOL_GPL(nfs_fs_type);  struct file_system_type nfs_xdev_fs_type = { @@ -333,6 +334,8 @@ struct file_system_type nfs4_fs_type = {  	.kill_sb	= nfs_kill_super,  	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4");  EXPORT_SYMBOL_GPL(nfs4_fs_type);  static int __init register_nfs4_fs(void) @@ -2717,6 +2720,5 @@ module_param(send_implementation_id, ushort, 0644);  MODULE_PARM_DESC(send_implementation_id,  		"Send implementation ID with NFSv4.1 exchange_id");  MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); -MODULE_ALIAS("nfs4");  #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16d39c6c4fb..2e27430b907 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,37 +230,6 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)  		__nfs4_file_put_access(fp, oflag);  } -static inline int get_new_stid(struct nfs4_stid *stid) -{ -	static int min_stateid = 0; -	struct idr *stateids = &stid->sc_client->cl_stateids; -	int new_stid; -	int error; - -	error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); -	/* -	 * Note: the necessary preallocation was done in -	 * nfs4_alloc_stateid().  The idr code caps the number of -	 * preallocations that can exist at a time, but the state lock -	 * prevents anyone from using ours before we get here: -	 */ -	WARN_ON_ONCE(error); -	/* -	 * It shouldn't be a problem to reuse an opaque stateid value. -	 * I don't think it is for 4.1.  But with 4.0 I worry that, for -	 * example, a stray write retransmission could be accepted by -	 * the server when it should have been rejected.  
Therefore, -	 * adopt a trick from the sctp code to attempt to maximize the -	 * amount of time until an id is reused, by ensuring they always -	 * "increase" (mod INT_MAX): -	 */ - -	min_stateid = new_stid+1; -	if (min_stateid == INT_MAX) -		min_stateid = 0; -	return new_stid; -} -  static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct  kmem_cache *slab)  { @@ -273,9 +242,8 @@ kmem_cache *slab)  	if (!stid)  		return NULL; -	if (!idr_pre_get(stateids, GFP_KERNEL)) -		goto out_free; -	if (idr_get_new_above(stateids, stid, min_stateid, &new_id)) +	new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL); +	if (new_id < 0)  		goto out_free;  	stid->sc_client = cl;  	stid->sc_type = 0; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 01168865dd3..a2720071f28 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -264,7 +264,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,  		iattr->ia_valid |= ATTR_SIZE;  	}  	if (bmval[0] & FATTR4_WORD0_ACL) { -		int nace; +		u32 nace;  		struct nfs4_ace *ace;  		READ_BUF(4); len += 4; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128ae..ca05f6dc354 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -102,7 +102,8 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)  {  	if (rp->c_type == RC_REPLBUFF)  		kfree(rp->c_replvec.iov_base); -	hlist_del(&rp->c_hash); +	if (!hlist_unhashed(&rp->c_hash)) +		hlist_del(&rp->c_hash);  	list_del(&rp->c_lru);  	--num_drc_entries;  	kmem_cache_free(drc_slab, rp); @@ -118,6 +119,10 @@ nfsd_reply_cache_free(struct svc_cacherep *rp)  int nfsd_reply_cache_init(void)  { +	INIT_LIST_HEAD(&lru_head); +	max_drc_entries = nfsd_cache_size_limit(); +	num_drc_entries = 0; +  	register_shrinker(&nfsd_reply_cache_shrinker);  	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),  					0, 0, NULL); @@ -128,10 +133,6 @@ int nfsd_reply_cache_init(void)  	if (!cache_hash)  		goto out_nomem; -	INIT_LIST_HEAD(&lru_head); -	max_drc_entries = nfsd_cache_size_limit(); -	num_drc_entries = 0; -  	return 0;  out_nomem:  	printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 13a21c8fca4..f33455b4d95 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1090,6 +1090,7 @@ static struct file_system_type nfsd_fs_type = {  	.mount		= nfsd_mount,  	.kill_sb	= nfsd_umount,  }; +MODULE_ALIAS_FS("nfsd");  #ifdef CONFIG_PROC_FS  static int create_proc_exports_entry(void) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0..2b2e2396a86 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,  	int			host_err;  	int			stable = *stablep;  	int			use_wgather; +	loff_t			pos = offset;  	dentry = file->f_path.dentry;  	inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,  	/* Write the data. 
*/  	oldfs = get_fs(); set_fs(KERNEL_DS); -	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); +	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);  	set_fs(oldfs);  	if (host_err < 0)  		goto out_nfserr; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3c991dc84f2..c7d1f9f18b0 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1361,6 +1361,7 @@ struct file_system_type nilfs_fs_type = {  	.kill_sb  = kill_block_super,  	.fs_flags = FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("nilfs2");  static void nilfs_inode_init_once(void *obj)  { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a8289f8b16..82650d52d91 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3079,6 +3079,7 @@ static struct file_system_type ntfs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ntfs");  /* Stable names for the slab caches. */  static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4c5fc8d77dc..12bafb7265c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -640,6 +640,7 @@ static struct file_system_type dlmfs_fs_type = {  	.mount		= dlmfs_mount,  	.kill_sb	= kill_litter_super,  }; +MODULE_ALIAS_FS("ocfs2_dlmfs");  static int __init init_dlmfs_fs(void)  { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9b6910dec4b..01b85165552 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1266,6 +1266,7 @@ static struct file_system_type ocfs2_fs_type = {  	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,  	.next           = NULL  }; +MODULE_ALIAS_FS("ocfs2");  static int ocfs2_check_set_options(struct super_block *sb,  				   struct mount_options *options) diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 25d715c7c87..d8b0afde217 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -572,6 +572,7 @@ static struct file_system_type omfs_fs_type = {  	.kill_sb = kill_block_super,  	.fs_flags = FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("omfs");  static int __init init_omfs_fs(void)  { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ae47fa7efb9..75885ffde44 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -432,6 +432,7 @@ static struct file_system_type openprom_fs_type = {  	.mount		= openprom_mount,  	.kill_sb	= kill_anon_super,  }; +MODULE_ALIAS_FS("openpromfs");  static void op_inode_init_once(void *data)  { diff --git a/fs/pipe.c b/fs/pipe.c index 64a494cef0a..2234f3f61f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -863,6 +863,9 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)  {  	int ret = -ENOENT; +	if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE))) +		return -EINVAL; +  	mutex_lock(&inode->i_mutex);  	if (inode->i_pipe) { diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0..8b29d2164da 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@  #include <linux/mnt_namespace.h>  #include <linux/mount.h>  #include <linux/fs.h> +#include <linux/nsproxy.h>  #include "internal.h"  #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest,  int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,  		    struct mount *source_mnt, struct list_head *tree_list)  { +	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;  	struct mount *m, *child;  	int ret = 0;  	struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,  		source =  
get_source(m, prev_dest_mnt, prev_src_mnt, &type); +		/* Notice when we are propagating across user namespaces */ +		if (m->mnt_ns->user_ns != user_ns) +			type |= CL_UNPRIVILEGED; +  		child = copy_tree(source, source->mnt.mnt_root, type);  		if (IS_ERR(child)) {  			ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445..a0493d5ebfb 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@  #define CL_MAKE_SHARED 		0x08  #define CL_PRIVATE 		0x10  #define CL_SHARED_TO_SLAVE	0x20 +#define CL_UNPRIVILEGED		0x40  static inline void set_mnt_shared(struct mount *mnt)  { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7..869116c2afb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -446,9 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = {  struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)  { -	struct inode *inode = iget_locked(sb, de->low_ino); +	struct inode *inode = new_inode_pseudo(sb); -	if (inode && (inode->i_state & I_NEW)) { +	if (inode) { +		inode->i_ino = de->low_ino;  		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  		PROC_I(inode)->pde = de; @@ -476,7 +477,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)  				inode->i_fop = de->proc_fops;  			}  		} -		unlock_new_inode(inode);  	} else  	       pde_put(de);  	return inode; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b7a47196c8c..66b51c0383d 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -118,7 +118,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)  	struct super_block *sb = inode->i_sb;  	struct proc_inode *ei = PROC_I(inode);  	struct task_struct *task; -	struct dentry *ns_dentry; +	struct path ns_path;  	void *error = ERR_PTR(-EACCES);  	task = get_proc_task(inode); @@ -128,14 +128,14 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)  	if (!ptrace_may_access(task, PTRACE_MODE_READ))  		goto out_put_task; -	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); -	if (IS_ERR(ns_dentry)) { -		error = ERR_CAST(ns_dentry); +	ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); +	if (IS_ERR(ns_path.dentry)) { +		error = ERR_CAST(ns_path.dentry);  		goto out_put_task;  	} -	dput(nd->path.dentry); -	nd->path.dentry = ns_dentry; +	ns_path.mnt = mntget(nd->path.mnt); +	nd_jump_link(nd, &ns_path);  	error = NULL;  out_put_task: diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26ba..9c7fab1d23f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@  #include <linux/sched.h>  #include <linux/module.h>  #include <linux/bitops.h> +#include <linux/user_namespace.h>  #include <linux/mount.h>  #include <linux/pid_namespace.h>  #include <linux/parser.h> @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,  	} else {  		ns = task_active_pid_ns(current);  		options = data; + +		if (!current_user_ns()->may_mount_proc) +			return ERR_PTR(-EPERM);  	}  	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 43098bb5723..2e8caa62da7 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -412,6 +412,7 @@ static struct file_system_type qnx4_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("qnx4");  static int __init init_qnx4_fs(void)  { diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 57199a52a35..8d941edfefa 100644 --- a/fs/qnx6/inode.c 
+++ b/fs/qnx6/inode.c @@ -672,6 +672,7 @@ static struct file_system_type qnx6_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("qnx6");  static int __init init_qnx6_fs(void)  { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 05ae3c97f7a..3e64169ef52 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1439,8 +1439,11 @@ static void __dquot_initialize(struct inode *inode, int type)  			 * did a write before quota was turned on  			 */  			rsv = inode_get_rsv_space(inode); -			if (unlikely(rsv)) +			if (unlikely(rsv)) { +				spin_lock(&dq_data_lock);  				dquot_resv_space(inode->i_dquot[cnt], rsv); +				spin_unlock(&dq_data_lock); +			}  		}  	}  out_err: diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457f..e6ddc8dceb9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@  #include <linux/splice.h>  #include <linux/compat.h>  #include "read_write.h" +#include "internal.h"  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -417,6 +418,33 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof  EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ +	mm_segment_t old_fs; +	const char __user *p; +	ssize_t ret; + +	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) +		return -EINVAL; + +	old_fs = get_fs(); +	set_fs(get_ds()); +	p = (__force const char __user *)buf; +	if (count > MAX_RW_COUNT) +		count =  MAX_RW_COUNT; +	if (file->f_op->write) +		ret = file->f_op->write(file, p, count, pos); +	else +		ret = do_sync_write(file, p, count, pos); +	set_fs(old_fs); +	if (ret > 0) { +		fsnotify_modify(file); +		add_wchar(current, ret); +	} +	inc_syscw(current); +	return ret; +} +  ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)  {  	ssize_t ret; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 418bdc3a57d..f8a23c3078f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1147,8 +1147,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin  							 "on filesystem root.");  					return 0;  				} -				qf_names[qtype] = -				    kmalloc(strlen(arg) + 1, GFP_KERNEL); +				qf_names[qtype] = kstrdup(arg, GFP_KERNEL);  				if (!qf_names[qtype]) {  					reiserfs_warning(s, "reiserfs-2502",  							 "not enough memory " @@ -1156,7 +1155,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin  							 "quotafile name.");  					return 0;  				} -				strcpy(qf_names[qtype], arg);  				if (qtype == USRQUOTA)  					*mount_options |= 1 << REISERFS_USRQUOTA;  				else @@ -2434,6 +2432,7 @@ struct file_system_type reiserfs_fs_type = {  	.kill_sb = reiserfs_kill_sb,  	.fs_flags = FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("reiserfs");  MODULE_DESCRIPTION("ReiserFS journaled filesystem");  MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c196369fe40..4cce1d9552f 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -187,8 +187,8 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,  	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))  		return -ENOSPC; -	if (name[0] == '.' && (name[1] == '\0' || -			       (name[1] == '.' && name[2] == '\0'))) +	if (name[0] == '.' 
&& (namelen < 2 || +			       (namelen == 2 && name[1] == '.')))  		return 0;  	dentry = lookup_one_len(name, dbuf->xadir, namelen); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 7e8d3a80bda..15cbc41ee36 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -599,6 +599,7 @@ static struct file_system_type romfs_fs_type = {  	.kill_sb	= romfs_kill_sb,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("romfs");  /*   * inode storage initialiser diff --git a/fs/splice.c b/fs/splice.c index 718bd005638..29e394e49dd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@  #include <linux/security.h>  #include <linux/gfp.h>  #include <linux/socket.h> +#include "internal.h"  /*   * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1048,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,  {  	int ret;  	void *data; +	loff_t tmp = sd->pos;  	data = buf->ops->map(pipe, buf, 0); -	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); +	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);  	buf->ops->unmap(pipe, buf, data);  	return ret; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 260e3928d4f..60553a9053c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -489,6 +489,7 @@ static struct file_system_type squashfs_fs_type = {  	.kill_sb = kill_block_super,  	.fs_flags = FS_REQUIRES_DEV  }; +MODULE_ALIAS_FS("squashfs");  static const struct super_operations squashfs_super_ops = {  	.alloc_inode = squashfs_alloc_inode, diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25..e14512678c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  		ino = parent_sd->s_ino;  		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)  			filp->f_pos++; +		else +			return 0;  	}  	if (filp->f_pos == 1) {  		if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  			ino = parent_sd->s_ino;  		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)  			filp->f_pos++; +		else +			return 0;  	}  	mutex_lock(&sysfs_mutex);  	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); @@ -1058,10 +1062,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  	return 0;  } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ +	struct inode *inode = file_inode(file); +	loff_t ret; + +	mutex_lock(&inode->i_mutex); +	ret = generic_file_llseek(file, offset, whence); +	mutex_unlock(&inode->i_mutex); + +	return ret; +}  const struct file_operations sysfs_dir_operations = {  	.read		= generic_read_dir,  	.readdir	= sysfs_readdir,  	.release	= sysfs_dir_release, -	.llseek		= generic_file_llseek, +	.llseek		= sysfs_dir_llseek,  }; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec73..afd83273e6c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@  #include <linux/module.h>  #include <linux/magic.h>  #include <linux/slab.h> +#include <linux/user_namespace.h>  #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,  	struct super_block *sb;  	int error; +	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) +		return ERR_PTR(-EPERM); +  	info = kzalloc(sizeof(*info), GFP_KERNEL);  	if (!info)  		return ERR_PTR(-ENOMEM); diff --git a/fs/sysv/super.c 
b/fs/sysv/super.c index a38e87bdd78..d0c6a007ce8 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -545,6 +545,7 @@ static struct file_system_type sysv_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("sysv");  static struct file_system_type v7_fs_type = {  	.owner		= THIS_MODULE, @@ -553,6 +554,8 @@ static struct file_system_type v7_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("v7"); +MODULE_ALIAS("v7");  static int __init init_sysv_fs(void)  { @@ -586,5 +589,4 @@ static void __exit exit_sysv_fs(void)  module_init(init_sysv_fs)  module_exit(exit_sysv_fs) -MODULE_ALIAS("v7");  MODULE_LICENSE("GPL"); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc0f6ae65e..f21acf0ef01 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1568,6 +1568,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)  	c->remounting_rw = 1;  	c->ro_mount = 0; +	if (c->space_fixup) { +		err = ubifs_fixup_free_space(c); +		if (err) +			return err; +	} +  	err = check_free_space(c);  	if (err)  		goto out; @@ -1684,12 +1690,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)  		err = dbg_check_space_info(c);  	} -	if (c->space_fixup) { -		err = ubifs_fixup_free_space(c); -		if (err) -			goto out; -	} -  	mutex_unlock(&c->umount_mutex);  	return err; @@ -2174,6 +2174,7 @@ static struct file_system_type ubifs_fs_type = {  	.mount   = ubifs_mount,  	.kill_sb = kill_ubifs_super,  }; +MODULE_ALIAS_FS("ubifs");  /*   * Inode slab cache constructor. diff --git a/fs/udf/super.c b/fs/udf/super.c index bc5b30a819e..9ac4057a86c 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -118,6 +118,7 @@ static struct file_system_type udf_fstype = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("udf");  static struct kmem_cache *udf_inode_cachep; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc8e3a861d0..329f2f53b7e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1500,6 +1500,7 @@ static struct file_system_type ufs_fs_type = {  	.kill_sb	= kill_block_super,  	.fs_flags	= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("ufs");  static int __init init_ufs_fs(void)  { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4e8f0df82d0..8459b5d8cb7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1334,6 +1334,12 @@ _xfs_buf_ioapply(  	int		size;  	int		i; +	/* +	 * Make sure we capture only current IO errors rather than stale errors +	 * left over from previous use of the buffer (e.g. failed readahead). +	 */ +	bp->b_error = 0; +  	if (bp->b_flags & XBF_WRITE) {  		if (bp->b_flags & XBF_SYNCIO)  			rw = WRITE_SYNC; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 912d83d8860..5a30dd899d2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -325,7 +325,7 @@ xfs_iomap_eof_want_preallocate(   * rather than falling short due to things like stripe unit/width alignment of   * real extents.   */ -STATIC int +STATIC xfs_fsblock_t  xfs_iomap_eof_prealloc_initial_size(  	struct xfs_mount	*mp,  	struct xfs_inode	*ip, @@ -413,7 +413,7 @@ xfs_iomap_prealloc_size(  		 * have a large file on a small filesystem and the above  		 * lowspace thresholds are smaller than MAXEXTLEN.  		 
*/ -	while (alloc_blocks >= freesp) +	while (alloc_blocks && alloc_blocks >= freesp)  		alloc_blocks >>= 4;  	} diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c407121873b..ea341cea68c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1561,6 +1561,7 @@ static struct file_system_type xfs_fs_type = {  	.kill_sb		= kill_block_super,  	.fs_flags		= FS_REQUIRES_DEV,  }; +MODULE_ALIAS_FS("xfs");  STATIC int __init  xfs_init_zones(void)  {
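Note (not part of the diff above): the recurring change throughout this series pairs a MODULE_ALIAS_FS() declaration next to each file_system_type with the request_module("fs-%.*s", ...) call added in fs/filesystems.c, so that "mount -t <type>" autoloads the matching module via a namespaced "fs-" alias instead of a bare module name. A minimal sketch of the same pattern for a hypothetical out-of-tree filesystem follows; the "examplefs" name and the stubbed mount callback are illustrative assumptions, not code from this series.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/err.h>

/* Hypothetical filesystem used only to illustrate the alias/registration pairing. */
static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* A real filesystem would build a superblock here (e.g. via mount_bdev). */
	return ERR_PTR(-ENOSYS);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};
MODULE_ALIAS_FS("examplefs");	/* expands to MODULE_ALIAS("fs-examplefs") */

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_fs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_fs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");

With the alias in place, the get_fs_type() change in fs/filesystems.c resolves "mount -t examplefs" by requesting the "fs-examplefs" alias, so a filesystem name can no longer trigger loading of an arbitrary, unrelated module of the same name.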