Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/backref.c           |    2
-rw-r--r--  fs/btrfs/btrfs_inode.h       |    4
-rw-r--r--  fs/btrfs/ctree.c             |   17
-rw-r--r--  fs/btrfs/ctree.h             |    5
-rw-r--r--  fs/btrfs/delayed-inode.c     |   58
-rw-r--r--  fs/btrfs/disk-io.c           |  189
-rw-r--r--  fs/btrfs/extent-tree.c       |  169
-rw-r--r--  fs/btrfs/extent_io.c         |    9
-rw-r--r--  fs/btrfs/extent_io.h         |    2
-rw-r--r--  fs/btrfs/free-space-cache.c  |   80
-rw-r--r--  fs/btrfs/inode-map.c         |   28
-rw-r--r--  fs/btrfs/inode.c             |   90
-rw-r--r--  fs/btrfs/ioctl.c             |   15
-rw-r--r--  fs/btrfs/relocation.c        |    2
-rw-r--r--  fs/btrfs/scrub.c             |   66
-rw-r--r--  fs/btrfs/super.c             |   87
-rw-r--r--  fs/btrfs/transaction.c       |   12
-rw-r--r--  fs/btrfs/volumes.c           |    5
-rw-r--r--  fs/btrfs/volumes.h           |    6
19 files changed, 558 insertions, 288 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929..22c64fff1bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,  		return PTR_ERR(fspath);  	if (fspath > fspath_min) { -		ipath->fspath->val[i] = (u64)fspath; +		ipath->fspath->val[i] = (u64)(unsigned long)fspath;  		++ipath->fspath->elem_cnt;  		ipath->fspath->bytes_left = fspath - fspath_min;  	} else { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a393..634608d2a6d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode {  	 * the btrfs file release call will add this inode to the  	 * ordered operations list so that we make sure to flush out any  	 * new data the application may have written before commit. -	 * -	 * yes, its silly to have a single bitflag, but we might grow more -	 * of these.  	 */  	unsigned ordered_data_close:1;  	unsigned orphan_meta_reserved:1;  	unsigned dummy_inode:1;  	unsigned in_defrag:1; +	unsigned delalloc_meta_reserved:1;  	/*  	 * always compress this one file diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e4ea3..dede441bdee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf)  { +	/* ensure we can see the force_cow */ +	smp_rmb(); + +	/* +	 * We do not need to cow a block if +	 * 1) this block is not created or changed in this transaction; +	 * 2) this block does not belong to TREE_RELOC tree; +	 * 3) the root is not forced COW. +	 * +	 * What is forced COW: +	 *    when we create snapshot during commiting the transaction, +	 *    after we've finished coping src root, we must COW the shared +	 *    block to ensure the metadata consistency. +	 */  	if (btrfs_header_generation(buf) == trans->transid &&  	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&  	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && -	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) +	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && +	    !root->force_cow)  		return 0;  	return 1;  } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59ff929..04a5dfcee5a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster {  enum btrfs_caching_type {  	BTRFS_CACHE_NO		= 0,  	BTRFS_CACHE_STARTED	= 1, -	BTRFS_CACHE_FINISHED	= 2, +	BTRFS_CACHE_FAST	= 2, +	BTRFS_CACHE_FINISHED	= 3,  };  enum btrfs_disk_cache_state { @@ -1271,6 +1272,8 @@ struct btrfs_root {  	 * for stat.  
It may be used for more later  	 */  	dev_t anon_dev; + +	int force_cow;  };  struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3a1b939c9ae..5b163572e0c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,  static int btrfs_delayed_inode_reserve_metadata(  					struct btrfs_trans_handle *trans,  					struct btrfs_root *root, +					struct inode *inode,  					struct btrfs_delayed_node *node)  {  	struct btrfs_block_rsv *src_rsv;  	struct btrfs_block_rsv *dst_rsv;  	u64 num_bytes;  	int ret; +	int release = false;  	src_rsv = trans->block_rsv;  	dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata(  		if (!ret)  			node->bytes_reserved = num_bytes;  		return ret; +	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) { +		spin_lock(&BTRFS_I(inode)->lock); +		if (BTRFS_I(inode)->delalloc_meta_reserved) { +			BTRFS_I(inode)->delalloc_meta_reserved = 0; +			spin_unlock(&BTRFS_I(inode)->lock); +			release = true; +			goto migrate; +		} +		spin_unlock(&BTRFS_I(inode)->lock); + +		/* Ok we didn't have space pre-reserved.  This shouldn't happen +		 * too often but it can happen if we do delalloc to an existing +		 * inode which gets dirtied because of the time update, and then +		 * isn't touched again until after the transaction commits and +		 * then we try to write out the data.  First try to be nice and +		 * reserve something strictly for us.  If not be a pain and try +		 * to steal from the delalloc block rsv. +		 */ +		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); +		if (!ret) +			goto out; + +		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); +		if (!ret) +			goto out; + +		/* +		 * Ok this is a problem, let's just steal from the global rsv +		 * since this really shouldn't happen that often. +		 */ +		WARN_ON(1); +		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, +					      dst_rsv, num_bytes); +		goto out;  	} +migrate:  	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: +	/* +	 * Migrate only takes a reservation, it doesn't touch the size of the +	 * block_rsv.  This is to simplify people who don't normally have things +	 * migrated from their block rsv.  If they go to release their +	 * reservation, that will decrease the size as well, so if migrate +	 * reduced size we'd end up with a negative size.  But for the +	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation, +	 * but we could in fact do this reserve/migrate dance several times +	 * between the time we did the original reservation and we'd clean it +	 * up.  So to take care of this, release the space for the meta +	 * reservation here.  I think it may be time for a documentation page on +	 * how block rsvs. work. 
+	 */  	if (!ret)  		node->bytes_reserved = num_bytes; +	if (release) +		btrfs_block_rsv_release(root, src_rsv, num_bytes); +  	return ret;  } @@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,  		goto release_node;  	} -	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); +	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, +						   delayed_node);  	if (ret)  		goto release_node; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 102c176fc29..632f8f3cc9d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out:  static int btree_io_failed_hook(struct bio *failed_bio,  			 struct page *page, u64 start, u64 end, -			 u64 mirror_num, struct extent_state *state) +			 int mirror_num, struct extent_state *state)  {  	struct extent_io_tree *tree;  	unsigned long len; @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	u64 features;  	struct btrfs_key location;  	struct buffer_head *bh; -	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), -						 GFP_NOFS); -	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), -						 GFP_NOFS); +	struct btrfs_super_block *disk_super;  	struct btrfs_root *tree_root = btrfs_sb(sb); -	struct btrfs_fs_info *fs_info = NULL; -	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), -						GFP_NOFS); -	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), -					      GFP_NOFS); +	struct btrfs_fs_info *fs_info = tree_root->fs_info; +	struct btrfs_root *extent_root; +	struct btrfs_root *csum_root; +	struct btrfs_root *chunk_root; +	struct btrfs_root *dev_root;  	struct btrfs_root *log_tree_root; -  	int ret;  	int err = -EINVAL;  	int num_backups_tried = 0;  	int backup_index = 0; -	struct btrfs_super_block *disk_super; +	extent_root = fs_info->extent_root = +		kzalloc(sizeof(struct btrfs_root), GFP_NOFS); +	csum_root = fs_info->csum_root = +		kzalloc(sizeof(struct btrfs_root), GFP_NOFS); +	chunk_root = fs_info->chunk_root = +		kzalloc(sizeof(struct btrfs_root), GFP_NOFS); +	dev_root = fs_info->dev_root = +		kzalloc(sizeof(struct btrfs_root), GFP_NOFS); -	if (!extent_root || !tree_root || !tree_root->fs_info || -	    !chunk_root || !dev_root || !csum_root) { +	if (!extent_root || !csum_root || !chunk_root || !dev_root) {  		err = -ENOMEM;  		goto fail;  	} -	fs_info = tree_root->fs_info;  	ret = init_srcu_struct(&fs_info->subvol_srcu);  	if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	mutex_init(&fs_info->reloc_mutex);  	init_completion(&fs_info->kobj_unregister); -	fs_info->tree_root = tree_root; -	fs_info->extent_root = extent_root; -	fs_info->csum_root = csum_root; -	fs_info->chunk_root = chunk_root; -	fs_info->dev_root = dev_root; -	fs_info->fs_devices = fs_devices;  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);  	INIT_LIST_HEAD(&fs_info->space_info);  	btrfs_mapping_init(&fs_info->mapping_tree); @@ -2465,21 +2460,20 @@ fail_sb_buffer:  	btrfs_stop_workers(&fs_info->caching_workers);  fail_alloc:  fail_iput: +	btrfs_mapping_tree_free(&fs_info->mapping_tree); +  	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  	iput(fs_info->btree_inode); - -	btrfs_close_devices(fs_info->fs_devices); -	btrfs_mapping_tree_free(&fs_info->mapping_tree);  fail_bdi:  	bdi_destroy(&fs_info->bdi);  fail_srcu:  	cleanup_srcu_struct(&fs_info->subvol_srcu);  fail: +	btrfs_close_devices(fs_info->fs_devices);  	free_fs_info(fs_info);  	return ERR_PTR(err);  
recovery_tree_root: -  	if (!btrfs_test_opt(tree_root, RECOVERY))  		goto fail_tree_roots; @@ -2579,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,  	int errors = 0;  	u32 crc;  	u64 bytenr; -	int last_barrier = 0;  	if (max_mirrors == 0)  		max_mirrors = BTRFS_SUPER_MIRROR_MAX; -	/* make sure only the last submit_bh does a barrier */ -	if (do_barriers) { -		for (i = 0; i < max_mirrors; i++) { -			bytenr = btrfs_sb_offset(i); -			if (bytenr + BTRFS_SUPER_INFO_SIZE >= -			    device->total_bytes) -				break; -			last_barrier = i; -		} -	} -  	for (i = 0; i < max_mirrors; i++) {  		bytenr = btrfs_sb_offset(i);  		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2640,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,  			bh->b_end_io = btrfs_end_buffer_write_sync;  		} -		if (i == last_barrier && do_barriers) -			ret = submit_bh(WRITE_FLUSH_FUA, bh); -		else -			ret = submit_bh(WRITE_SYNC, bh); - +		/* +		 * we fua the first super.  The others we allow +		 * to go down lazy. +		 */ +		ret = submit_bh(WRITE_FUA, bh);  		if (ret)  			errors++;  	}  	return errors < i ? 0 : -1;  } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ +	if (err) { +		if (err == -EOPNOTSUPP) +			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); +		clear_bit(BIO_UPTODATE, &bio->bi_flags); +	} +	if (bio->bi_private) +		complete(bio->bi_private); +	bio_put(bio); +} + +/* + * trigger flushes for one the devices.  If you pass wait == 0, the flushes are + * sent down.  With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ +	struct bio *bio; +	int ret = 0; + +	if (device->nobarriers) +		return 0; + +	if (wait) { +		bio = device->flush_bio; +		if (!bio) +			return 0; + +		wait_for_completion(&device->flush_wait); + +		if (bio_flagged(bio, BIO_EOPNOTSUPP)) { +			printk("btrfs: disabling barriers on dev %s\n", +			       device->name); +			device->nobarriers = 1; +		} +		if (!bio_flagged(bio, BIO_UPTODATE)) { +			ret = -EIO; +		} + +		/* drop the reference from the wait == 0 run */ +		bio_put(bio); +		device->flush_bio = NULL; + +		return ret; +	} + +	/* +	 * one reference for us, and we leave it for the +	 * caller +	 */ +	device->flush_bio = NULL;; +	bio = bio_alloc(GFP_NOFS, 0); +	if (!bio) +		return -ENOMEM; + +	bio->bi_end_io = btrfs_end_empty_barrier; +	bio->bi_bdev = device->bdev; +	init_completion(&device->flush_wait); +	bio->bi_private = &device->flush_wait; +	device->flush_bio = bio; + +	bio_get(bio); +	submit_bio(WRITE_FLUSH, bio); + +	return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ +	struct list_head *head; +	struct btrfs_device *dev; +	int errors = 0; +	int ret; + +	/* send down all the barriers */ +	head = &info->fs_devices->devices; +	list_for_each_entry_rcu(dev, head, dev_list) { +		if (!dev->bdev) { +			errors++; +			continue; +		} +		if (!dev->in_fs_metadata || !dev->writeable) +			continue; + +		ret = write_dev_flush(dev, 0); +		if (ret) +			errors++; +	} + +	/* wait for all the barriers */ +	list_for_each_entry_rcu(dev, head, dev_list) { +		if (!dev->bdev) { +			errors++; +			continue; +		} +		if (!dev->in_fs_metadata || !dev->writeable) +			continue; + +	
	ret = write_dev_flush(dev, 1); +		if (ret) +			errors++; +	} +	if (errors) +		return -EIO; +	return 0; +} +  int write_all_supers(struct btrfs_root *root, int max_mirrors)  {  	struct list_head *head; @@ -2672,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)  	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);  	head = &root->fs_info->fs_devices->devices; + +	if (do_barriers) +		barrier_all_devices(root->fs_info); +  	list_for_each_entry_rcu(dev, head, dev_list) {  		if (!dev->bdev) {  			total_errors++; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9879bd47463..930ae894973 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  			     struct btrfs_root *root,  			     int load_cache_only)  { +	DEFINE_WAIT(wait);  	struct btrfs_fs_info *fs_info = cache->fs_info;  	struct btrfs_caching_control *caching_ctl;  	int ret = 0; -	smp_mb(); -	if (cache->cached != BTRFS_CACHE_NO) +	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); +	BUG_ON(!caching_ctl); + +	INIT_LIST_HEAD(&caching_ctl->list); +	mutex_init(&caching_ctl->mutex); +	init_waitqueue_head(&caching_ctl->wait); +	caching_ctl->block_group = cache; +	caching_ctl->progress = cache->key.objectid; +	atomic_set(&caching_ctl->count, 1); +	caching_ctl->work.func = caching_thread; + +	spin_lock(&cache->lock); +	/* +	 * This should be a rare occasion, but this could happen I think in the +	 * case where one thread starts to load the space cache info, and then +	 * some other thread starts a transaction commit which tries to do an +	 * allocation while the other thread is still loading the space cache +	 * info.  The previous loop should have kept us from choosing this block +	 * group, but if we've moved to the state where we will wait on caching +	 * block groups we need to first check if we're doing a fast load here, +	 * so we can wait for it to finish, otherwise we could end up allocating +	 * from a block group who's cache gets evicted for one reason or +	 * another. 
+	 */ +	while (cache->cached == BTRFS_CACHE_FAST) { +		struct btrfs_caching_control *ctl; + +		ctl = cache->caching_ctl; +		atomic_inc(&ctl->count); +		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); +		spin_unlock(&cache->lock); + +		schedule(); + +		finish_wait(&ctl->wait, &wait); +		put_caching_control(ctl); +		spin_lock(&cache->lock); +	} + +	if (cache->cached != BTRFS_CACHE_NO) { +		spin_unlock(&cache->lock); +		kfree(caching_ctl);  		return 0; +	} +	WARN_ON(cache->caching_ctl); +	cache->caching_ctl = caching_ctl; +	cache->cached = BTRFS_CACHE_FAST; +	spin_unlock(&cache->lock);  	/*  	 * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  	if (trans && (!trans->transaction->in_commit) &&  	    (root && root != root->fs_info->tree_root) &&  	    btrfs_test_opt(root, SPACE_CACHE)) { -		spin_lock(&cache->lock); -		if (cache->cached != BTRFS_CACHE_NO) { -			spin_unlock(&cache->lock); -			return 0; -		} -		cache->cached = BTRFS_CACHE_STARTED; -		spin_unlock(&cache->lock); -  		ret = load_free_space_cache(fs_info, cache);  		spin_lock(&cache->lock);  		if (ret == 1) { +			cache->caching_ctl = NULL;  			cache->cached = BTRFS_CACHE_FINISHED;  			cache->last_byte_to_unpin = (u64)-1;  		} else { -			cache->cached = BTRFS_CACHE_NO; +			if (load_cache_only) { +				cache->caching_ctl = NULL; +				cache->cached = BTRFS_CACHE_NO; +			} else { +				cache->cached = BTRFS_CACHE_STARTED; +			}  		}  		spin_unlock(&cache->lock); +		wake_up(&caching_ctl->wait);  		if (ret == 1) { +			put_caching_control(caching_ctl);  			free_excluded_extents(fs_info->extent_root, cache);  			return 0;  		} +	} else { +		/* +		 * We are not going to do the fast caching, set cached to the +		 * appropriate value and wakeup any waiters. 
+		 */ +		spin_lock(&cache->lock); +		if (load_cache_only) { +			cache->caching_ctl = NULL; +			cache->cached = BTRFS_CACHE_NO; +		} else { +			cache->cached = BTRFS_CACHE_STARTED; +		} +		spin_unlock(&cache->lock); +		wake_up(&caching_ctl->wait);  	} -	if (load_cache_only) -		return 0; - -	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); -	BUG_ON(!caching_ctl); - -	INIT_LIST_HEAD(&caching_ctl->list); -	mutex_init(&caching_ctl->mutex); -	init_waitqueue_head(&caching_ctl->wait); -	caching_ctl->block_group = cache; -	caching_ctl->progress = cache->key.objectid; -	/* one for caching kthread, one for caching block group list */ -	atomic_set(&caching_ctl->count, 2); -	caching_ctl->work.func = caching_thread; - -	spin_lock(&cache->lock); -	if (cache->cached != BTRFS_CACHE_NO) { -		spin_unlock(&cache->lock); -		kfree(caching_ctl); +	if (load_cache_only) { +		put_caching_control(caching_ctl);  		return 0;  	} -	cache->caching_ctl = caching_ctl; -	cache->cached = BTRFS_CACHE_STARTED; -	spin_unlock(&cache->lock);  	down_write(&fs_info->extent_commit_sem); +	atomic_inc(&caching_ctl->count);  	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);  	up_write(&fs_info->extent_commit_sem); @@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,  	kfree(rsv);  } -int btrfs_block_rsv_add(struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, -			u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, +				  struct btrfs_block_rsv *block_rsv, +				  u64 num_bytes, int flush)  {  	int ret;  	if (num_bytes == 0)  		return 0; -	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);  	if (!ret) {  		block_rsv_add_bytes(block_rsv, num_bytes, 1);  		return 0; @@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,  	return ret;  } +int btrfs_block_rsv_add(struct btrfs_root *root, +			struct btrfs_block_rsv *block_rsv, +			u64 num_bytes) +{ +	return __block_rsv_add(root, block_rsv, num_bytes, 1); +} +  int btrfs_block_rsv_add_noflush(struct btrfs_root *root,  				struct btrfs_block_rsv *block_rsv,  				u64 num_bytes)  { -	int ret; - -	if (num_bytes == 0) -		return 0; - -	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); -	if (!ret) { -		block_rsv_add_bytes(block_rsv, num_bytes, 1); -		return 0; -	} - -	return ret; +	return __block_rsv_add(root, block_rsv, num_bytes, 0);  }  int btrfs_block_rsv_check(struct btrfs_root *root, @@ -4064,23 +4101,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,   */  static unsigned drop_outstanding_extent(struct inode *inode)  { +	unsigned drop_inode_space = 0;  	unsigned dropped_extents = 0;  	BUG_ON(!BTRFS_I(inode)->outstanding_extents);  	BTRFS_I(inode)->outstanding_extents--; +	if (BTRFS_I(inode)->outstanding_extents == 0 && +	    BTRFS_I(inode)->delalloc_meta_reserved) { +		drop_inode_space = 1; +		BTRFS_I(inode)->delalloc_meta_reserved = 0; +	} +  	/*  	 * If we have more or the same amount of outsanding extents than we have  	 * reserved then we need to leave the reserved extents count alone.  	 
*/  	if (BTRFS_I(inode)->outstanding_extents >=  	    BTRFS_I(inode)->reserved_extents) -		return 0; +		return drop_inode_space;  	dropped_extents = BTRFS_I(inode)->reserved_extents -  		BTRFS_I(inode)->outstanding_extents;  	BTRFS_I(inode)->reserved_extents -= dropped_extents; -	return dropped_extents; +	return dropped_extents + drop_inode_space;  }  /** @@ -4166,9 +4210,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  		nr_extents = BTRFS_I(inode)->outstanding_extents -  			BTRFS_I(inode)->reserved_extents;  		BTRFS_I(inode)->reserved_extents += nr_extents; +	} -		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); +	/* +	 * Add an item to reserve for updating the inode when we complete the +	 * delalloc io. +	 */ +	if (!BTRFS_I(inode)->delalloc_meta_reserved) { +		nr_extents++; +		BTRFS_I(inode)->delalloc_meta_reserved = 1;  	} + +	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);  	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);  	spin_unlock(&BTRFS_I(inode)->lock); @@ -5166,13 +5219,15 @@ search:  		}  have_block_group: -		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { +		cached = block_group_cache_done(block_group); +		if (unlikely(!cached)) {  			u64 free_percent; +			found_uncached_bg = true;  			ret = cache_block_group(block_group, trans,  						orig_root, 1);  			if (block_group->cached == BTRFS_CACHE_FINISHED) -				goto have_block_group; +				goto alloc;  			free_percent = btrfs_block_group_used(&block_group->item);  			free_percent *= 100; @@ -5194,7 +5249,6 @@ have_block_group:  							orig_root, 0);  				BUG_ON(ret);  			} -			found_uncached_bg = true;  			/*  			 * If loop is set for cached only, try the next block @@ -5204,10 +5258,7 @@ have_block_group:  				goto loop;  		} -		cached = block_group_cache_done(block_group); -		if (unlikely(!cached)) -			found_uncached_bg = true; - +alloc:  		if (unlikely(block_group->ro))  			goto loop; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d0e7a..9472d3de5e5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  				clean_io_failure(start, page);  		}  		if (!uptodate) { -			u64 failed_mirror; -			failed_mirror = (u64)bio->bi_bdev; +			int failed_mirror; +			failed_mirror = (int)(unsigned long)bio->bi_bdev;  			if (tree->ops && tree->ops->readpage_io_failed_hook)  				ret = tree->ops->readpage_io_failed_hook(  						bio, page, start, end, @@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  		return -ENOMEM;  	path->leave_spinning = 1; +	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); +	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); +  	/*  	 * lookup the last file extent.  
We're not using i_size here  	 * because there might be preallocation past i_size @@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,  			 &cached_state, GFP_NOFS); -	em = get_extent_skip_holes(inode, off, last_for_get_extent, +	em = get_extent_skip_holes(inode, start, last_for_get_extent,  				   get_extent);  	if (!em)  		goto out; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23b..7604c300132 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops {  			      unsigned long bio_flags);  	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);  	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, -				       u64 start, u64 end, u64 failed_mirror, +				       u64 start, u64 end, int failed_mirror,  				       struct extent_state *state);  	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,  					u64 start, u64 end, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7a15fcfb3e1..6e5b7e46369 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,  		}  	} +	for (i = 0; i < io_ctl->num_pages; i++) { +		clear_page_dirty_for_io(io_ctl->pages[i]); +		set_page_extent_mapped(io_ctl->pages[i]); +	} +  	return 0;  } @@ -537,6 +542,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,  			    struct btrfs_free_space *entry, u8 *type)  {  	struct btrfs_free_space_entry *e; +	int ret; + +	if (!io_ctl->cur) { +		ret = io_ctl_check_crc(io_ctl, io_ctl->index); +		if (ret) +			return ret; +	}  	e = io_ctl->cur;  	entry->offset = le64_to_cpu(e->offset); @@ -550,10 +562,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,  	io_ctl_unmap_page(io_ctl); -	if (io_ctl->index >= io_ctl->num_pages) -		return 0; - -	return io_ctl_check_crc(io_ctl, io_ctl->index); +	return 0;  }  static int io_ctl_read_bitmap(struct io_ctl *io_ctl, @@ -561,9 +570,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,  {  	int ret; -	if (io_ctl->cur && io_ctl->cur != io_ctl->orig) -		io_ctl_unmap_page(io_ctl); -  	ret = io_ctl_check_crc(io_ctl, io_ctl->index);  	if (ret)  		return ret; @@ -699,6 +705,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  		num_entries--;  	} +	io_ctl_unmap_page(&io_ctl); +  	/*  	 * We add the bitmaps at the end of the entries in order that  	 * the bitmap entries are added to the cache. @@ -1841,7 +1849,13 @@ again:  		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),  					  1, 0);  		if (!info) { -			WARN_ON(1); +			/* the tree logging code might be calling us before we +			 * have fully loaded the free space rbtree for this +			 * block group.  So it is possible the entry won't +			 * be in the rbtree yet at all.  The caching code +			 * will make sure not to put it in the rbtree if +			 * the logging code has pinned it. +			 */  			goto out_lock;  		}  	} @@ -2448,16 +2462,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,  {  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *entry; -	struct rb_node *node;  	int ret = -ENOSPC; +	u64 bitmap_offset = offset_to_bitmap(ctl, offset);  	if (ctl->total_bitmaps == 0)  		return -ENOSPC;  	/* -	 * First check our cached list of bitmaps and see if there is an entry -	 * here that will work. 
+	 * The bitmap that covers offset won't be in the list unless offset +	 * is just its start offset.  	 */ +	entry = list_first_entry(bitmaps, struct btrfs_free_space, list); +	if (entry->offset != bitmap_offset) { +		entry = tree_search_offset(ctl, bitmap_offset, 1, 0); +		if (entry && list_empty(&entry->list)) +			list_add(&entry->list, bitmaps); +	} +  	list_for_each_entry(entry, bitmaps, list) {  		if (entry->bytes < min_bytes)  			continue; @@ -2468,38 +2489,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,  	}  	/* -	 * If we do have entries on our list and we are here then we didn't find -	 * anything, so go ahead and get the next entry after the last entry in -	 * this list and start the search from there. +	 * The bitmaps list has all the bitmaps that record free space +	 * starting after offset, so no more search is required.  	 */ -	if (!list_empty(bitmaps)) { -		entry = list_entry(bitmaps->prev, struct btrfs_free_space, -				   list); -		node = rb_next(&entry->offset_index); -		if (!node) -			return -ENOSPC; -		entry = rb_entry(node, struct btrfs_free_space, offset_index); -		goto search; -	} - -	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); -	if (!entry) -		return -ENOSPC; - -search: -	node = &entry->offset_index; -	do { -		entry = rb_entry(node, struct btrfs_free_space, offset_index); -		node = rb_next(&entry->offset_index); -		if (!entry->bitmap) -			continue; -		if (entry->bytes < min_bytes) -			continue; -		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, -					   bytes, min_bytes); -	} while (ret && node); - -	return ret; +	return -ENOSPC;  }  /* @@ -2517,8 +2510,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,  			     u64 offset, u64 bytes, u64 empty_size)  {  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; -	struct list_head bitmaps;  	struct btrfs_free_space *entry, *tmp; +	LIST_HEAD(bitmaps);  	u64 min_bytes;  	int ret; @@ -2557,7 +2550,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,  		goto out;  	} -	INIT_LIST_HEAD(&bitmaps);  	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,  				      bytes, min_bytes);  	if (ret) diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53dcbdf446c..f8962a957d6 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,  	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;  	struct btrfs_path *path;  	struct inode *inode; +	struct btrfs_block_rsv *rsv; +	u64 num_bytes;  	u64 alloc_hint = 0;  	int ret;  	int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,  	if (!path)  		return -ENOMEM; +	rsv = trans->block_rsv; +	trans->block_rsv = &root->fs_info->trans_block_rsv; + +	num_bytes = trans->bytes_reserved; +	/* +	 * 1 item for inode item insertion if need +	 * 3 items for inode item update (in the worst case) +	 * 1 item for free space object +	 * 3 items for pre-allocation +	 */ +	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); +	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, +					  trans->bytes_reserved); +	if (ret) +		goto out;  again:  	inode = lookup_free_ino_inode(root, path);  	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {  		ret = PTR_ERR(inode); -		goto out; +		goto out_release;  	}  	if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again:  		ret = create_free_ino_inode(root, trans, path);  		if (ret) -			goto out; +			goto out_release;  		goto again;  	
} @@ -477,11 +494,14 @@ again:  	}  	btrfs_free_reserved_data_space(inode, prealloc); +	ret = btrfs_write_out_ino_cache(root, trans, path);  out_put:  	iput(inode); +out_release: +	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);  out: -	if (ret == 0) -		ret = btrfs_write_out_ino_cache(root, trans, path); +	trans->block_rsv = rsv; +	trans->bytes_reserved = num_bytes;  	btrfs_free_path(path);  	return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 966ddcc4c63..526dd51a196 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,  				   struct page *locked_page,  				   u64 start, u64 end, int *page_started,  				   unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, struct inode *inode);  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,  				     struct inode *inode,  struct inode *dir, @@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  				trans = btrfs_join_transaction(root);  			BUG_ON(IS_ERR(trans));  			trans->block_rsv = &root->fs_info->delalloc_block_rsv; -			ret = btrfs_update_inode(trans, root, inode); +			ret = btrfs_update_inode_fallback(trans, root, inode);  			BUG_ON(ret);  		}  		goto out; @@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);  	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { -		ret = btrfs_update_inode(trans, root, inode); +		ret = btrfs_update_inode_fallback(trans, root, inode);  		BUG_ON(ret);  	}  	ret = 0; @@ -2199,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		if (ret)  			goto out;  	} +	/* release the path since we're done with it */ +	btrfs_release_path(path); +  	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;  	if (root->orphan_block_rsv) @@ -2426,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  /*   * copy everything in the in-memory inode into the btree.   */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, struct inode *inode)  {  	struct btrfs_inode_item *inode_item; @@ -2434,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,  	struct extent_buffer *leaf;  	int ret; -	/* -	 * If the inode is a free space inode, we can deadlock during commit -	 * if we put it into the delayed code. -	 * -	 * The data relocation inode should also be directly updated -	 * without delay -	 */ -	if (!btrfs_is_free_space_inode(root, inode) -	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { -		ret = btrfs_delayed_update_inode(trans, root, inode); -		if (!ret) -			btrfs_set_inode_last_trans(trans, inode); -		return ret; -	} -  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -2477,6 +2467,43 @@ failed:  }  /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, struct inode *inode) +{ +	int ret; + +	/* +	 * If the inode is a free space inode, we can deadlock during commit +	 * if we put it into the delayed code. 
+	 * +	 * The data relocation inode should also be directly updated +	 * without delay +	 */ +	if (!btrfs_is_free_space_inode(root, inode) +	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { +		ret = btrfs_delayed_update_inode(trans, root, inode); +		if (!ret) +			btrfs_set_inode_last_trans(trans, inode); +		return ret; +	} + +	return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, struct inode *inode) +{ +	int ret; + +	ret = btrfs_update_inode(trans, root, inode); +	if (ret == -ENOSPC) +		return btrfs_update_inode_item(trans, root, inode); +	return ret; +} + +/*   * unlink helper that gets used here in inode.c and in the tree logging   * recovery code.  It remove a link in a directory with a given name, and   * also drops the back refs in the inode to the directory @@ -5632,7 +5659,7 @@ again:  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {  		ret = btrfs_ordered_update_i_size(inode, 0, ordered);  		if (!ret) -			err = btrfs_update_inode(trans, root, inode); +			err = btrfs_update_inode_fallback(trans, root, inode);  		goto out;  	} @@ -5670,7 +5697,7 @@ again:  	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);  	ret = btrfs_ordered_update_i_size(inode, 0, ordered);  	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) -		btrfs_update_inode(trans, root, inode); +		btrfs_update_inode_fallback(trans, root, inode);  	ret = 0;  out_unlock:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, @@ -6529,14 +6556,16 @@ end_trans:  		ret = btrfs_orphan_del(NULL, inode);  	} -	trans->block_rsv = &root->fs_info->trans_block_rsv; -	ret = btrfs_update_inode(trans, root, inode); -	if (ret && !err) -		err = ret; +	if (trans) { +		trans->block_rsv = &root->fs_info->trans_block_rsv; +		ret = btrfs_update_inode(trans, root, inode); +		if (ret && !err) +			err = ret; -	nr = trans->blocks_used; -	ret = btrfs_end_transaction_throttle(trans, root); -	btrfs_btree_balance_dirty(root, nr); +		nr = trans->blocks_used; +		ret = btrfs_end_transaction_throttle(trans, root); +		btrfs_btree_balance_dirty(root, nr); +	}  out:  	btrfs_free_block_rsv(root, rsv); @@ -6605,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->orphan_meta_reserved = 0;  	ei->dummy_inode = 0;  	ei->in_defrag = 0; +	ei->delalloc_meta_reserved = 0;  	ei->force_compress = BTRFS_COMPRESS_NONE;  	ei->delayed_node = NULL; @@ -6764,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,  			 struct dentry *dentry, struct kstat *stat)  {  	struct inode *inode = dentry->d_inode; +	u32 blocksize = inode->i_sb->s_blocksize; +  	generic_fillattr(inode, stat);  	stat->dev = BTRFS_I(inode)->root->anon_dev;  	stat->blksize = PAGE_CACHE_SIZE; -	stat->blocks = (inode_get_bytes(inode) + -			BTRFS_I(inode)->delalloc_bytes) >> 9; +	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + +		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;  	return 0;  } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c472f12..a90e749ed6d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  		*devstr = '\0';  		devstr = vol_args->name;  		devid = simple_strtoull(devstr, &end, 10); -		printk(KERN_INFO "resizing devid %llu\n", +		printk(KERN_INFO "btrfs: resizing devid %llu\n",  		       (unsigned long long)devid);  	}  	device = btrfs_find_device(root, 
devid, NULL, NULL);  	if (!device) { -		printk(KERN_INFO "resizer unable to find device %llu\n", +		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",  		       (unsigned long long)devid);  		ret = -EINVAL;  		goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  	do_div(new_size, root->sectorsize);  	new_size *= root->sectorsize; -	printk(KERN_INFO "new size for %s is %llu\n", +	printk(KERN_INFO "btrfs: new size for %s is %llu\n",  		device->name, (unsigned long long)new_size);  	if (new_size > old_size) { @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)  		goto out;  	for (i = 0; i < ipath->fspath->elem_cnt; ++i) { -		rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; +		rel_ptr = ipath->fspath->val[i] - +			  (u64)(unsigned long)ipath->fspath->val;  		ipath->fspath->val[i] = rel_ptr;  	} -	ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); +	ret = copy_to_user((void *)(unsigned long)ipa->fspath, +			   (void *)(unsigned long)ipath->fspath, size);  	if (ret) {  		ret = -EFAULT;  		goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,  	if (ret < 0)  		goto out; -	ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); +	ret = copy_to_user((void *)(unsigned long)loi->inodes, +			   (void *)(unsigned long)inodes, size);  	if (ret)  		ret = -EFAULT; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 24d654ce7a0..dff29d5e151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,  			list_add_tail(&new_edge->list[UPPER],  				      &new_node->lower);  		} +	} else { +		list_add_tail(&new_node->lower, &cache->leaves);  	}  	rb_node = tree_insert(&cache->rb_root, new_node->bytenr, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed11d3866af..fab420db512 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)  			swarn->logical, swarn->dev->name,  			(unsigned long long)swarn->sector, root, inum, offset,  			min(isize - offset, (u64)PAGE_SIZE), nlink, -			(char *)ipath->fspath->val[i]); +			(char *)(unsigned long)ipath->fspath->val[i]);  	free_ipath(ipath);  	return 0; @@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)  static int scrub_submit(struct scrub_dev *sdev)  {  	struct scrub_bio *sbio; -	struct bio *bio; -	int i;  	if (sdev->curr == -1)  		return 0;  	sbio = sdev->bios[sdev->curr]; - -	bio = bio_alloc(GFP_NOFS, sbio->count); -	if (!bio) -		goto nomem; - -	bio->bi_private = sbio; -	bio->bi_end_io = scrub_bio_end_io; -	bio->bi_bdev = sdev->dev->bdev; -	bio->bi_sector = sbio->physical >> 9; - -	for (i = 0; i < sbio->count; ++i) { -		struct page *page; -		int ret; - -		page = alloc_page(GFP_NOFS); -		if (!page) -			goto nomem; - -		ret = bio_add_page(bio, page, PAGE_SIZE, 0); -		if (!ret) { -			__free_page(page); -			goto nomem; -		} -	} -  	sbio->err = 0;  	sdev->curr = -1;  	atomic_inc(&sdev->in_flight); -	submit_bio(READ, bio); +	submit_bio(READ, sbio->bio);  	return 0; - -nomem: -	scrub_free_bio(bio); - -	return -ENOMEM;  }  static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,  		      u8 *csum, int force)  {  	struct scrub_bio *sbio; +	struct page 
*page; +	int ret;  again:  	/* @@ -1015,12 +985,22 @@ again:  	}  	sbio = sdev->bios[sdev->curr];  	if (sbio->count == 0) { +		struct bio *bio; +  		sbio->physical = physical;  		sbio->logical = logical; +		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); +		if (!bio) +			return -ENOMEM; + +		bio->bi_private = sbio; +		bio->bi_end_io = scrub_bio_end_io; +		bio->bi_bdev = sdev->dev->bdev; +		bio->bi_sector = sbio->physical >> 9; +		sbio->err = 0; +		sbio->bio = bio;  	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||  		   sbio->logical + sbio->count * PAGE_SIZE != logical) { -		int ret; -  		ret = scrub_submit(sdev);  		if (ret)  			return ret; @@ -1030,6 +1010,20 @@ again:  	sbio->spag[sbio->count].generation = gen;  	sbio->spag[sbio->count].have_csum = 0;  	sbio->spag[sbio->count].mirror_num = mirror_num; + +	page = alloc_page(GFP_NOFS); +	if (!page) +		return -ENOMEM; + +	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); +	if (!ret) { +		__free_page(page); +		ret = scrub_submit(sdev); +		if (ret) +			return ret; +		goto again; +	} +  	if (csum) {  		sbio->spag[sbio->count].have_csum = 1;  		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57080dffdfc..17ee7fc5e64 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -197,7 +197,7 @@ static match_table_t tokens = {  	{Opt_subvolrootid, "subvolrootid=%d"},  	{Opt_defrag, "autodefrag"},  	{Opt_inode_cache, "inode_cache"}, -	{Opt_no_space_cache, "no_space_cache"}, +	{Opt_no_space_cache, "nospace_cache"},  	{Opt_recovery, "recovery"},  	{Opt_err, NULL},  }; @@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,  		token = match_token(p, tokens, args);  		switch (token) {  		case Opt_subvol: +			kfree(*subvol_name);  			*subvol_name = match_strdup(&args[0]);  			break;  		case Opt_subvolid: @@ -710,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)  	if (btrfs_test_opt(root, SPACE_CACHE))  		seq_puts(seq, ",space_cache");  	else -		seq_puts(seq, ",no_space_cache"); +		seq_puts(seq, ",nospace_cache");  	if (btrfs_test_opt(root, CLEAR_CACHE))  		seq_puts(seq, ",clear_cache");  	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -824,13 +825,9 @@ static char *setup_root_args(char *args)  static struct dentry *mount_subvol(const char *subvol_name, int flags,  				   const char *device_name, char *data)  { -	struct super_block *s;  	struct dentry *root;  	struct vfsmount *mnt; -	struct mnt_namespace *ns_private;  	char *newargs; -	struct path path; -	int error;  	newargs = setup_root_args(data);  	if (!newargs) @@ -841,39 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,  	if (IS_ERR(mnt))  		return ERR_CAST(mnt); -	ns_private = create_mnt_ns(mnt); -	if (IS_ERR(ns_private)) { -		mntput(mnt); -		return ERR_CAST(ns_private); -	} - -	/* -	 * This will trigger the automount of the subvol so we can just -	 * drop the mnt we have here and return the dentry that we -	 * found. 
-	 */ -	error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, -				LOOKUP_FOLLOW, &path); -	put_mnt_ns(ns_private); -	if (error) -		return ERR_PTR(error); +	root = mount_subtree(mnt, subvol_name); -	if (!is_subvolume_inode(path.dentry->d_inode)) { -		path_put(&path); -		mntput(mnt); -		error = -EINVAL; +	if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { +		struct super_block *s = root->d_sb; +		dput(root); +		root = ERR_PTR(-EINVAL); +		deactivate_locked_super(s);  		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",  				subvol_name); -		return ERR_PTR(-EINVAL);  	} -	/* Get a ref to the sb and the dentry we found and return it */ -	s = path.mnt->mnt_sb; -	atomic_inc(&s->s_active); -	root = dget(path.dentry); -	path_put(&path); -	down_write(&s->s_umount); -  	return root;  } @@ -890,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	struct super_block *s;  	struct dentry *root;  	struct btrfs_fs_devices *fs_devices = NULL; -	struct btrfs_root *tree_root = NULL;  	struct btrfs_fs_info *fs_info = NULL;  	fmode_t mode = FMODE_READ;  	char *subvol_name = NULL; @@ -904,8 +878,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	error = btrfs_parse_early_options(data, mode, fs_type,  					  &subvol_name, &subvol_objectid,  					  &subvol_rootid, &fs_devices); -	if (error) +	if (error) { +		kfree(subvol_name);  		return ERR_PTR(error); +	}  	if (subvol_name) {  		root = mount_subvol(subvol_name, flags, device_name, data); @@ -917,15 +893,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	if (error)  		return ERR_PTR(error); -	error = btrfs_open_devices(fs_devices, mode, fs_type); -	if (error) -		return ERR_PTR(error); - -	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { -		error = -EACCES; -		goto error_close_devices; -	} -  	/*  	 * Setup a dummy root and fs_info for test/set super.  This is because  	 * we don't actually fill this stuff out until open_ctree, but we need @@ -933,24 +900,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	 * then open_ctree will properly initialize everything later.  	 
*/  	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); -	tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); -	if (!fs_info || !tree_root) { +	if (!fs_info) +		return ERR_PTR(-ENOMEM); + +	fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); +	if (!fs_info->tree_root) {  		error = -ENOMEM; -		goto error_close_devices; +		goto error_fs_info;  	} -	fs_info->tree_root = tree_root; +	fs_info->tree_root->fs_info = fs_info;  	fs_info->fs_devices = fs_devices; -	tree_root->fs_info = fs_info;  	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);  	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);  	if (!fs_info->super_copy || !fs_info->super_for_commit) {  		error = -ENOMEM; +		goto error_fs_info; +	} + +	error = btrfs_open_devices(fs_devices, mode, fs_type); +	if (error) +		goto error_fs_info; + +	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { +		error = -EACCES;  		goto error_close_devices;  	}  	bdev = fs_devices->latest_bdev; -	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); +	s = sget(fs_type, btrfs_test_super, btrfs_set_super, +		 fs_info->tree_root);  	if (IS_ERR(s)) {  		error = PTR_ERR(s);  		goto error_close_devices; @@ -959,12 +938,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	if (s->s_root) {  		if ((flags ^ s->s_flags) & MS_RDONLY) {  			deactivate_locked_super(s); -			return ERR_PTR(-EBUSY); +			error = -EBUSY; +			goto error_close_devices;  		}  		btrfs_close_devices(fs_devices);  		free_fs_info(fs_info); -		kfree(tree_root);  	} else {  		char b[BDEVNAME_SIZE]; @@ -991,8 +970,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  error_close_devices:  	btrfs_close_devices(fs_devices); +error_fs_info:  	free_fs_info(fs_info); -	kfree(tree_root);  	return ERR_PTR(error);  } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 960835eaf4d..81376d94cd3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,  			btrfs_save_ino_cache(root, trans); +			/* see comments in should_cow_block() */ +			root->force_cow = 0; +			smp_wmb(); +  			if (root->commit_root != root->node) {  				mutex_lock(&root->fs_commit_mutex);  				switch_commit_root(root); @@ -882,8 +886,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);  	if (to_reserve > 0) { -		ret = btrfs_block_rsv_add(root, &pending->block_rsv, -					  to_reserve); +		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, +						  to_reserve);  		if (ret) {  			pending->error = ret;  			goto fail; @@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	btrfs_tree_unlock(old);  	free_extent_buffer(old); +	/* see comments in should_cow_block() */ +	root->force_cow = 1; +	smp_wmb(); +  	btrfs_set_root_node(new_root_item, tmp);  	/* record when the snapshot was created in key.offset */  	key.offset = trans->transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f8e2943101a..c37433d3cd8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,  	key.objectid = device->devid;  	key.offset = start;  	key.type = BTRFS_DEV_EXTENT_KEY; - +again:  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);  	if (ret > 0) {  		ret = 
btrfs_previous_item(root, path, key.objectid, @@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,  					struct btrfs_dev_extent);  		BUG_ON(found_key.offset > start || found_key.offset +  		       btrfs_dev_extent_length(leaf, extent) < start); +		key = found_key; +		btrfs_release_path(path); +		goto again;  	} else if (ret == 0) {  		leaf = path->nodes[0];  		extent = btrfs_item_ptr(leaf, path->slots[0], diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f35..78f2d4d4f37 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device {  	struct reada_zone *reada_curr_zone;  	struct radix_tree_root reada_zones;  	struct radix_tree_root reada_extents; + +	/* for sending down flush barriers */ +	struct bio *flush_bio; +	struct completion flush_wait; +	int nobarriers; +  };  struct btrfs_fs_devices {
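
The new barrier code in disk-io.c replaces the old last-barrier bookkeeping in write_dev_supers() with an explicit two-pass protocol: write_dev_flush(dev, 0) queues an empty flush bio on every writeable device, and a second pass with wait == 1 blocks on each completion and collects errors, so all devices flush in parallel. A minimal userspace sketch of that submit-all-then-wait-all shape, with hypothetical stand-ins (fake_device, submit_flush_async, wait_flush) for the kernel's bio and completion machinery:

	#include <stdio.h>

	/* Hypothetical stand-in for struct btrfs_device; only the fields the
	 * two-pass loop needs are modelled here. */
	struct fake_device {
		const char *name;
		int writeable;
		int flush_pending;	/* plays the role of device->flush_bio */
		int flush_error;	/* result collected in the wait pass   */
	};

	/* Pass 1: queue a flush and return immediately
	 * (analogue of submit_bio(WRITE_FLUSH, bio)). */
	static int submit_flush_async(struct fake_device *dev)
	{
		dev->flush_pending = 1;
		dev->flush_error = 0;	/* pretend the flush will succeed */
		return 0;
	}

	/* Pass 2: block until the previously queued flush completes. */
	static int wait_flush(struct fake_device *dev)
	{
		if (!dev->flush_pending)
			return 0;
		dev->flush_pending = 0;
		return dev->flush_error;
	}

	/* Mirrors barrier_all_devices(): send every flush first, then wait
	 * for all of them, instead of flushing devices one at a time. */
	static int barrier_all(struct fake_device *devs, int ndevs)
	{
		int errors = 0;
		int i;

		for (i = 0; i < ndevs; i++)
			if (devs[i].writeable && submit_flush_async(&devs[i]))
				errors++;

		for (i = 0; i < ndevs; i++)
			if (devs[i].writeable && wait_flush(&devs[i]))
				errors++;

		return errors ? -1 : 0;
	}

	int main(void)
	{
		struct fake_device devs[] = {
			{ "sda", 1, 0, 0 },
			{ "sdb", 1, 0, 0 },
			{ "sdc", 0, 0, 0 },	/* non-writeable device is skipped */
		};

		printf("barrier_all -> %d\n", barrier_all(devs, 3));
		return 0;
	}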
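
The btrfs_getattr() hunk in inode.c rounds both inode_get_bytes() and delalloc_bytes up to the filesystem block size before shifting into 512-byte sectors, so st_blocks accounts for partially filled blocks. A quick userspace check of that arithmetic (ALIGN written out the way the kernel defines it for power-of-two alignments; the byte counts below are made-up sample values):

	#include <stdio.h>
	#include <stdint.h>

	/* Kernel-style ALIGN for a power-of-two alignment value. */
	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t blocksize      = 4096;	/* inode->i_sb->s_blocksize */
		uint64_t inode_bytes    = 5000;	/* made-up inode_get_bytes() */
		uint64_t delalloc_bytes = 100;	/* made-up delalloc_bytes    */

		/* old calculation: raw byte total shifted into sectors */
		uint64_t old_blocks = (inode_bytes + delalloc_bytes) >> 9;

		/* new calculation: round each component up to a block first */
		uint64_t new_blocks = (ALIGN(inode_bytes, blocksize) +
				       ALIGN(delalloc_bytes, blocksize)) >> 9;

		printf("old st_blocks = %llu\n", (unsigned long long)old_blocks);
		printf("new st_blocks = %llu\n", (unsigned long long)new_blocks);
		return 0;
	}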