diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
| -rw-r--r-- | fs/btrfs/volumes.c | 1042 | 
1 files changed, 838 insertions, 204 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 88b969aeeb7..5cce6aa7401 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,7 +25,6 @@  #include <linux/capability.h>  #include <linux/ratelimit.h>  #include <linux/kthread.h> -#include <asm/div64.h>  #include "compat.h"  #include "ctree.h"  #include "extent_map.h" @@ -36,6 +35,8 @@  #include "async-thread.h"  #include "check-integrity.h"  #include "rcu-string.h" +#include "math.h" +#include "dev-replace.h"  static int init_first_rw_device(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, @@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)  	kfree(fs_devices);  } +static void btrfs_kobject_uevent(struct block_device *bdev, +				 enum kobject_action action) +{ +	int ret; + +	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); +	if (ret) +		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", +			action, +			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), +			&disk_to_dev(bdev->bd_disk)->kobj); +} +  void btrfs_cleanup_fs_uuids(void)  {  	struct btrfs_fs_devices *fs_devices; @@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)  	return NULL;  } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, +		      int flush, struct block_device **bdev, +		      struct buffer_head **bh) +{ +	int ret; + +	*bdev = blkdev_get_by_path(device_path, flags, holder); + +	if (IS_ERR(*bdev)) { +		ret = PTR_ERR(*bdev); +		printk(KERN_INFO "btrfs: open %s failed\n", device_path); +		goto error; +	} + +	if (flush) +		filemap_write_and_wait((*bdev)->bd_inode->i_mapping); +	ret = set_blocksize(*bdev, 4096); +	if (ret) { +		blkdev_put(*bdev, flags); +		goto error; +	} +	invalidate_bdev(*bdev); +	*bh = btrfs_read_dev_super(*bdev); +	if (!*bh) { +		ret = -EINVAL; +		blkdev_put(*bdev, flags); +		goto error; +	} + +	return 0; + +error: +	*bdev = NULL; +	*bh = NULL; +	return ret; +} +  static void requeue_list(struct btrfs_pending_bios *pending_bios,  			struct bio *head, struct bio *tail)  { @@ -467,7 +519,8 @@ error:  	return ERR_PTR(-ENOMEM);  } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, +			       struct btrfs_fs_devices *fs_devices, int step)  {  	struct btrfs_device *device, *next; @@ -480,8 +533,9 @@ again:  	/* This is the initialized path, it is safe to release the devices. */  	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {  		if (device->in_fs_metadata) { -			if (!latest_transid || -			    device->generation > latest_transid) { +			if (!device->is_tgtdev_for_dev_replace && +			    (!latest_transid || +			     device->generation > latest_transid)) {  				latest_devid = device->devid;  				latest_transid = device->generation;  				latest_bdev = device->bdev; @@ -489,6 +543,21 @@ again:  			continue;  		} +		if (device->devid == BTRFS_DEV_REPLACE_DEVID) { +			/* +			 * In the first step, keep the device which has +			 * the correct fsid and the devid that is used +			 * for the dev_replace procedure. +			 * In the second step, the dev_replace state is +			 * read from the device tree and it is known +			 * whether the procedure is really active or +			 * not, which means whether this device is +			 * used or whether it should be removed. +			 */ +			if (step == 0 || device->is_tgtdev_for_dev_replace) { +				continue; +			} +		}  		if (device->bdev) {  			blkdev_put(device->bdev, device->mode);  			device->bdev = NULL; @@ -497,7 +566,8 @@ again:  		if (device->writeable) {  			list_del_init(&device->dev_alloc_list);  			device->writeable = 0; -			fs_devices->rw_devices--; +			if (!device->is_tgtdev_for_dev_replace) +				fs_devices->rw_devices--;  		}  		list_del_init(&device->dev_list);  		fs_devices->num_devices--; @@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  		if (device->bdev)  			fs_devices->open_devices--; -		if (device->writeable) { +		if (device->writeable && !device->is_tgtdev_for_dev_replace) {  			list_del_init(&device->dev_alloc_list);  			fs_devices->rw_devices--;  		} @@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  		if (!device->name)  			continue; -		bdev = blkdev_get_by_path(device->name->str, flags, holder); -		if (IS_ERR(bdev)) { -			printk(KERN_INFO "open %s failed\n", device->name->str); -			goto error; -		} -		filemap_write_and_wait(bdev->bd_inode->i_mapping); -		invalidate_bdev(bdev); -		set_blocksize(bdev, 4096); - -		bh = btrfs_read_dev_super(bdev); -		if (!bh) -			goto error_close; +		ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, +					    &bdev, &bh); +		if (ret) +			continue;  		disk_super = (struct btrfs_super_block *)bh->b_data;  		devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  			fs_devices->rotating = 1;  		fs_devices->open_devices++; -		if (device->writeable) { +		if (device->writeable && !device->is_tgtdev_for_dev_replace) {  			fs_devices->rw_devices++;  			list_add(&device->dev_alloc_list,  				 &fs_devices->alloc_list); @@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  error_brelse:  		brelse(bh); -error_close:  		blkdev_put(bdev, flags); -error:  		continue;  	}  	if (fs_devices->open_devices == 0) { @@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  	u64 total_devices;  	flags |= FMODE_EXCL; -	bdev = blkdev_get_by_path(path, flags, holder); - -	if (IS_ERR(bdev)) { -		ret = PTR_ERR(bdev); -		goto error; -	} -  	mutex_lock(&uuid_mutex); -	ret = set_blocksize(bdev, 4096); +	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);  	if (ret) -		goto error_close; -	bh = btrfs_read_dev_super(bdev); -	if (!bh) { -		ret = -EINVAL; -		goto error_close; -	} +		goto error;  	disk_super = (struct btrfs_super_block *)bh->b_data;  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	transid = btrfs_super_generation(disk_super);  	total_devices = btrfs_super_num_devices(disk_super); -	if (disk_super->label[0]) +	if (disk_super->label[0]) { +		if (disk_super->label[BTRFS_LABEL_SIZE - 1]) +			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';  		printk(KERN_INFO "device label %s ", disk_super->label); -	else +	} else {  		printk(KERN_INFO "device fsid %pU ", disk_super->fsid); +	}  	printk(KERN_CONT "devid %llu transid %llu %s\n",  	       (unsigned long long)devid, (unsigned long long)transid, path);  	ret = device_list_add(path, disk_super, devid, fs_devices_ret);  	if (!ret && fs_devices_ret)  		(*fs_devices_ret)->total_devices = total_devices;  	brelse(bh); -error_close: -	mutex_unlock(&uuid_mutex);  	blkdev_put(bdev, flags);  error: +	mutex_unlock(&uuid_mutex);  	return ret;  } @@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,  	*length = 0; -	if (start >= device->total_bytes) +	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)  		return 0;  	path = btrfs_alloc_path(); @@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,  	max_hole_size = 0;  	hole_size = 0; -	if (search_start >= search_end) { +	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {  		ret = -ENOSPC;  		goto error;  	} @@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,  	struct btrfs_key key;  	WARN_ON(!device->in_fs_metadata); +	WARN_ON(device->is_tgtdev_for_dev_replace);  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		root->fs_info->avail_system_alloc_bits |  		root->fs_info->avail_metadata_alloc_bits; -	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && -	    root->fs_info->fs_devices->num_devices <= 4) { +	num_devices = root->fs_info->fs_devices->num_devices; +	btrfs_dev_replace_lock(&root->fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { +		WARN_ON(num_devices < 1); +		num_devices--; +	} +	btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {  		printk(KERN_ERR "btrfs: unable to go below four devices "  		       "on raid10\n");  		ret = -EINVAL;  		goto out;  	} -	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && -	    root->fs_info->fs_devices->num_devices <= 2) { +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {  		printk(KERN_ERR "btrfs: unable to go below two "  		       "devices on raid1\n");  		ret = -EINVAL; @@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		 * is held.  		 */  		list_for_each_entry(tmp, devices, dev_list) { -			if (tmp->in_fs_metadata && !tmp->bdev) { +			if (tmp->in_fs_metadata && +			    !tmp->is_tgtdev_for_dev_replace && +			    !tmp->bdev) {  				device = tmp;  				break;  			} @@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  			goto out;  		}  	} else { -		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, -					  root->fs_info->bdev_holder); -		if (IS_ERR(bdev)) { -			ret = PTR_ERR(bdev); +		ret = btrfs_get_bdev_and_sb(device_path, +					    FMODE_READ | FMODE_EXCL, +					    root->fs_info->bdev_holder, 0, +					    &bdev, &bh); +		if (ret)  			goto out; -		} - -		set_blocksize(bdev, 4096); -		invalidate_bdev(bdev); -		bh = btrfs_read_dev_super(bdev); -		if (!bh) { -			ret = -EINVAL; -			goto error_close; -		}  		disk_super = (struct btrfs_super_block *)bh->b_data;  		devid = btrfs_stack_device_id(&disk_super->dev_item);  		dev_uuid = disk_super->dev_item.uuid; -		device = btrfs_find_device(root, devid, dev_uuid, +		device = btrfs_find_device(root->fs_info, devid, dev_uuid,  					   disk_super->fsid);  		if (!device) {  			ret = -ENOENT; @@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		}  	} +	if (device->is_tgtdev_for_dev_replace) { +		pr_err("btrfs: unable to remove the dev_replace target dev\n"); +		ret = -EINVAL; +		goto error_brelse; +	} +  	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {  		printk(KERN_ERR "btrfs: unable to remove the only writeable "  		       "device\n"); @@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	if (ret)  		goto error_undo; +	/* +	 * TODO: the superblock still includes this device in its num_devices +	 * counter although write_all_supers() is not locked out. This +	 * could give a filesystem state which requires a degraded mount. +	 */  	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);  	if (ret)  		goto error_undo; @@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	spin_unlock(&root->fs_info->free_chunk_lock);  	device->in_fs_metadata = 0; -	btrfs_scrub_cancel_dev(root, device); +	btrfs_scrub_cancel_dev(root->fs_info, device);  	/*  	 * the device list mutex makes sure that we don't change @@ -1475,11 +1537,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		free_fs_devices(cur_devices);  	} +	root->fs_info->num_tolerated_disk_barrier_failures = +		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); +  	/*  	 * at this point, the device is zero sized.  We want to  	 * remove it from the devices list and zero out the old super  	 */ -	if (clear_super) { +	if (clear_super && disk_super) {  		/* make sure this device isn't detected as part of  		 * the FS anymore  		 */ @@ -1490,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	ret = 0; +	/* Notify udev that device has changed */ +	btrfs_kobject_uevent(bdev, KOBJ_CHANGE); +  error_brelse:  	brelse(bh); -error_close:  	if (bdev)  		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);  out: @@ -1509,6 +1576,112 @@ error_undo:  	goto error_brelse;  } +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, +				 struct btrfs_device *srcdev) +{ +	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); +	list_del_rcu(&srcdev->dev_list); +	list_del_rcu(&srcdev->dev_alloc_list); +	fs_info->fs_devices->num_devices--; +	if (srcdev->missing) { +		fs_info->fs_devices->missing_devices--; +		fs_info->fs_devices->rw_devices++; +	} +	if (srcdev->can_discard) +		fs_info->fs_devices->num_can_discard--; +	if (srcdev->bdev) +		fs_info->fs_devices->open_devices--; + +	call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, +				      struct btrfs_device *tgtdev) +{ +	struct btrfs_device *next_device; + +	WARN_ON(!tgtdev); +	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	if (tgtdev->bdev) { +		btrfs_scratch_superblock(tgtdev); +		fs_info->fs_devices->open_devices--; +	} +	fs_info->fs_devices->num_devices--; +	if (tgtdev->can_discard) +		fs_info->fs_devices->num_can_discard++; + +	next_device = list_entry(fs_info->fs_devices->devices.next, +				 struct btrfs_device, dev_list); +	if (tgtdev->bdev == fs_info->sb->s_bdev) +		fs_info->sb->s_bdev = next_device->bdev; +	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) +		fs_info->fs_devices->latest_bdev = next_device->bdev; +	list_del_rcu(&tgtdev->dev_list); + +	call_rcu(&tgtdev->rcu, free_device); + +	mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, +			      struct btrfs_device **device) +{ +	int ret = 0; +	struct btrfs_super_block *disk_super; +	u64 devid; +	u8 *dev_uuid; +	struct block_device *bdev; +	struct buffer_head *bh; + +	*device = NULL; +	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, +				    root->fs_info->bdev_holder, 0, &bdev, &bh); +	if (ret) +		return ret; +	disk_super = (struct btrfs_super_block *)bh->b_data; +	devid = btrfs_stack_device_id(&disk_super->dev_item); +	dev_uuid = disk_super->dev_item.uuid; +	*device = btrfs_find_device(root->fs_info, devid, dev_uuid, +				    disk_super->fsid); +	brelse(bh); +	if (!*device) +		ret = -ENOENT; +	blkdev_put(bdev, FMODE_READ); +	return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, +					 char *device_path, +					 struct btrfs_device **device) +{ +	*device = NULL; +	if (strcmp(device_path, "missing") == 0) { +		struct list_head *devices; +		struct btrfs_device *tmp; + +		devices = &root->fs_info->fs_devices->devices; +		/* +		 * It is safe to read the devices since the volume_mutex +		 * is held by the caller. +		 */ +		list_for_each_entry(tmp, devices, dev_list) { +			if (tmp->in_fs_metadata && !tmp->bdev) { +				*device = tmp; +				break; +			} +		} + +		if (!*device) { +			pr_err("btrfs: no missing device found\n"); +			return -ENOENT; +		} + +		return 0; +	} else { +		return btrfs_find_device_by_path(root, device_path, device); +	} +} +  /*   * does all the dirty work required for changing file system's UUID.   */ @@ -1627,7 +1800,8 @@ next_slot:  		read_extent_buffer(leaf, fs_uuid,  				   (unsigned long)btrfs_device_fsid(dev_item),  				   BTRFS_UUID_SIZE); -		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); +		device = btrfs_find_device(root->fs_info, devid, dev_uuid, +					   fs_uuid);  		BUG_ON(!device); /* Logic error */  		if (device->fs_devices->seeding) { @@ -1675,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	filemap_write_and_wait(bdev->bd_inode->i_mapping);  	devices = &root->fs_info->fs_devices->devices; -	/* -	 * we have the volume lock, so we don't need the extra -	 * device list mutex while reading the list here. -	 */ + +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);  	list_for_each_entry(device, devices, dev_list) {  		if (device->bdev == bdev) {  			ret = -EEXIST; +			mutex_unlock( +				&root->fs_info->fs_devices->device_list_mutex);  			goto error;  		}  	} +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);  	device = kzalloc(sizeof(*device), GFP_NOFS);  	if (!device) { @@ -1734,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	device->dev_root = root->fs_info->dev_root;  	device->bdev = bdev;  	device->in_fs_metadata = 1; +	device->is_tgtdev_for_dev_replace = 0;  	device->mode = FMODE_EXCL;  	set_blocksize(device->bdev, 4096); @@ -1775,15 +1951,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	if (seeding_dev) {  		ret = init_first_rw_device(trans, root, device); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  		ret = btrfs_finish_sprout(trans, root); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  	} else {  		ret = btrfs_add_device(trans, root, device); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  	}  	/* @@ -1793,6 +1975,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	btrfs_clear_space_info_full(root->fs_info);  	unlock_chunks(root); +	root->fs_info->num_tolerated_disk_barrier_failures = +		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);  	ret = btrfs_commit_transaction(trans, root);  	if (seeding_dev) { @@ -1808,13 +1992,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  				    "Failed to relocate sys chunks after "  				    "device initialization. This can be fixed "  				    "using the \"btrfs balance\" command."); +		trans = btrfs_attach_transaction(root); +		if (IS_ERR(trans)) { +			if (PTR_ERR(trans) == -ENOENT) +				return 0; +			return PTR_ERR(trans); +		} +		ret = btrfs_commit_transaction(trans, root);  	}  	return ret;  error_trans:  	unlock_chunks(root); -	btrfs_abort_transaction(trans, root, ret);  	btrfs_end_transaction(trans, root);  	rcu_string_free(device->name);  	kfree(device); @@ -1827,6 +2017,98 @@ error:  	return ret;  } +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, +				  struct btrfs_device **device_out) +{ +	struct request_queue *q; +	struct btrfs_device *device; +	struct block_device *bdev; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct list_head *devices; +	struct rcu_string *name; +	int ret = 0; + +	*device_out = NULL; +	if (fs_info->fs_devices->seeding) +		return -EINVAL; + +	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, +				  fs_info->bdev_holder); +	if (IS_ERR(bdev)) +		return PTR_ERR(bdev); + +	filemap_write_and_wait(bdev->bd_inode->i_mapping); + +	devices = &fs_info->fs_devices->devices; +	list_for_each_entry(device, devices, dev_list) { +		if (device->bdev == bdev) { +			ret = -EEXIST; +			goto error; +		} +	} + +	device = kzalloc(sizeof(*device), GFP_NOFS); +	if (!device) { +		ret = -ENOMEM; +		goto error; +	} + +	name = rcu_string_strdup(device_path, GFP_NOFS); +	if (!name) { +		kfree(device); +		ret = -ENOMEM; +		goto error; +	} +	rcu_assign_pointer(device->name, name); + +	q = bdev_get_queue(bdev); +	if (blk_queue_discard(q)) +		device->can_discard = 1; +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); +	device->writeable = 1; +	device->work.func = pending_bios_fn; +	generate_random_uuid(device->uuid); +	device->devid = BTRFS_DEV_REPLACE_DEVID; +	spin_lock_init(&device->io_lock); +	device->generation = 0; +	device->io_width = root->sectorsize; +	device->io_align = root->sectorsize; +	device->sector_size = root->sectorsize; +	device->total_bytes = i_size_read(bdev->bd_inode); +	device->disk_total_bytes = device->total_bytes; +	device->dev_root = fs_info->dev_root; +	device->bdev = bdev; +	device->in_fs_metadata = 1; +	device->is_tgtdev_for_dev_replace = 1; +	device->mode = FMODE_EXCL; +	set_blocksize(device->bdev, 4096); +	device->fs_devices = fs_info->fs_devices; +	list_add(&device->dev_list, &fs_info->fs_devices->devices); +	fs_info->fs_devices->num_devices++; +	fs_info->fs_devices->open_devices++; +	if (device->can_discard) +		fs_info->fs_devices->num_can_discard++; +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + +	*device_out = device; +	return ret; + +error: +	blkdev_put(bdev, FMODE_EXCL); +	return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, +					      struct btrfs_device *tgtdev) +{ +	WARN_ON(fs_info->fs_devices->rw_devices == 0); +	tgtdev->io_width = fs_info->dev_root->sectorsize; +	tgtdev->io_align = fs_info->dev_root->sectorsize; +	tgtdev->sector_size = fs_info->dev_root->sectorsize; +	tgtdev->dev_root = fs_info->dev_root; +	tgtdev->in_fs_metadata = 1; +} +  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,  					struct btrfs_device *device)  { @@ -1883,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,  	if (!device->writeable)  		return -EACCES; -	if (new_size <= device->total_bytes) +	if (new_size <= device->total_bytes || +	    device->is_tgtdev_for_dev_replace)  		return -EINVAL;  	btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -2321,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,  	return 1;  } -static u64 div_factor_fine(u64 num, int factor) -{ -	if (factor <= 0) -		return 0; -	if (factor >= 100) -		return num; - -	num *= factor; -	do_div(num, 100); -	return num; -} -  static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,  			      struct btrfs_balance_args *bargs)  { @@ -2497,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,  	return 1;  } -static u64 div_factor(u64 num, int factor) -{ -	if (factor == 10) -		return num; -	num *= factor; -	do_div(num, 10); -	return num; -} -  static int __btrfs_balance(struct btrfs_fs_info *fs_info)  {  	struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -2533,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  		size_to_free = div_factor(old_size, 1);  		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);  		if (!device->writeable || -		    device->total_bytes - device->bytes_used > size_to_free) +		    device->total_bytes - device->bytes_used > size_to_free || +		    device->is_tgtdev_for_dev_replace)  			continue;  		ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -2711,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	u64 allowed;  	int mixed = 0;  	int ret; +	u64 num_devices;  	if (btrfs_fs_closing(fs_info) ||  	    atomic_read(&fs_info->balance_pause_req) || @@ -2739,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		}  	} +	num_devices = fs_info->fs_devices->num_devices; +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { +		BUG_ON(num_devices < 1); +		num_devices--; +	} +	btrfs_dev_replace_unlock(&fs_info->dev_replace);  	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; -	if (fs_info->fs_devices->num_devices == 1) +	if (num_devices == 1)  		allowed |= BTRFS_BLOCK_GROUP_DUP; -	else if (fs_info->fs_devices->num_devices < 4) +	else if (num_devices < 4)  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);  	else  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -2804,6 +3075,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		}  	} +	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		int num_tolerated_disk_barrier_failures; +		u64 target = bctl->sys.target; + +		num_tolerated_disk_barrier_failures = +			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); +		if (num_tolerated_disk_barrier_failures > 0 && +		    (target & +		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | +		      BTRFS_AVAIL_ALLOC_BIT_SINGLE))) +			num_tolerated_disk_barrier_failures = 0; +		else if (num_tolerated_disk_barrier_failures > 1 && +			 (target & +			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) +			num_tolerated_disk_barrier_failures = 1; + +		fs_info->num_tolerated_disk_barrier_failures = +			num_tolerated_disk_barrier_failures; +	} +  	ret = insert_balance_item(fs_info->tree_root, bctl);  	if (ret && ret != -EEXIST)  		goto out; @@ -2836,6 +3127,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		__cancel_balance(fs_info);  	} +	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		fs_info->num_tolerated_disk_barrier_failures = +			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); +	} +  	wake_up(&fs_info->balance_wait_q);  	return ret; @@ -2860,6 +3156,7 @@ static int balance_kthread(void *data)  		ret = btrfs_balance(fs_info->balance_ctl, NULL);  	} +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex); @@ -2882,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)  		return 0;  	} +	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));  	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");  	if (IS_ERR(tsk))  		return PTR_ERR(tsk); @@ -3038,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  	u64 old_size = device->total_bytes;  	u64 diff = device->total_bytes - new_size; -	if (new_size >= device->total_bytes) +	if (device->is_tgtdev_for_dev_replace)  		return -EINVAL;  	path = btrfs_alloc_path(); @@ -3193,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)  	return 0;  } +struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +	{ 2, 1, 0, 4, 2, 2 /* raid10 */ }, +	{ 1, 1, 2, 2, 2, 2 /* raid1 */ }, +	{ 1, 2, 1, 1, 1, 2 /* dup */ }, +	{ 1, 1, 0, 2, 1, 1 /* raid0 */ }, +	{ 1, 1, 0, 1, 1, 1 /* single */ }, +}; +  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			       struct btrfs_root *extent_root,  			       struct map_lookup **map_ret, @@ -3222,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	int ndevs;  	int i;  	int j; +	int index;  	BUG_ON(!alloc_profile_is_valid(type, 0));  	if (list_empty(&fs_devices->alloc_list))  		return -ENOSPC; -	sub_stripes = 1; -	dev_stripes = 1; -	devs_increment = 1; -	ncopies = 1; -	devs_max = 0;	/* 0 == as many as possible */ -	devs_min = 1; +	index = __get_raid_index(type); -	/* -	 * define the properties of each RAID type. -	 * FIXME: move this to a global table and use it in all RAID -	 * calculation code -	 */ -	if (type & (BTRFS_BLOCK_GROUP_DUP)) { -		dev_stripes = 2; -		ncopies = 2; -		devs_max = 1; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { -		devs_min = 2; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { -		devs_increment = 2; -		ncopies = 2; -		devs_max = 2; -		devs_min = 2; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { -		sub_stripes = 2; -		devs_increment = 2; -		ncopies = 2; -		devs_min = 4; -	} else { -		devs_max = 1; -	} +	sub_stripes = btrfs_raid_array[index].sub_stripes; +	dev_stripes = btrfs_raid_array[index].dev_stripes; +	devs_max = btrfs_raid_array[index].devs_max; +	devs_min = btrfs_raid_array[index].devs_min; +	devs_increment = btrfs_raid_array[index].devs_increment; +	ncopies = btrfs_raid_array[index].ncopies;  	if (type & BTRFS_BLOCK_GROUP_DATA) {  		max_stripe_size = 1024 * 1024 * 1024; @@ -3305,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		cur = cur->next;  		if (!device->writeable) { -			printk(KERN_ERR +			WARN(1, KERN_ERR  			       "btrfs: read-only device in alloc_list\n"); -			WARN_ON(1);  			continue;  		} -		if (!device->in_fs_metadata) +		if (!device->in_fs_metadata || +		    device->is_tgtdev_for_dev_replace)  			continue;  		if (device->total_bytes > device->bytes_used) @@ -3340,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		devices_info[ndevs].total_avail = total_avail;  		devices_info[ndevs].dev = device;  		++ndevs; +		WARN_ON(ndevs > fs_devices->rw_devices);  	}  	/* @@ -3608,12 +3893,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,  				  &sys_chunk_size, &sys_stripe_size,  				  sys_chunk_offset, alloc_profile); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	ret = btrfs_add_device(trans, fs_info->chunk_root, device); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	/*  	 * Modifying chunk tree needs allocating new blocks from both @@ -3623,19 +3912,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	 */  	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,  				   chunk_size, stripe_size); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	ret = __finish_chunk_alloc(trans, extent_root, sys_map,  				   sys_chunk_offset, sys_chunk_size,  				   sys_stripe_size);  	if (ret) -		goto abort; +		btrfs_abort_transaction(trans, root, ret); -	return 0; +out: -abort: -	btrfs_abort_transaction(trans, root, ret);  	return ret;  } @@ -3694,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)  	}  } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  { +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	struct extent_map *em;  	struct map_lookup *map;  	struct extent_map_tree *em_tree = &map_tree->map_tree; @@ -3715,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)  	else  		ret = 1;  	free_extent_map(em); + +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) +		ret++; +	btrfs_dev_replace_unlock(&fs_info->dev_replace); +  	return ret;  } -static int find_live_mirror(struct map_lookup *map, int first, int num, -			    int optimal) +static int find_live_mirror(struct btrfs_fs_info *fs_info, +			    struct map_lookup *map, int first, int num, +			    int optimal, int dev_replace_is_ongoing)  {  	int i; -	if (map->stripes[optimal].dev->bdev) -		return optimal; -	for (i = first; i < first + num; i++) { -		if (map->stripes[i].dev->bdev) -			return i; +	int tolerance; +	struct btrfs_device *srcdev; + +	if (dev_replace_is_ongoing && +	    fs_info->dev_replace.cont_reading_from_srcdev_mode == +	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) +		srcdev = fs_info->dev_replace.srcdev; +	else +		srcdev = NULL; + +	/* +	 * try to avoid the drive that is the source drive for a +	 * dev-replace procedure, only choose it if no other non-missing +	 * mirror is available +	 */ +	for (tolerance = 0; tolerance < 2; tolerance++) { +		if (map->stripes[optimal].dev->bdev && +		    (tolerance || map->stripes[optimal].dev != srcdev)) +			return optimal; +		for (i = first; i < first + num; i++) { +			if (map->stripes[i].dev->bdev && +			    (tolerance || map->stripes[i].dev != srcdev)) +				return i; +		}  	} +  	/* we couldn't find one that doesn't fail.  Just return something  	 * and the io error handling code will clean up eventually  	 */  	return optimal;  } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  			     u64 logical, u64 *length,  			     struct btrfs_bio **bbio_ret,  			     int mirror_num)  {  	struct extent_map *em;  	struct map_lookup *map; +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	struct extent_map_tree *em_tree = &map_tree->map_tree;  	u64 offset;  	u64 stripe_offset; @@ -3754,13 +4072,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	int num_stripes;  	int max_errors = 0;  	struct btrfs_bio *bbio = NULL; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int dev_replace_is_ongoing = 0; +	int num_alloc_stripes; +	int patch_the_first_stripe_for_dev_replace = 0; +	u64 physical_to_patch_in_first_stripe = 0;  	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, *length);  	read_unlock(&em_tree->lock);  	if (!em) { -		printk(KERN_CRIT "unable to find logical %llu len %llu\n", +		printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",  		       (unsigned long long)logical,  		       (unsigned long long)*length);  		BUG(); @@ -3770,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	map = (struct map_lookup *)em->bdev;  	offset = logical - em->start; -	if (mirror_num > map->num_stripes) -		mirror_num = 0; -  	stripe_nr = offset;  	/*  	 * stripe_nr counts the total number of stripes we have to stride @@ -3799,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	if (!bbio_ret)  		goto out; +	btrfs_dev_replace_lock(dev_replace); +	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); +	if (!dev_replace_is_ongoing) +		btrfs_dev_replace_unlock(dev_replace); + +	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && +	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && +	    dev_replace->tgtdev != NULL) { +		/* +		 * in dev-replace case, for repair case (that's the only +		 * case where the mirror is selected explicitly when +		 * calling btrfs_map_block), blocks left of the left cursor +		 * can also be read from the target drive. +		 * For REQ_GET_READ_MIRRORS, the target drive is added as +		 * the last one to the array of stripes. For READ, it also +		 * needs to be supported using the same mirror number. +		 * If the requested block is not left of the left cursor, +		 * EIO is returned. This can happen because btrfs_num_copies() +		 * returns one more in the dev-replace case. +		 */ +		u64 tmp_length = *length; +		struct btrfs_bio *tmp_bbio = NULL; +		int tmp_num_stripes; +		u64 srcdev_devid = dev_replace->srcdev->devid; +		int index_srcdev = 0; +		int found = 0; +		u64 physical_of_found = 0; + +		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, +			     logical, &tmp_length, &tmp_bbio, 0); +		if (ret) { +			WARN_ON(tmp_bbio != NULL); +			goto out; +		} + +		tmp_num_stripes = tmp_bbio->num_stripes; +		if (mirror_num > tmp_num_stripes) { +			/* +			 * REQ_GET_READ_MIRRORS does not contain this +			 * mirror, that means that the requested area +			 * is not left of the left cursor +			 */ +			ret = -EIO; +			kfree(tmp_bbio); +			goto out; +		} + +		/* +		 * process the rest of the function using the mirror_num +		 * of the source drive. Therefore look it up first. +		 * At the end, patch the device pointer to the one of the +		 * target drive. +		 */ +		for (i = 0; i < tmp_num_stripes; i++) { +			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { +				/* +				 * In case of DUP, in order to keep it +				 * simple, only add the mirror with the +				 * lowest physical address +				 */ +				if (found && +				    physical_of_found <= +				     tmp_bbio->stripes[i].physical) +					continue; +				index_srcdev = i; +				found = 1; +				physical_of_found = +					tmp_bbio->stripes[i].physical; +			} +		} + +		if (found) { +			mirror_num = index_srcdev + 1; +			patch_the_first_stripe_for_dev_replace = 1; +			physical_to_patch_in_first_stripe = physical_of_found; +		} else { +			WARN_ON(1); +			ret = -EIO; +			kfree(tmp_bbio); +			goto out; +		} + +		kfree(tmp_bbio); +	} else if (mirror_num > map->num_stripes) { +		mirror_num = 0; +	} +  	num_stripes = 1;  	stripe_index = 0;  	stripe_nr_orig = stripe_nr; @@ -3813,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  					    stripe_nr_end - stripe_nr_orig);  		stripe_index = do_div(stripe_nr, map->num_stripes);  	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { -		if (rw & (REQ_WRITE | REQ_DISCARD)) +		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))  			num_stripes = map->num_stripes;  		else if (mirror_num)  			stripe_index = mirror_num - 1;  		else { -			stripe_index = find_live_mirror(map, 0, +			stripe_index = find_live_mirror(fs_info, map, 0,  					    map->num_stripes, -					    current->pid % map->num_stripes); +					    current->pid % map->num_stripes, +					    dev_replace_is_ongoing);  			mirror_num = stripe_index + 1;  		}  	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) { -		if (rw & (REQ_WRITE | REQ_DISCARD)) { +		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {  			num_stripes = map->num_stripes;  		} else if (mirror_num) {  			stripe_index = mirror_num - 1; @@ -3839,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		stripe_index = do_div(stripe_nr, factor);  		stripe_index *= map->sub_stripes; -		if (rw & REQ_WRITE) +		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))  			num_stripes = map->sub_stripes;  		else if (rw & REQ_DISCARD)  			num_stripes = min_t(u64, map->sub_stripes * @@ -3849,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  			stripe_index += mirror_num - 1;  		else {  			int old_stripe_index = stripe_index; -			stripe_index = find_live_mirror(map, stripe_index, +			stripe_index = find_live_mirror(fs_info, map, +					      stripe_index,  					      map->sub_stripes, stripe_index + -					      current->pid % map->sub_stripes); +					      current->pid % map->sub_stripes, +					      dev_replace_is_ongoing);  			mirror_num = stripe_index - old_stripe_index + 1;  		}  	} else { @@ -3865,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	}  	BUG_ON(stripe_index >= map->num_stripes); -	bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); +	num_alloc_stripes = num_stripes; +	if (dev_replace_is_ongoing) { +		if (rw & (REQ_WRITE | REQ_DISCARD)) +			num_alloc_stripes <<= 1; +		if (rw & REQ_GET_READ_MIRRORS) +			num_alloc_stripes++; +	} +	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);  	if (!bbio) {  		ret = -ENOMEM;  		goto out; @@ -3952,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		}  	} -	if (rw & REQ_WRITE) { +	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {  		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |  				 BTRFS_BLOCK_GROUP_RAID10 |  				 BTRFS_BLOCK_GROUP_DUP)) { @@ -3960,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		}  	} +	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && +	    dev_replace->tgtdev != NULL) { +		int index_where_to_add; +		u64 srcdev_devid = dev_replace->srcdev->devid; + +		/* +		 * duplicate the write operations while the dev replace +		 * procedure is running. Since the copying of the old disk +		 * to the new disk takes place at run time while the +		 * filesystem is mounted writable, the regular write +		 * operations to the old disk have to be duplicated to go +		 * to the new disk as well. +		 * Note that device->missing is handled by the caller, and +		 * that the write to the old disk is already set up in the +		 * stripes array. +		 */ +		index_where_to_add = num_stripes; +		for (i = 0; i < num_stripes; i++) { +			if (bbio->stripes[i].dev->devid == srcdev_devid) { +				/* write to new disk, too */ +				struct btrfs_bio_stripe *new = +					bbio->stripes + index_where_to_add; +				struct btrfs_bio_stripe *old = +					bbio->stripes + i; + +				new->physical = old->physical; +				new->length = old->length; +				new->dev = dev_replace->tgtdev; +				index_where_to_add++; +				max_errors++; +			} +		} +		num_stripes = index_where_to_add; +	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && +		   dev_replace->tgtdev != NULL) { +		u64 srcdev_devid = dev_replace->srcdev->devid; +		int index_srcdev = 0; +		int found = 0; +		u64 physical_of_found = 0; + +		/* +		 * During the dev-replace procedure, the target drive can +		 * also be used to read data in case it is needed to repair +		 * a corrupt block elsewhere. This is possible if the +		 * requested area is left of the left cursor. In this area, +		 * the target drive is a full copy of the source drive. +		 */ +		for (i = 0; i < num_stripes; i++) { +			if (bbio->stripes[i].dev->devid == srcdev_devid) { +				/* +				 * In case of DUP, in order to keep it +				 * simple, only add the mirror with the +				 * lowest physical address +				 */ +				if (found && +				    physical_of_found <= +				     bbio->stripes[i].physical) +					continue; +				index_srcdev = i; +				found = 1; +				physical_of_found = bbio->stripes[i].physical; +			} +		} +		if (found) { +			u64 length = map->stripe_len; + +			if (physical_of_found + length <= +			    dev_replace->cursor_left) { +				struct btrfs_bio_stripe *tgtdev_stripe = +					bbio->stripes + num_stripes; + +				tgtdev_stripe->physical = physical_of_found; +				tgtdev_stripe->length = +					bbio->stripes[index_srcdev].length; +				tgtdev_stripe->dev = dev_replace->tgtdev; + +				num_stripes++; +			} +		} +	} +  	*bbio_ret = bbio;  	bbio->num_stripes = num_stripes;  	bbio->max_errors = max_errors;  	bbio->mirror_num = mirror_num; + +	/* +	 * this is the case that REQ_READ && dev_replace_is_ongoing && +	 * mirror_num == num_stripes + 1 && dev_replace target drive is +	 * available as a mirror +	 */ +	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { +		WARN_ON(num_stripes > 1); +		bbio->stripes[0].dev = dev_replace->tgtdev; +		bbio->stripes[0].physical = physical_to_patch_in_first_stripe; +		bbio->mirror_num = map->num_stripes + 1; +	}  out: +	if (dev_replace_is_ongoing) +		btrfs_dev_replace_unlock(dev_replace);  	free_extent_map(em);  	return ret;  } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		      u64 logical, u64 *length,  		      struct btrfs_bio **bbio_ret, int mirror_num)  { -	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, +	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,  				 mirror_num);  } @@ -4192,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,  				   &device->work);  } +static int bio_size_ok(struct block_device *bdev, struct bio *bio, +		       sector_t sector) +{ +	struct bio_vec *prev; +	struct request_queue *q = bdev_get_queue(bdev); +	unsigned short max_sectors = queue_max_sectors(q); +	struct bvec_merge_data bvm = { +		.bi_bdev = bdev, +		.bi_sector = sector, +		.bi_rw = bio->bi_rw, +	}; + +	if (bio->bi_vcnt == 0) { +		WARN_ON(1); +		return 1; +	} + +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; +	if ((bio->bi_size >> 9) > max_sectors) +		return 0; + +	if (!q->merge_bvec_fn) +		return 1; + +	bvm.bi_size = bio->bi_size - prev->bv_len; +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) +		return 0; +	return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, +			      struct bio *bio, u64 physical, int dev_nr, +			      int rw, int async) +{ +	struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + +	bio->bi_private = bbio; +	bio->bi_private = merge_stripe_index_into_bio_private( +			bio->bi_private, (unsigned int)dev_nr); +	bio->bi_end_io = btrfs_end_bio; +	bio->bi_sector = physical >> 9; +#ifdef DEBUG +	{ +		struct rcu_string *name; + +		rcu_read_lock(); +		name = rcu_dereference(dev->name); +		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " +			 "(%s id %llu), size=%u\n", rw, +			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, +			 name->str, dev->devid, bio->bi_size); +		rcu_read_unlock(); +	} +#endif +	bio->bi_bdev = dev->bdev; +	if (async) +		schedule_bio(root, dev, rw, bio); +	else +		btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, +			      struct bio *first_bio, struct btrfs_device *dev, +			      int dev_nr, int rw, int async) +{ +	struct bio_vec *bvec = first_bio->bi_io_vec; +	struct bio *bio; +	int nr_vecs = bio_get_nr_vecs(dev->bdev); +	u64 physical = bbio->stripes[dev_nr].physical; + +again: +	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); +	if (!bio) +		return -ENOMEM; + +	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { +		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, +				 bvec->bv_offset) < bvec->bv_len) { +			u64 len = bio->bi_size; + +			atomic_inc(&bbio->stripes_pending); +			submit_stripe_bio(root, bbio, bio, physical, dev_nr, +					  rw, async); +			physical += len; +			goto again; +		} +		bvec++; +	} + +	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); +	return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ +	atomic_inc(&bbio->error); +	if (atomic_dec_and_test(&bbio->stripes_pending)) { +		bio->bi_private = bbio->private; +		bio->bi_end_io = bbio->end_io; +		bio->bi_bdev = (struct block_device *) +			(unsigned long)bbio->mirror_num; +		bio->bi_sector = logical >> 9; +		kfree(bbio); +		bio_endio(bio, -EIO); +	} +} +  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		  int mirror_num, int async_submit)  { -	struct btrfs_mapping_tree *map_tree;  	struct btrfs_device *dev;  	struct bio *first_bio = bio;  	u64 logical = (u64)bio->bi_sector << 9; @@ -4207,17 +4825,16 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	struct btrfs_bio *bbio = NULL;  	length = bio->bi_size; -	map_tree = &root->fs_info->mapping_tree;  	map_length = length; -	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, +	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,  			      mirror_num); -	if (ret) /* -ENOMEM */ +	if (ret)  		return ret;  	total_devs = bbio->num_stripes;  	if (map_length < length) { -		printk(KERN_CRIT "mapping failed logical %llu bio len %llu " +		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "  		       "len %llu\n", (unsigned long long)logical,  		       (unsigned long long)length,  		       (unsigned long long)map_length); @@ -4230,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	atomic_set(&bbio->stripes_pending, bbio->num_stripes);  	while (dev_nr < total_devs) { +		dev = bbio->stripes[dev_nr].dev; +		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { +			bbio_error(bbio, first_bio, logical); +			dev_nr++; +			continue; +		} + +		/* +		 * Check and see if we're ok with this bio based on it's size +		 * and offset with the given device. +		 */ +		if (!bio_size_ok(dev->bdev, first_bio, +				 bbio->stripes[dev_nr].physical >> 9)) { +			ret = breakup_stripe_bio(root, bbio, first_bio, dev, +						 dev_nr, rw, async_submit); +			BUG_ON(ret); +			dev_nr++; +			continue; +		} +  		if (dev_nr < total_devs - 1) {  			bio = bio_clone(first_bio, GFP_NOFS);  			BUG_ON(!bio); /* -ENOMEM */  		} else {  			bio = first_bio;  		} -		bio->bi_private = bbio; -		bio->bi_private = merge_stripe_index_into_bio_private( -				bio->bi_private, (unsigned int)dev_nr); -		bio->bi_end_io = btrfs_end_bio; -		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; -		dev = bbio->stripes[dev_nr].dev; -		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { -#ifdef DEBUG -			struct rcu_string *name; -			rcu_read_lock(); -			name = rcu_dereference(dev->name); -			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " -				 "(%s id %llu), size=%u\n", rw, -				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, -				 name->str, dev->devid, bio->bi_size); -			rcu_read_unlock(); -#endif -			bio->bi_bdev = dev->bdev; -			if (async_submit) -				schedule_bio(root, dev, rw, bio); -			else -				btrfsic_submit_bio(rw, bio); -		} else { -			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; -			bio->bi_sector = logical >> 9; -			bio_endio(bio, -EIO); -		} +		submit_stripe_bio(root, bbio, bio, +				  bbio->stripes[dev_nr].physical, dev_nr, rw, +				  async_submit);  		dev_nr++;  	}  	return 0;  } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,  				       u8 *uuid, u8 *fsid)  {  	struct btrfs_device *device;  	struct btrfs_fs_devices *cur_devices; -	cur_devices = root->fs_info->fs_devices; +	cur_devices = fs_info->fs_devices;  	while (cur_devices) {  		if (!fsid ||  		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -4356,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	em->bdev = (struct block_device *)map;  	em->start = logical;  	em->len = length; +	em->orig_start = 0;  	em->block_start = 0;  	em->block_len = em->len; @@ -4373,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  		read_extent_buffer(leaf, uuid, (unsigned long)  				   btrfs_stripe_dev_uuid_nr(chunk, i),  				   BTRFS_UUID_SIZE); -		map->stripes[i].dev = btrfs_find_device(root, devid, uuid, -							NULL); +		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, +							uuid, NULL);  		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {  			kfree(map);  			free_extent_map(em); @@ -4415,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,  	device->io_align = btrfs_device_io_align(leaf, dev_item);  	device->io_width = btrfs_device_io_width(leaf, dev_item);  	device->sector_size = btrfs_device_sector_size(leaf, dev_item); +	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); +	device->is_tgtdev_for_dev_replace = 0;  	ptr = (unsigned long)btrfs_device_uuid(dev_item);  	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -4492,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,  			return ret;  	} -	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); +	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);  	if (!device || !device->bdev) {  		if (!btrfs_test_opt(root, DEGRADED))  			return -EIO; @@ -4525,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,  	fill_device_from_item(leaf, dev_item, device);  	device->dev_root = root->fs_info->dev_root;  	device->in_fs_metadata = 1; -	if (device->writeable) { +	if (device->writeable && !device->is_tgtdev_for_dev_replace) {  		device->fs_devices->total_rw_bytes += device->total_bytes;  		spin_lock(&root->fs_info->free_chunk_lock);  		root->fs_info->free_chunk_space += device->total_bytes - @@ -4884,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,  	int i;  	mutex_lock(&fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, stats->devid, NULL, NULL); +	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);  	mutex_unlock(&fs_devices->device_list_mutex);  	if (!dev) { @@ -4912,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,  		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;  	return 0;  } + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ +	struct buffer_head *bh; +	struct btrfs_super_block *disk_super; + +	bh = btrfs_read_dev_super(device->bdev); +	if (!bh) +		return -EINVAL; +	disk_super = (struct btrfs_super_block *)bh->b_data; + +	memset(&disk_super->magic, 0, sizeof(disk_super->magic)); +	set_buffer_dirty(bh); +	sync_dirty_buffer(bh); +	brelse(bh); + +	return 0; +}  |