-rw-r--r--   fs/btrfs/Kconfig              |    3
-rw-r--r--   fs/btrfs/Makefile             |    2
-rw-r--r--   fs/btrfs/compression.c        |    4
-rw-r--r--   fs/btrfs/ctree.h              |   44
-rw-r--r--   fs/btrfs/delayed-ref.h        |    9
-rw-r--r--   fs/btrfs/disk-io.c            |   62
-rw-r--r--   fs/btrfs/disk-io.h            |    7
-rw-r--r--   fs/btrfs/extent-tree.c        |  156
-rw-r--r--   fs/btrfs/extent_io.c          |   40
-rw-r--r--   fs/btrfs/extent_io.h          |    2
-rw-r--r--   fs/btrfs/free-space-cache.c   |   50
-rw-r--r--   fs/btrfs/inode.c              |   18
-rw-r--r--   fs/btrfs/raid56.c             | 2080
-rw-r--r--   fs/btrfs/raid56.h             |   51
-rw-r--r--   fs/btrfs/scrub.c              |    8
-rw-r--r--   fs/btrfs/transaction.c        |    9
-rw-r--r--   fs/btrfs/volumes.c            |  380
-rw-r--r--   fs/btrfs/volumes.h            |    9
18 files changed, 2814 insertions, 120 deletions
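The bulk of the patch is the new fs/btrfs/raid56.c, whose finish_rmw() recomputes parity for a full stripe: the P stripe is the byte-wise XOR of the data stripes (done through xor_blocks()), and the RAID6 Q stripe is produced by raid6_call.gen_syndrome() from lib/raid6 (hence the new RAID6_PQ and XOR_BLOCKS selects in Kconfig). As a hedged, standalone sketch of just the P-parity math — not kernel code and not part of the patch, with an arbitrary stripe count and stripe length chosen only for illustration — the toy user-space C program below generates P for a few made-up data stripes and then rebuilds one "lost" stripe by XOR-ing P with the survivors.

/*
 * Standalone illustration (not kernel code) of the P-parity math that
 * raid56.c delegates to xor_blocks(): P is the XOR of all data stripes,
 * and any single missing data stripe can be rebuilt by XOR-ing P with
 * the surviving data stripes.  Stripe count/length are arbitrary here.
 */
#include <stdio.h>
#include <string.h>

#define NR_DATA    3        /* data stripes in the full stripe */
#define STRIPE_LEN 16       /* bytes per stripe, tiny for the example */

static void xor_into(unsigned char *dest, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	unsigned char data[NR_DATA][STRIPE_LEN];
	unsigned char parity[STRIPE_LEN];
	unsigned char rebuilt[STRIPE_LEN];
	int i;

	/* fill the data stripes with something recognizable */
	for (i = 0; i < NR_DATA; i++)
		memset(data[i], 'A' + i, STRIPE_LEN);

	/* generate P: start from stripe 0 and XOR in the rest */
	memcpy(parity, data[0], STRIPE_LEN);
	for (i = 1; i < NR_DATA; i++)
		xor_into(parity, data[i], STRIPE_LEN);

	/* pretend stripe 1 was lost: rebuild it from P and the others */
	memcpy(rebuilt, parity, STRIPE_LEN);
	for (i = 0; i < NR_DATA; i++) {
		if (i == 1)
			continue;
		xor_into(rebuilt, data[i], STRIPE_LEN);
	}

	printf("stripe 1 %s\n",
	       memcmp(rebuilt, data[1], STRIPE_LEN) == 0 ?
	       "reconstructed correctly" : "MISMATCH");
	return 0;
}

This mirrors the RAID5 branch of finish_rmw() below, which memcpy()s the first data page into the parity page and then run_xor()s the remaining data pages into it one PAGE_CACHE_SIZE block at a time; the RAID6 Q stripe additionally needs the Galois-field syndrome math, which the patch leaves entirely to the existing lib/raid6 code.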
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index d33f01c08b6..5f583c8a36d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -6,6 +6,9 @@ config BTRFS_FS  	select ZLIB_DEFLATE  	select LZO_COMPRESS  	select LZO_DECOMPRESS +	select RAID6_PQ +	select XOR_BLOCKS +  	help  	  Btrfs is a new filesystem with extents, writable snapshotting,  	  support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7df3e0f0ee5..3932224f99e 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \  	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \  	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ -	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o +	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 94ab2f80e7e..15b94089abc 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,  		page = compressed_pages[pg_index];  		page->mapping = inode->i_mapping;  		if (bio->bi_size) -			ret = io_tree->ops->merge_bio_hook(page, 0, +			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,  							   PAGE_CACHE_SIZE,  							   bio, 0);  		else @@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  		page->index = em_start >> PAGE_CACHE_SHIFT;  		if (comp_bio->bi_size) -			ret = tree->ops->merge_bio_hook(page, 0, +			ret = tree->ops->merge_bio_hook(READ, page, 0,  							PAGE_CACHE_SIZE,  							comp_bio, 0);  		else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1679051f4d3..3dcedfe4f75 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -506,6 +506,7 @@ struct btrfs_super_block {  #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)  #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)  #define BTRFS_FEATURE_COMPAT_SUPP		0ULL  #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL @@ -515,6 +516,7 @@ struct btrfs_super_block {  	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\  	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\  	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\ +	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\  	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)  /* @@ -956,6 +958,8 @@ struct btrfs_dev_replace_item {  #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)  #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)  #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7) +#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)  #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE  enum btrfs_raid_types { @@ -964,6 +968,8 @@ enum btrfs_raid_types {  	BTRFS_RAID_DUP,  	BTRFS_RAID_RAID0,  	BTRFS_RAID_SINGLE, +	BTRFS_RAID_RAID5, +	BTRFS_RAID_RAID6,  	BTRFS_NR_RAID_TYPES  }; @@ -973,6 +979,8 @@ enum btrfs_raid_types {  #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \  					 BTRFS_BLOCK_GROUP_RAID1 |   \ +					 BTRFS_BLOCK_GROUP_RAID5 |   \ +					 BTRFS_BLOCK_GROUP_RAID6 |   \  					 BTRFS_BLOCK_GROUP_DUP |     \  					 BTRFS_BLOCK_GROUP_RAID10)  /* @@ -1197,6 +1205,10 @@ struct btrfs_block_group_cache {  	u64 flags;  	u64 sectorsize;  	u64 cache_generation; + +	/* for raid56, 
this is a full stripe, without parity */ +	unsigned long full_stripe_len; +  	unsigned int ro:1;  	unsigned int dirty:1;  	unsigned int iref:1; @@ -1242,6 +1254,23 @@ enum btrfs_orphan_cleanup_state {  	ORPHAN_CLEANUP_DONE	= 2,  }; +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { +	struct list_head hash_list; +	wait_queue_head_t wait; +	spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { +	struct list_head stripe_cache; +	spinlock_t cache_lock; +	int cache_size; +	struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 +  /* fs_info */  struct reloc_control;  struct btrfs_device; @@ -1341,6 +1370,13 @@ struct btrfs_fs_info {  	struct mutex cleaner_mutex;  	struct mutex chunk_mutex;  	struct mutex volume_mutex; + +	/* this is used during read/modify/write to make sure +	 * no two ios are trying to mod the same stripe at the same +	 * time +	 */ +	struct btrfs_stripe_hash_table *stripe_hash_table; +  	/*  	 * this protects the ordered operations list only while we are  	 * processing all of the entries on it.  This way we make @@ -1423,6 +1459,8 @@ struct btrfs_fs_info {  	struct btrfs_workers flush_workers;  	struct btrfs_workers endio_workers;  	struct btrfs_workers endio_meta_workers; +	struct btrfs_workers endio_raid56_workers; +	struct btrfs_workers rmw_workers;  	struct btrfs_workers endio_meta_write_workers;  	struct btrfs_workers endio_write_workers;  	struct btrfs_workers endio_freespace_worker; @@ -3490,9 +3528,9 @@ int btrfs_writepages(struct address_space *mapping,  		     struct writeback_control *wbc);  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  			     struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, -			 size_t size, struct bio *bio, unsigned long bio_flags); - +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +			 size_t size, struct bio *bio, +			 unsigned long bio_flags);  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);  int btrfs_readpage(struct file *file, struct page *page);  void btrfs_evict_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 7939149f8f2..f75fcaf79ae 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {  	unsigned long num_heads_ready;  	/* +	 * bumped when someone is making progress on the delayed +	 * refs, so that other procs know they are just adding to +	 * contention intead of helping +	 */ +	atomic_t procs_running_refs; +	atomic_t ref_seq; +	wait_queue_head_t wait; + +	/*  	 * set when the tree is flushing before a transaction commit,  	 * used by the throttling code to decide if new updates need  	 * to be run right away diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 779b401cd95..eb7c1430852 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@  #include "check-integrity.h"  #include "rcu-string.h"  #include "dev-replace.h" +#include "raid56.h"  #ifdef CONFIG_X86  #include <asm/cpufeature.h> @@ -640,8 +641,15 @@ err:  		btree_readahead_hook(root, eb, eb->start, ret);  	} -	if (ret) +	if (ret) { +		/* +		 * our io error hook is going to dec the io pages +		 * again, we have to make sure it has something +		 * to decrement +		 */ +		atomic_inc(&eb->io_pages);  		clear_extent_buffer_uptodate(eb); +	}  	free_extent_buffer(eb);  out:  	
return ret; @@ -655,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)  	eb = (struct extent_buffer *)page->private;  	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);  	eb->read_mirror = failed_mirror; +	atomic_dec(&eb->io_pages);  	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))  		btree_readahead_hook(root, eb, eb->start, -EIO);  	return -EIO;	/* we fixed nothing */ @@ -671,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)  	end_io_wq->work.flags = 0;  	if (bio->bi_rw & REQ_WRITE) { -		if (end_io_wq->metadata == 1) +		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)  			btrfs_queue_worker(&fs_info->endio_meta_write_workers,  					   &end_io_wq->work); -		else if (end_io_wq->metadata == 2) +		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)  			btrfs_queue_worker(&fs_info->endio_freespace_worker,  					   &end_io_wq->work); +		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) +			btrfs_queue_worker(&fs_info->endio_raid56_workers, +					   &end_io_wq->work);  		else  			btrfs_queue_worker(&fs_info->endio_write_workers,  					   &end_io_wq->work);  	} else { -		if (end_io_wq->metadata) +		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) +			btrfs_queue_worker(&fs_info->endio_raid56_workers, +					   &end_io_wq->work); +		else if (end_io_wq->metadata)  			btrfs_queue_worker(&fs_info->endio_meta_workers,  					   &end_io_wq->work);  		else @@ -696,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)   * 0 - if data   * 1 - if normal metadta   * 2 - if writing to the free space cache area + * 3 - raid parity work   */  int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,  			int metadata) @@ -2179,6 +2195,12 @@ int open_ctree(struct super_block *sb,  	init_waitqueue_head(&fs_info->transaction_blocked_wait);  	init_waitqueue_head(&fs_info->async_submit_wait); +	ret = btrfs_alloc_stripe_hash_table(fs_info); +	if (ret) { +		err = -ENOMEM; +		goto fail_alloc; +	} +  	__setup_root(4096, 4096, 4096, 4096, tree_root,  		     fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2349,6 +2371,12 @@ int open_ctree(struct super_block *sb,  	btrfs_init_workers(&fs_info->endio_meta_write_workers,  			   "endio-meta-write", fs_info->thread_pool_size,  			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->endio_raid56_workers, +			   "endio-raid56", fs_info->thread_pool_size, +			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->rmw_workers, +			   "rmw", fs_info->thread_pool_size, +			   &fs_info->generic_worker);  	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",  			   fs_info->thread_pool_size,  			   &fs_info->generic_worker); @@ -2367,6 +2395,8 @@ int open_ctree(struct super_block *sb,  	 */  	fs_info->endio_workers.idle_thresh = 4;  	fs_info->endio_meta_workers.idle_thresh = 4; +	fs_info->endio_raid56_workers.idle_thresh = 4; +	fs_info->rmw_workers.idle_thresh = 2;  	fs_info->endio_write_workers.idle_thresh = 2;  	fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2383,6 +2413,8 @@ int open_ctree(struct super_block *sb,  	ret |= btrfs_start_workers(&fs_info->fixup_workers);  	ret |= btrfs_start_workers(&fs_info->endio_workers);  	ret |= btrfs_start_workers(&fs_info->endio_meta_workers); +	ret |= btrfs_start_workers(&fs_info->rmw_workers); +	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);  	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);  	ret |= btrfs_start_workers(&fs_info->endio_write_workers);  	ret |= 
btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2726,6 +2758,8 @@ fail_sb_buffer:  	btrfs_stop_workers(&fs_info->workers);  	btrfs_stop_workers(&fs_info->endio_workers);  	btrfs_stop_workers(&fs_info->endio_meta_workers); +	btrfs_stop_workers(&fs_info->endio_raid56_workers); +	btrfs_stop_workers(&fs_info->rmw_workers);  	btrfs_stop_workers(&fs_info->endio_meta_write_workers);  	btrfs_stop_workers(&fs_info->endio_write_workers);  	btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2747,6 +2781,7 @@ fail_bdi:  fail_srcu:  	cleanup_srcu_struct(&fs_info->subvol_srcu);  fail: +	btrfs_free_stripe_hash_table(fs_info);  	btrfs_close_devices(fs_info->fs_devices);  	return err; @@ -3094,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(  				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)  				      == 0)))  					num_tolerated_disk_barrier_failures = 0; -				else if (num_tolerated_disk_barrier_failures > 1 -					 && -					 (flags & (BTRFS_BLOCK_GROUP_RAID1 | -						   BTRFS_BLOCK_GROUP_RAID10))) -					num_tolerated_disk_barrier_failures = 1; +				else if (num_tolerated_disk_barrier_failures > 1) { +					if (flags & (BTRFS_BLOCK_GROUP_RAID1 | +					    BTRFS_BLOCK_GROUP_RAID5 | +					    BTRFS_BLOCK_GROUP_RAID10)) { +						num_tolerated_disk_barrier_failures = 1; +					} else if (flags & +						   BTRFS_BLOCK_GROUP_RAID5) { +						num_tolerated_disk_barrier_failures = 2; +					} +				}  			}  		}  		up_read(&sinfo->groups_sem); @@ -3402,6 +3442,8 @@ int close_ctree(struct btrfs_root *root)  	btrfs_stop_workers(&fs_info->workers);  	btrfs_stop_workers(&fs_info->endio_workers);  	btrfs_stop_workers(&fs_info->endio_meta_workers); +	btrfs_stop_workers(&fs_info->endio_raid56_workers); +	btrfs_stop_workers(&fs_info->rmw_workers);  	btrfs_stop_workers(&fs_info->endio_meta_write_workers);  	btrfs_stop_workers(&fs_info->endio_write_workers);  	btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3424,6 +3466,8 @@ int close_ctree(struct btrfs_root *root)  	bdi_destroy(&fs_info->bdi);  	cleanup_srcu_struct(&fs_info->subvol_srcu); +	btrfs_free_stripe_hash_table(fs_info); +  	return 0;  } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e..034d7dc552b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@  #define BTRFS_SUPER_MIRROR_MAX	 3  #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { +	BTRFS_WQ_ENDIO_DATA = 0, +	BTRFS_WQ_ENDIO_METADATA = 1, +	BTRFS_WQ_ENDIO_FREE_SPACE = 2, +	BTRFS_WQ_ENDIO_RAID56 = 3, +}; +  static inline u64 btrfs_sb_offset(int mirror)  {  	u64 start = 16 * 1024; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5cd44e23959..b3ecca447dd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@  #include "print-tree.h"  #include "transaction.h"  #include "volumes.h" +#include "raid56.h"  #include "locking.h"  #include "free-space-cache.h"  #include "math.h" @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  		*actual_bytes = discarded_bytes; +	if (ret == -EOPNOTSUPP) +		ret = 0;  	return ret;  } @@ -2440,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,  	return ret;  } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, +		      int count) +{ +	int val = atomic_read(&delayed_refs->ref_seq); + +	if (val < seq || val >= seq + count) +		return 1; +	return 0; +} +  /*   * this starts processing the delayed reference count updates and   * extent insertions we have queued up so 
far.  count can be @@ -2474,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	delayed_refs = &trans->transaction->delayed_refs;  	INIT_LIST_HEAD(&cluster); +	if (count == 0) { +		count = delayed_refs->num_entries * 2; +		run_most = 1; +	} + +	if (!run_all && !run_most) { +		int old; +		int seq = atomic_read(&delayed_refs->ref_seq); + +progress: +		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); +		if (old) { +			DEFINE_WAIT(__wait); +			if (delayed_refs->num_entries < 16348) +				return 0; + +			prepare_to_wait(&delayed_refs->wait, &__wait, +					TASK_UNINTERRUPTIBLE); + +			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); +			if (old) { +				schedule(); +				finish_wait(&delayed_refs->wait, &__wait); + +				if (!refs_newer(delayed_refs, seq, 256)) +					goto progress; +				else +					return 0; +			} else { +				finish_wait(&delayed_refs->wait, &__wait); +				goto again; +			} +		} + +	} else { +		atomic_inc(&delayed_refs->procs_running_refs); +	} +  again:  	loops = 0;  	spin_lock(&delayed_refs->lock); @@ -2482,10 +2533,6 @@ again:  	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);  #endif -	if (count == 0) { -		count = delayed_refs->num_entries * 2; -		run_most = 1; -	}  	while (1) {  		if (!(run_all || run_most) &&  		    delayed_refs->num_heads_ready < 64) @@ -2508,9 +2555,12 @@ again:  			btrfs_release_ref_cluster(&cluster);  			spin_unlock(&delayed_refs->lock);  			btrfs_abort_transaction(trans, root, ret); +			atomic_dec(&delayed_refs->procs_running_refs);  			return ret;  		} +		atomic_add(ret, &delayed_refs->ref_seq); +  		count -= min_t(unsigned long, ret, count);  		if (count == 0) @@ -2579,6 +2629,11 @@ again:  		goto again;  	}  out: +	atomic_dec(&delayed_refs->procs_running_refs); +	smp_mb(); +	if (waitqueue_active(&delayed_refs->wait)) +		wake_up(&delayed_refs->wait); +  	spin_unlock(&delayed_refs->lock);  	assert_qgroups_uptodate(trans);  	return 0; @@ -3284,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  	u64 num_devices = root->fs_info->fs_devices->rw_devices +  		root->fs_info->fs_devices->missing_devices;  	u64 target; +	u64 tmp;  	/*  	 * see if restripe for this chunk_type is in progress, if so @@ -3300,30 +3356,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  	}  	spin_unlock(&root->fs_info->balance_lock); +	/* First, mask out the RAID levels which aren't possible */  	if (num_devices == 1) -		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); +		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | +			   BTRFS_BLOCK_GROUP_RAID5); +	if (num_devices < 3) +		flags &= ~BTRFS_BLOCK_GROUP_RAID6;  	if (num_devices < 4)  		flags &= ~BTRFS_BLOCK_GROUP_RAID10; -	if ((flags & BTRFS_BLOCK_GROUP_DUP) && -	    (flags & (BTRFS_BLOCK_GROUP_RAID1 | -		      BTRFS_BLOCK_GROUP_RAID10))) { -		flags &= ~BTRFS_BLOCK_GROUP_DUP; -	} +	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | +		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); +	flags &= ~tmp; -	if ((flags & BTRFS_BLOCK_GROUP_RAID1) && -	    (flags & BTRFS_BLOCK_GROUP_RAID10)) { -		flags &= ~BTRFS_BLOCK_GROUP_RAID1; -	} - -	if ((flags & BTRFS_BLOCK_GROUP_RAID0) && -	    ((flags & BTRFS_BLOCK_GROUP_RAID1) | -	     (flags & BTRFS_BLOCK_GROUP_RAID10) | -	     (flags & BTRFS_BLOCK_GROUP_DUP))) { -		flags &= ~BTRFS_BLOCK_GROUP_RAID0; -	} +	if (tmp & BTRFS_BLOCK_GROUP_RAID6) +		tmp = 
BTRFS_BLOCK_GROUP_RAID6; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID5) +		tmp = BTRFS_BLOCK_GROUP_RAID5; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID10) +		tmp = BTRFS_BLOCK_GROUP_RAID10; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID1) +		tmp = BTRFS_BLOCK_GROUP_RAID1; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID0) +		tmp = BTRFS_BLOCK_GROUP_RAID0; -	return extended_to_chunk(flags); +	return extended_to_chunk(flags | tmp);  }  static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) @@ -3347,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  {  	u64 flags; +	u64 ret;  	if (data)  		flags = BTRFS_BLOCK_GROUP_DATA; @@ -3355,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  	else  		flags = BTRFS_BLOCK_GROUP_METADATA; -	return get_alloc_profile(root, flags); +	ret = get_alloc_profile(root, flags); +	return ret;  }  /* @@ -3530,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)  {  	u64 num_dev; -	if (type & BTRFS_BLOCK_GROUP_RAID10 || -	    type & BTRFS_BLOCK_GROUP_RAID0) +	if (type & (BTRFS_BLOCK_GROUP_RAID10 | +		    BTRFS_BLOCK_GROUP_RAID0 | +		    BTRFS_BLOCK_GROUP_RAID5 | +		    BTRFS_BLOCK_GROUP_RAID6))  		num_dev = root->fs_info->fs_devices->rw_devices;  	else if (type & BTRFS_BLOCK_GROUP_RAID1)  		num_dev = 2; @@ -3706,7 +3768,9 @@ static int can_overcommit(struct btrfs_root *root,  	/*  	 * If we have dup, raid1 or raid10 then only half of the free -	 * space is actually useable. +	 * space is actually useable.  For raid56, the space info used +	 * doesn't include the parity drive, so we don't have to +	 * change the math  	 */  	if (profile & (BTRFS_BLOCK_GROUP_DUP |  		       BTRFS_BLOCK_GROUP_RAID1 | @@ -5539,10 +5603,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	return ret;  } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, +			struct btrfs_block_group_cache *cache, +			u64 val, u64 num_bytes)  { -	u64 mask = ((u64)root->stripesize - 1); -	u64 ret = (val + mask) & ~mask; +	u64 mask; +	u64 ret; +	mask = ((u64)root->stripesize - 1); +	ret = (val + mask) & ~mask;  	return ret;  } @@ -5599,8 +5667,12 @@ int __get_raid_index(u64 flags)  		return BTRFS_RAID_DUP;  	else if (flags & BTRFS_BLOCK_GROUP_RAID0)  		return BTRFS_RAID_RAID0; -	else -		return BTRFS_RAID_SINGLE; +	else if (flags & BTRFS_BLOCK_GROUP_RAID5) +		return BTRFS_RAID_RAID5; +	else if (flags & BTRFS_BLOCK_GROUP_RAID6) +		return BTRFS_RAID_RAID6; + +	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */  }  static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -5743,6 +5815,8 @@ search:  		if (!block_group_bits(block_group, data)) {  		    u64 extra = BTRFS_BLOCK_GROUP_DUP |  				BTRFS_BLOCK_GROUP_RAID1 | +				BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6 |  				BTRFS_BLOCK_GROUP_RAID10;  			/* @@ -5771,6 +5845,7 @@ have_block_group:  		 * lets look there  		 */  		if (last_ptr) { +			unsigned long aligned_cluster;  			/*  			 * the refill lock keeps out other  			 * people trying to start a new cluster @@ -5837,11 +5912,15 @@ refill_cluster:  				goto unclustered_alloc;  			} +			aligned_cluster = max_t(unsigned long, +						empty_cluster + empty_size, +					      block_group->full_stripe_len); +  			/* allocate a cluster in this block group */  			ret = btrfs_find_space_cluster(trans, root,  					       block_group, last_ptr,  					       
search_start, num_bytes, -					       empty_cluster + empty_size); +					       aligned_cluster);  			if (ret == 0) {  				/*  				 * now pull our allocation out of this @@ -5912,7 +5991,8 @@ unclustered_alloc:  			goto loop;  		}  checks: -		search_start = stripe_align(root, offset); +		search_start = stripe_align(root, used_block_group, +					    offset, num_bytes);  		/* move on to the next group */  		if (search_start + num_bytes > @@ -7284,6 +7364,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  		root->fs_info->fs_devices->missing_devices;  	stripped = BTRFS_BLOCK_GROUP_RAID0 | +		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |  		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;  	if (num_devices == 1) { @@ -7837,7 +7918,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		btrfs_release_path(path);  		cache->flags = btrfs_block_group_flags(&cache->item);  		cache->sectorsize = root->sectorsize; - +		cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       found_key.objectid);  		btrfs_init_free_space_ctl(cache);  		/* @@ -7891,6 +7974,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		if (!(get_alloc_profile(root, space_info->flags) &  		      (BTRFS_BLOCK_GROUP_RAID10 |  		       BTRFS_BLOCK_GROUP_RAID1 | +		       BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 |  		       BTRFS_BLOCK_GROUP_DUP)))  			continue;  		/* @@ -7966,6 +8051,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;  	cache->sectorsize = root->sectorsize;  	cache->fs_info = root->fs_info; +	cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       chunk_offset);  	atomic_set(&cache->count, 1);  	spin_lock_init(&cache->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5c00d6aeae7..66f999b97cb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,  	if (ret)  		err = ret; -	if (did_repair) { -		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, -					rec->start + rec->len - 1, -					EXTENT_DAMAGED, GFP_NOFS); -		if (ret && !err) -			err = ret; -	} +	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, +				rec->start + rec->len - 1, +				EXTENT_DAMAGED, GFP_NOFS); +	if (ret && !err) +		err = ret;  	kfree(rec);  	return err; @@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,  	u64 map_length = 0;  	u64 sector;  	struct btrfs_bio *bbio = NULL; +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	int ret;  	BUG_ON(!mirror_num); +	/* we can't repair anything in raid56 yet */ +	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) +		return 0; +  	bio = bio_alloc(GFP_NOFS, 1);  	if (!bio)  		return -EIO; @@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)  						failrec->failed_mirror);  			did_repair = !ret;  		} +		ret = 0;  	}  out: @@ -2487,13 +2491,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,  	return ret;  } -static int merge_bio(struct extent_io_tree *tree, struct page *page, +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,  		     unsigned long offset, size_t size, struct bio *bio,  		     unsigned long bio_flags)  {  	int ret = 0;  	if (tree->ops && tree->ops->merge_bio_hook) -		ret = 
tree->ops->merge_bio_hook(page, offset, size, bio, +		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,  						bio_flags);  	BUG_ON(ret < 0);  	return ret; @@ -2528,7 +2532,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,  				sector;  		if (prev_bio_flags != bio_flags || !contig || -		    merge_bio(tree, page, offset, page_size, bio, bio_flags) || +		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||  		    bio_add_page(bio, page, page_size, offset) < page_size) {  			ret = submit_one_bio(rw, bio, mirror_num,  					     prev_bio_flags); @@ -4162,6 +4166,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)  static void check_buffer_tree_ref(struct extent_buffer *eb)  { +	int refs;  	/* the ref bit is tricky.  We have to make sure it is set  	 * if we have the buffer dirty.   Otherwise the  	 * code to free a buffer can end up dropping a dirty @@ -4182,6 +4187,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)  	 * So bump the ref count first, then set the bit.  If someone  	 * beat us to it, drop the ref we added.  	 */ +	refs = atomic_read(&eb->refs); +	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) +		return; +  	spin_lock(&eb->refs_lock);  	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))  		atomic_inc(&eb->refs); @@ -4383,9 +4392,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)  void free_extent_buffer(struct extent_buffer *eb)  { +	int refs; +	int old;  	if (!eb)  		return; +	while (1) { +		refs = atomic_read(&eb->refs); +		if (refs <= 3) +			break; +		old = atomic_cmpxchg(&eb->refs, refs, refs - 1); +		if (old == refs) +			return; +	} +  	spin_lock(&eb->refs_lock);  	if (atomic_read(&eb->refs) == 2 &&  	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ff182322d11..dc81868d975 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,7 +72,7 @@ struct extent_io_ops {  	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);  	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);  	extent_submit_bio_hook_t *submit_bio_hook; -	int (*merge_bio_hook)(struct page *page, unsigned long offset, +	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,  			      size_t size, struct bio *bio,  			      unsigned long bio_flags);  	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c8090f18c21..1f84fc09c1a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1465,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,  }  static struct btrfs_free_space * -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +		unsigned long align)  {  	struct btrfs_free_space *entry;  	struct rb_node *node; +	u64 ctl_off; +	u64 tmp; +	u64 align_off;  	int ret;  	if (!ctl->free_space_offset.rb_node) @@ -1483,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)  		if (entry->bytes < *bytes)  			continue; +		/* make sure the space returned is big enough +		 * to match our requested alignment +		 */ +		if (*bytes >= align) { +			ctl_off = entry->offset - ctl->start; +			tmp = ctl_off + align - 1;; +			do_div(tmp, align); +			tmp = tmp * align + ctl->start; +			align_off = tmp - entry->offset; +		
} else { +			align_off = 0; +			tmp = entry->offset; +		} + +		if (entry->bytes < *bytes + align_off) +			continue; +  		if (entry->bitmap) { -			ret = search_bitmap(ctl, entry, offset, bytes); -			if (!ret) +			ret = search_bitmap(ctl, entry, &tmp, bytes); +			if (!ret) { +				*offset = tmp;  				return entry; +			}  			continue;  		} -		*offset = entry->offset; -		*bytes = entry->bytes; +		*offset = tmp; +		*bytes = entry->bytes - align_off;  		return entry;  	} @@ -2101,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  	struct btrfs_free_space *entry = NULL;  	u64 bytes_search = bytes + empty_size;  	u64 ret = 0; +	u64 align_gap = 0; +	u64 align_gap_len = 0;  	spin_lock(&ctl->tree_lock); -	entry = find_free_space(ctl, &offset, &bytes_search); +	entry = find_free_space(ctl, &offset, &bytes_search, +				block_group->full_stripe_len);  	if (!entry)  		goto out; @@ -2113,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  		if (!entry->bytes)  			free_bitmap(ctl, entry);  	} else { +  		unlink_free_space(ctl, entry); -		entry->offset += bytes; -		entry->bytes -= bytes; +		align_gap_len = offset - entry->offset; +		align_gap = entry->offset; + +		entry->offset = offset + bytes; +		WARN_ON(entry->bytes < bytes + align_gap_len); + +		entry->bytes -= bytes + align_gap_len;  		if (!entry->bytes)  			kmem_cache_free(btrfs_free_space_cachep, entry);  		else @@ -2125,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  out:  	spin_unlock(&ctl->tree_lock); +	if (align_gap_len) +		__btrfs_add_free_space(ctl, align_gap, align_gap_len);  	return ret;  } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1aa98be54ce..4e6a11c2cfd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -40,6 +40,7 @@  #include <linux/ratelimit.h>  #include <linux/mount.h>  #include <linux/btrfs.h> +#include <linux/blkdev.h>  #include "compat.h"  #include "ctree.h"  #include "disk-io.h" @@ -1605,7 +1606,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure   * we don't create bios that span stripes or chunks   */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,  			 size_t size, struct bio *bio,  			 unsigned long bio_flags)  { @@ -1620,7 +1621,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  	length = bio->bi_size;  	map_length = length; -	ret = btrfs_map_block(root->fs_info, READ, logical, +	ret = btrfs_map_block(root->fs_info, rw, logical,  			      &map_length, NULL, 0);  	/* Will always return 0 with map_multi == NULL */  	BUG_ON(ret < 0); @@ -6464,19 +6465,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  	int async_submit = 0;  	map_length = orig_bio->bi_size; -	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, +	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,  			      &map_length, NULL, 0);  	if (ret) {  		bio_put(orig_bio);  		return -EIO;  	} -  	if (map_length >= orig_bio->bi_size) {  		bio = orig_bio;  		goto submit;  	} -	async_submit = 1; +	/* async crcs make it difficult to collect full stripe writes. 
*/ +	if (btrfs_get_alloc_profile(root, 1) & +	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) +		async_submit = 0; +	else +		async_submit = 1; +  	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);  	if (!bio)  		return -ENOMEM; @@ -6518,7 +6524,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  			bio->bi_end_io = btrfs_end_dio_bio;  			map_length = orig_bio->bi_size; -			ret = btrfs_map_block(root->fs_info, READ, +			ret = btrfs_map_block(root->fs_info, rw,  					      start_sector << 9,  					      &map_length, NULL, 0);  			if (ret) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 00000000000..e34e568534d --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2080 @@ +/* + * Copyright (C) 2012 Fusion-io  All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/hash.h> +#include <linux/list_sort.h> +#include <linux/raid/xor.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT	1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT		2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT	3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { +	struct btrfs_fs_info *fs_info; +	struct btrfs_bio *bbio; + +	/* +	 * logical block numbers for the start of each stripe +	 * The last one or two are p/q.  These are sorted, +	 * so raid_map[0] is the start of our full stripe +	 */ +	u64 *raid_map; + +	/* while we're doing rmw on a stripe +	 * we put it into a hash table so we can +	 * lock the stripe and merge more rbios +	 * into it. +	 */ +	struct list_head hash_list; + +	/* +	 * LRU list for the stripe cache +	 */ +	struct list_head stripe_cache; + +	/* +	 * for scheduling work in the helper threads +	 */ +	struct btrfs_work work; + +	/* +	 * bio list and bio_list_lock are used +	 * to add more bios into the stripe +	 * in hopes of avoiding the full rmw +	 */ +	struct bio_list bio_list; +	spinlock_t bio_list_lock; + +	/* also protected by the bio_list_lock, the +	 * plug list is used by the plugging code +	 * to collect partial bios while plugged.  
The +	 * stripe locking code also uses it to hand off +	 * the stripe lock to the next pending IO +	 */ +	struct list_head plug_list; + +	/* +	 * flags that tell us if it is safe to +	 * merge with this bio +	 */ +	unsigned long flags; + +	/* size of each individual stripe on disk */ +	int stripe_len; + +	/* number of data stripes (no p/q) */ +	int nr_data; + +	/* +	 * set if we're doing a parity rebuild +	 * for a read from higher up, which is handled +	 * differently from a parity rebuild as part of +	 * rmw +	 */ +	int read_rebuild; + +	/* first bad stripe */ +	int faila; + +	/* second bad stripe (for raid6 use) */ +	int failb; + +	/* +	 * number of pages needed to represent the full +	 * stripe +	 */ +	int nr_pages; + +	/* +	 * size of all the bios in the bio_list.  This +	 * helps us decide if the rbio maps to a full +	 * stripe or not +	 */ +	int bio_list_bytes; + +	atomic_t refs; + +	/* +	 * these are two arrays of pointers.  We allocate the +	 * rbio big enough to hold them both and setup their +	 * locations when the rbio is allocated +	 */ + +	/* pointers to pages that we allocated for +	 * reading/writing stripes directly from the disk (including P/Q) +	 */ +	struct page **stripe_pages; + +	/* +	 * pointers to the pages in the bio_list.  Stored +	 * here for faster lookup +	 */ +	struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ +	struct btrfs_stripe_hash_table *table; +	struct btrfs_stripe_hash_table *x; +	struct btrfs_stripe_hash *cur; +	struct btrfs_stripe_hash *h; +	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; +	int i; + +	if (info->stripe_hash_table) +		return 0; + +	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS); +	if (!table) +		return -ENOMEM; + +	spin_lock_init(&table->cache_lock); +	INIT_LIST_HEAD(&table->stripe_cache); + +	h = table->table; + +	for (i = 0; i < num_entries; i++) { +		cur = h + i; +		INIT_LIST_HEAD(&cur->hash_list); +		spin_lock_init(&cur->lock); +		init_waitqueue_head(&cur->wait); +	} + +	x = cmpxchg(&info->stripe_hash_table, NULL, table); +	if (x) +		kfree(x); +	return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array.  We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. 
+ */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ +	int i; +	char *s; +	char *d; +	int ret; + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		return; + +	for (i = 0; i < rbio->nr_pages; i++) { +		if (!rbio->bio_pages[i]) +			continue; + +		s = kmap(rbio->bio_pages[i]); +		d = kmap(rbio->stripe_pages[i]); + +		memcpy(d, s, PAGE_CACHE_SIZE); + +		kunmap(rbio->bio_pages[i]); +		kunmap(rbio->stripe_pages[i]); +		SetPageUptodate(rbio->stripe_pages[i]); +	} +	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ +	u64 num = rbio->raid_map[0]; + +	/* +	 * we shift down quite a bit.  We're using byte +	 * addressing, and most of the lower bits are zeros. +	 * This tends to upset hash_64, and it consistently +	 * returns just one or two different values. +	 * +	 * shifting off the lower bits fixes things. +	 */ +	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ +	int i; +	struct page *s; +	struct page *d; + +	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) +		return; + +	for (i = 0; i < dest->nr_pages; i++) { +		s = src->stripe_pages[i]; +		if (!s || !PageUptodate(s)) { +			continue; +		} + +		d = dest->stripe_pages[i]; +		if (d) +			__free_page(d); + +		dest->stripe_pages[i] = s; +		src->stripe_pages[i] = NULL; +	} +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination.  The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, +		       struct btrfs_raid_bio *victim) +{ +	bio_list_merge(&dest->bio_list, &victim->bio_list); +	dest->bio_list_bytes += victim->bio_list_bytes; +	bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache.  The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ +	int bucket = rbio_bucket(rbio); +	struct btrfs_stripe_hash_table *table; +	struct btrfs_stripe_hash *h; +	int freeit = 0; + +	/* +	 * check the bit again under the hash table lock. +	 */ +	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; +	h = table->table + bucket; + +	/* hold the lock for the bucket because we may be +	 * removing it from the hash table +	 */ +	spin_lock(&h->lock); + +	/* +	 * hold the lock for the bio list because we need +	 * to make sure the bio list is empty +	 */ +	spin_lock(&rbio->bio_list_lock); + +	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { +		list_del_init(&rbio->stripe_cache); +		table->cache_size -= 1; +		freeit = 1; + +		/* if the bio list isn't empty, this rbio is +		 * still involved in an IO.  We take it out +		 * of the cache list, and drop the ref that +		 * was held for the list. 
+		 * +		 * If the bio_list was empty, we also remove +		 * the rbio from the hash_table, and drop +		 * the corresponding ref +		 */ +		if (bio_list_empty(&rbio->bio_list)) { +			if (!list_empty(&rbio->hash_list)) { +				list_del_init(&rbio->hash_list); +				atomic_dec(&rbio->refs); +				BUG_ON(!list_empty(&rbio->plug_list)); +			} +		} +	} + +	spin_unlock(&rbio->bio_list_lock); +	spin_unlock(&h->lock); + +	if (freeit) +		__free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; + +	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	__remove_rbio_from_cache(rbio); +	spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; +	struct btrfs_raid_bio *rbio; + +	table = info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	while (!list_empty(&table->stripe_cache)) { +		rbio = list_entry(table->stripe_cache.next, +				  struct btrfs_raid_bio, +				  stripe_cache); +		__remove_rbio_from_cache(rbio); +	} +	spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ +	if (!info->stripe_hash_table) +		return; +	btrfs_clear_rbio_cache(info); +	kfree(info->stripe_hash_table); +	info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache.  It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_stripe_hash_table *table; +	unsigned long flags; + +	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) +		return; + +	table = rbio->fs_info->stripe_hash_table; + +	spin_lock_irqsave(&table->cache_lock, flags); +	spin_lock(&rbio->bio_list_lock); + +	/* bump our ref if we were not in the list before */ +	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) +		atomic_inc(&rbio->refs); + +	if (!list_empty(&rbio->stripe_cache)){ +		list_move(&rbio->stripe_cache, &table->stripe_cache); +	} else { +		list_add(&rbio->stripe_cache, &table->stripe_cache); +		table->cache_size += 1; +	} + +	spin_unlock(&rbio->bio_list_lock); + +	if (table->cache_size > RBIO_CACHE_SIZE) { +		struct btrfs_raid_bio *found; + +		found = list_entry(table->stripe_cache.prev, +				  struct btrfs_raid_bio, +				  stripe_cache); + +		if (found != rbio) +			__remove_rbio_from_cache(found); +	} + +	spin_unlock_irqrestore(&table->cache_lock, flags); +	return; +} + +/* + * helper function to run the xor_blocks api.  It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. 
+ */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ +	int src_off = 0; +	int xor_src_cnt = 0; +	void *dest = pages[src_cnt]; + +	while(src_cnt > 0) { +		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); +		xor_blocks(xor_src_cnt, len, dest, pages + src_off); + +		src_cnt -= xor_src_cnt; +		src_off += xor_src_cnt; +	} +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ +	unsigned long size = rbio->bio_list_bytes; +	int ret = 1; + +	if (size != rbio->nr_data * rbio->stripe_len) +		ret = 0; + +	BUG_ON(size > rbio->nr_data * rbio->stripe_len); +	return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&rbio->bio_list_lock, flags); +	ret = __rbio_is_full(rbio); +	spin_unlock_irqrestore(&rbio->bio_list_lock, flags); +	return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, +			  struct btrfs_raid_bio *cur) +{ +	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || +	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) +		return 0; + +	/* +	 * we can't merge with cached rbios, since the +	 * idea is that when we merge the destination +	 * rbio is going to run our IO for us.  We can +	 * steal from cached rbio's though, other functions +	 * handle that. +	 */ +	if (test_bit(RBIO_CACHE_BIT, &last->flags) || +	    test_bit(RBIO_CACHE_BIT, &cur->flags)) +		return 0; + +	if (last->raid_map[0] != +	    cur->raid_map[0]) +		return 0; + +	/* reads can't merge with writes */ +	if (last->read_rebuild != +	    cur->read_rebuild) { +		return 0; +	} + +	return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ +	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; +	return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ +	if (rbio->nr_data + 1 == rbio->bbio->num_stripes) +		return NULL; + +	index += ((rbio->nr_data + 1) * rbio->stripe_len) >> +		PAGE_CACHE_SHIFT; +	return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock.  rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet.  The rbio is given + * the lock and 0 is returned.  The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner.  The rbio is freed and the IO will + * start automatically along with the existing rbio.  1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list.  When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. 
+ * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission.  If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ +	int bucket = rbio_bucket(rbio); +	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; +	struct btrfs_raid_bio *cur; +	struct btrfs_raid_bio *pending; +	unsigned long flags; +	DEFINE_WAIT(wait); +	struct btrfs_raid_bio *freeit = NULL; +	struct btrfs_raid_bio *cache_drop = NULL; +	int ret = 0; +	int walk = 0; + +	spin_lock_irqsave(&h->lock, flags); +	list_for_each_entry(cur, &h->hash_list, hash_list) { +		walk++; +		if (cur->raid_map[0] == rbio->raid_map[0]) { +			spin_lock(&cur->bio_list_lock); + +			/* can we steal this cached rbio's pages? */ +			if (bio_list_empty(&cur->bio_list) && +			    list_empty(&cur->plug_list) && +			    test_bit(RBIO_CACHE_BIT, &cur->flags) && +			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { +				list_del_init(&cur->hash_list); +				atomic_dec(&cur->refs); + +				steal_rbio(cur, rbio); +				cache_drop = cur; +				spin_unlock(&cur->bio_list_lock); + +				goto lockit; +			} + +			/* can we merge into the lock owner? */ +			if (rbio_can_merge(cur, rbio)) { +				merge_rbio(cur, rbio); +				spin_unlock(&cur->bio_list_lock); +				freeit = rbio; +				ret = 1; +				goto out; +			} + + +			/* +			 * we couldn't merge with the running +			 * rbio, see if we can merge with the +			 * pending ones.  We don't have to +			 * check for rmw_locked because there +			 * is no way they are inside finish_rmw +			 * right now +			 */ +			list_for_each_entry(pending, &cur->plug_list, +					    plug_list) { +				if (rbio_can_merge(pending, rbio)) { +					merge_rbio(pending, rbio); +					spin_unlock(&cur->bio_list_lock); +					freeit = rbio; +					ret = 1; +					goto out; +				} +			} + +			/* no merging, put us on the tail of the plug list, +			 * our rbio will be started with the currently +			 * running rbio unlocks +			 */ +			list_add_tail(&rbio->plug_list, &cur->plug_list); +			spin_unlock(&cur->bio_list_lock); +			ret = 1; +			goto out; +		} +	} +lockit: +	atomic_inc(&rbio->refs); +	list_add(&rbio->hash_list, &h->hash_list); +out: +	spin_unlock_irqrestore(&h->lock, flags); +	if (cache_drop) +		remove_rbio_from_cache(cache_drop); +	if (freeit) +		__free_raid_bio(freeit); +	return ret; +} + +/* + * called as rmw or parity rebuild is completed.  
If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ +	int bucket; +	struct btrfs_stripe_hash *h; +	unsigned long flags; +	int keep_cache = 0; + +	bucket = rbio_bucket(rbio); +	h = rbio->fs_info->stripe_hash_table->table + bucket; + +	if (list_empty(&rbio->plug_list)) +		cache_rbio(rbio); + +	spin_lock_irqsave(&h->lock, flags); +	spin_lock(&rbio->bio_list_lock); + +	if (!list_empty(&rbio->hash_list)) { +		/* +		 * if we're still cached and there is no other IO +		 * to perform, just leave this rbio here for others +		 * to steal from later +		 */ +		if (list_empty(&rbio->plug_list) && +		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) { +			keep_cache = 1; +			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +			BUG_ON(!bio_list_empty(&rbio->bio_list)); +			goto done; +		} + +		list_del_init(&rbio->hash_list); +		atomic_dec(&rbio->refs); + +		/* +		 * we use the plug list to hold all the rbios +		 * waiting for the chance to lock this stripe. +		 * hand the lock over to one of them. +		 */ +		if (!list_empty(&rbio->plug_list)) { +			struct btrfs_raid_bio *next; +			struct list_head *head = rbio->plug_list.next; + +			next = list_entry(head, struct btrfs_raid_bio, +					  plug_list); + +			list_del_init(&rbio->plug_list); + +			list_add(&next->hash_list, &h->hash_list); +			atomic_inc(&next->refs); +			spin_unlock(&rbio->bio_list_lock); +			spin_unlock_irqrestore(&h->lock, flags); + +			if (next->read_rebuild) +				async_read_rebuild(next); +			else { +				steal_rbio(rbio, next); +				async_rmw_stripe(next); +			} + +			goto done_nolock; +		} else  if (waitqueue_active(&h->wait)) { +			spin_unlock(&rbio->bio_list_lock); +			spin_unlock_irqrestore(&h->lock, flags); +			wake_up(&h->wait); +			goto done_nolock; +		} +	} +done: +	spin_unlock(&rbio->bio_list_lock); +	spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: +	if (!keep_cache) +		remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ +	int i; + +	WARN_ON(atomic_read(&rbio->refs) < 0); +	if (!atomic_dec_and_test(&rbio->refs)) +		return; + +	WARN_ON(!list_empty(&rbio->stripe_cache)); +	WARN_ON(!list_empty(&rbio->hash_list)); +	WARN_ON(!bio_list_empty(&rbio->bio_list)); + +	for (i = 0; i < rbio->nr_pages; i++) { +		if (rbio->stripe_pages[i]) { +			__free_page(rbio->stripe_pages[i]); +			rbio->stripe_pages[i] = NULL; +		} +	} +	kfree(rbio->raid_map); +	kfree(rbio->bbio); +	kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ +	unlock_stripe(rbio); +	__free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ +	struct bio *cur = bio_list_get(&rbio->bio_list); +	struct bio *next; +	free_raid_bio(rbio); + +	while (cur) { +		next = cur->bi_next; +		cur->bi_next = NULL; +		if (uptodate) +			set_bit(BIO_UPTODATE, &cur->bi_flags); +		bio_endio(cur, err); +		cur = next; +	} +} + +/* + * end io function used by finish_rmw.  When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ +	struct btrfs_raid_bio *rbio = bio->bi_private; + +	if (err) +		fail_bio_stripe(rbio, bio); + +	bio_put(bio); + +	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) +		return; + +	err = 0; + +	/* OK, we have read all the stripes we need to. 
*/ +	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) +		err = -EIO; + +	rbio_orig_end_io(rbio, err, 0); +	return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else.  This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. + */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, +				 int index, int pagenr, int bio_list_only) +{ +	int chunk_page; +	struct page *p = NULL; + +	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + +	spin_lock_irq(&rbio->bio_list_lock); +	p = rbio->bio_pages[chunk_page]; +	spin_unlock_irq(&rbio->bio_list_lock); + +	if (p || bio_list_only) +		return p; + +	return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ +	unsigned long nr = stripe_len * nr_stripes; +	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +/* + * allocation and initial setup for the btrfs_raid_bio.  Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, +			  struct btrfs_bio *bbio, u64 *raid_map, +			  u64 stripe_len) +{ +	struct btrfs_raid_bio *rbio; +	int nr_data = 0; +	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); +	void *p; + +	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, +			GFP_NOFS); +	if (!rbio) { +		kfree(raid_map); +		kfree(bbio); +		return ERR_PTR(-ENOMEM); +	} + +	bio_list_init(&rbio->bio_list); +	INIT_LIST_HEAD(&rbio->plug_list); +	spin_lock_init(&rbio->bio_list_lock); +	INIT_LIST_HEAD(&rbio->stripe_cache); +	INIT_LIST_HEAD(&rbio->hash_list); +	rbio->bbio = bbio; +	rbio->raid_map = raid_map; +	rbio->fs_info = root->fs_info; +	rbio->stripe_len = stripe_len; +	rbio->nr_pages = num_pages; +	rbio->faila = -1; +	rbio->failb = -1; +	atomic_set(&rbio->refs, 1); + +	/* +	 * the stripe_pages and bio_pages array point to the extra +	 * memory we allocated past the end of the rbio +	 */ +	p = rbio + 1; +	rbio->stripe_pages = p; +	rbio->bio_pages = p + sizeof(struct page *) * num_pages; + +	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) +		nr_data = bbio->num_stripes - 2; +	else +		nr_data = bbio->num_stripes - 1; + +	rbio->nr_data = nr_data; +	return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ +	int i; +	struct page *page; + +	for (i = 0; i < rbio->nr_pages; i++) { +		if (rbio->stripe_pages[i]) +			continue; +		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); +		if (!page) +			return -ENOMEM; +		rbio->stripe_pages[i] = page; +		ClearPageUptodate(page); +	} +	return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ +	int i; +	struct page *page; + +	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + +	for (; i < rbio->nr_pages; i++) { +		if (rbio->stripe_pages[i]) +			continue; +		page = 
alloc_page(GFP_NOFS | __GFP_HIGHMEM); +		if (!page) +			return -ENOMEM; +		rbio->stripe_pages[i] = page; +	} +	return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +int rbio_add_io_page(struct btrfs_raid_bio *rbio, +		     struct bio_list *bio_list, +		     struct page *page, +		     int stripe_nr, +		     unsigned long page_index, +		     unsigned long bio_max_len) +{ +	struct bio *last = bio_list->tail; +	u64 last_end = 0; +	int ret; +	struct bio *bio; +	struct btrfs_bio_stripe *stripe; +	u64 disk_start; + +	stripe = &rbio->bbio->stripes[stripe_nr]; +	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + +	/* if the device is missing, just fail this stripe */ +	if (!stripe->dev->bdev) +		return fail_rbio_index(rbio, stripe_nr); + +	/* see if we can add this page onto our existing bio */ +	if (last) { +		last_end = (u64)last->bi_sector << 9; +		last_end += last->bi_size; + +		/* +		 * we can't merge these if they are from different +		 * devices or if they are not contiguous +		 */ +		if (last_end == disk_start && stripe->dev->bdev && +		    test_bit(BIO_UPTODATE, &last->bi_flags) && +		    last->bi_bdev == stripe->dev->bdev) { +			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); +			if (ret == PAGE_CACHE_SIZE) +				return 0; +		} +	} + +	/* put a new bio on the list */ +	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); +	if (!bio) +		return -ENOMEM; + +	bio->bi_size = 0; +	bio->bi_bdev = stripe->dev->bdev; +	bio->bi_sector = disk_start >> 9; +	set_bit(BIO_UPTODATE, &bio->bi_flags); + +	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); +	bio_list_add(bio_list, bio); +	return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk.  This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction.  The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ +	if (rbio->faila >= 0 || rbio->failb >= 0) { +		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); +		__raid56_parity_recover(rbio); +	} else { +		finish_rmw(rbio); +	} +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ +	int index; +	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); +	index += page; +	return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result.  This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. 
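
rbio_add_io_page() above only starts a new bio when the page cannot be tacked onto the tail of the previous one; the test boils down to "same block device and physically contiguous". A minimal user-space sketch of that check, with a made-up demo_tail type standing in for the bio being built (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* simplified stand-in for the tail bio: device plus the byte range queued so far */
struct demo_tail {
	int dev;
	uint64_t start;		/* first byte on that device */
	uint64_t len;		/* bytes queued so far */
};

/* true if a page destined for (dev, disk_start) can extend the tail bio */
static bool demo_can_merge(const struct demo_tail *tail, int dev,
			   uint64_t disk_start)
{
	return tail && tail->dev == dev && tail->start + tail->len == disk_start;
}
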
+ * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ +	struct bio *bio; +	u64 start; +	unsigned long stripe_offset; +	unsigned long page_index; +	struct page *p; +	int i; + +	spin_lock_irq(&rbio->bio_list_lock); +	bio_list_for_each(bio, &rbio->bio_list) { +		start = (u64)bio->bi_sector << 9; +		stripe_offset = start - rbio->raid_map[0]; +		page_index = stripe_offset >> PAGE_CACHE_SHIFT; + +		for (i = 0; i < bio->bi_vcnt; i++) { +			p = bio->bi_io_vec[i].bv_page; +			rbio->bio_pages[page_index + i] = p; +		} +	} +	spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations.  We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ +	struct btrfs_bio *bbio = rbio->bbio; +	void *pointers[bbio->num_stripes]; +	int stripe_len = rbio->stripe_len; +	int nr_data = rbio->nr_data; +	int stripe; +	int pagenr; +	int p_stripe = -1; +	int q_stripe = -1; +	struct bio_list bio_list; +	struct bio *bio; +	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; +	int ret; + +	bio_list_init(&bio_list); + +	if (bbio->num_stripes - rbio->nr_data == 1) { +		p_stripe = bbio->num_stripes - 1; +	} else if (bbio->num_stripes - rbio->nr_data == 2) { +		p_stripe = bbio->num_stripes - 2; +		q_stripe = bbio->num_stripes - 1; +	} else { +		BUG(); +	} + +	/* at this point we either have a full stripe, +	 * or we've read the full stripe from the drive. +	 * recalculate the parity and write the new results. +	 * +	 * We're not allowed to add any new bios to the +	 * bio list here, anyone else that wants to +	 * change this stripe needs to do their own rmw. +	 */ +	spin_lock_irq(&rbio->bio_list_lock); +	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +	spin_unlock_irq(&rbio->bio_list_lock); + +	atomic_set(&rbio->bbio->error, 0); + +	/* +	 * now that we've set rmw_locked, run through the +	 * bio list one last time and map the page pointers +	 * +	 * We don't cache full rbios because we're assuming +	 * the higher layers are unlikely to use this area of +	 * the disk again soon.  If they do use it again, +	 * hopefully they will send another full bio. +	 */ +	index_rbio_pages(rbio); +	if (!rbio_is_full(rbio)) +		cache_rbio_pages(rbio); +	else +		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + +	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +		struct page *p; +		/* first collect one page from each data stripe */ +		for (stripe = 0; stripe < nr_data; stripe++) { +			p = page_in_rbio(rbio, stripe, pagenr, 0); +			pointers[stripe] = kmap(p); +		} + +		/* then add the parity stripe */ +		p = rbio_pstripe_page(rbio, pagenr); +		SetPageUptodate(p); +		pointers[stripe++] = kmap(p); + +		if (q_stripe != -1) { + +			/* +			 * raid6, add the qstripe and call the +			 * library function to fill in our p/q +			 */ +			p = rbio_qstripe_page(rbio, pagenr); +			SetPageUptodate(p); +			pointers[stripe++] = kmap(p); + +			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, +						pointers); +		} else { +			/* raid5 */ +			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); +			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); +		} + + +		for (stripe = 0; stripe < bbio->num_stripes; stripe++) +			kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); +	} + +	/* +	 * time to start writing.  
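
For RAID5, the loop above produces P by copying the first data page and XOR-ing the remaining data pages into it (run_xor); for RAID6 the whole pointer array is handed to the raid6 library's gen_syndrome() so P and Q are generated together. The P calculation on its own, as a user-space sketch over equally sized buffers (the demo_ name is made up for illustration):

#include <stddef.h>
#include <string.h>

/* P parity: byte-wise XOR of nr_data equally sized data buffers into p */
static void demo_gen_p(unsigned char *p, unsigned char * const *data,
		       int nr_data, size_t len)
{
	size_t i;
	int d;

	memcpy(p, data[0], len);
	for (d = 1; d < nr_data; d++)
		for (i = 0; i < len; i++)
			p[i] ^= data[d][i];
}
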
Make bios for everything from the +	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore +	 * everything else. +	 */ +	for (stripe = 0; stripe < bbio->num_stripes; stripe++) { +		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +			struct page *page; +			if (stripe < rbio->nr_data) { +				page = page_in_rbio(rbio, stripe, pagenr, 1); +				if (!page) +					continue; +			} else { +			       page = rbio_stripe_page(rbio, stripe, pagenr); +			} + +			ret = rbio_add_io_page(rbio, &bio_list, +				       page, stripe, pagenr, rbio->stripe_len); +			if (ret) +				goto cleanup; +		} +	} + +	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); +	BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + +	while (1) { +		bio = bio_list_pop(&bio_list); +		if (!bio) +			break; + +		bio->bi_private = rbio; +		bio->bi_end_io = raid_write_end_io; +		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); +		submit_bio(WRITE, bio); +	} +	return; + +cleanup: +	rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio.  Used to figure out which + * stripe has failed.  This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, +			   struct bio *bio) +{ +	u64 physical = bio->bi_sector; +	u64 stripe_start; +	int i; +	struct btrfs_bio_stripe *stripe; + +	physical <<= 9; + +	for (i = 0; i < rbio->bbio->num_stripes; i++) { +		stripe = &rbio->bbio->stripes[i]; +		stripe_start = stripe->physical; +		if (physical >= stripe_start && +		    physical < stripe_start + rbio->stripe_len) { +			return i; +		} +	} +	return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping).  Used to figure out which stripe has + * failed.  This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, +				   struct bio *bio) +{ +	u64 logical = bio->bi_sector; +	u64 stripe_start; +	int i; + +	logical <<= 9; + +	for (i = 0; i < rbio->nr_data; i++) { +		stripe_start = rbio->raid_map[i]; +		if (logical >= stripe_start && +		    logical < stripe_start + rbio->stripe_len) { +			return i; +		} +	} +	return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ +	unsigned long flags; +	int ret = 0; + +	spin_lock_irqsave(&rbio->bio_list_lock, flags); + +	/* we already know this stripe is bad, move on */ +	if (rbio->faila == failed || rbio->failb == failed) +		goto out; + +	if (rbio->faila == -1) { +		/* first failure on this rbio */ +		rbio->faila = failed; +		atomic_inc(&rbio->bbio->error); +	} else if (rbio->failb == -1) { +		/* second failure on this rbio */ +		rbio->failb = failed; +		atomic_inc(&rbio->bbio->error); +	} else { +		ret = -EIO; +	} +out: +	spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + +	return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, +			   struct bio *bio) +{ +	int failed = find_bio_stripe(rbio, bio); + +	if (failed < 0) +		return -EIO; + +	return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate.  
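
find_bio_stripe() and find_logical_bio_stripe() are the same linear search, one over the physical offsets in bbio->stripes and one over the logical addresses in raid_map: return the stripe whose [start, start + stripe_len) range contains the bio's starting byte. Pulled out as a stand-alone sketch (not the kernel code):

#include <stdint.h>

/* return the index whose range covers offset, or -1 if none does */
static int demo_find_stripe(const uint64_t *starts, int nr_stripes,
			    uint64_t stripe_len, uint64_t offset)
{
	int i;

	for (i = 0; i < nr_stripes; i++) {
		if (offset >= starts[i] && offset < starts[i] + stripe_len)
			return i;
	}
	return -1;
}
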
It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ +	int i; +	struct page *p; + +	for (i = 0; i < bio->bi_vcnt; i++) { +		p = bio->bi_io_vec[i].bv_page; +		SetPageUptodate(p); +	} +} + +/* + * end io for the read phase of the rmw cycle.  All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ +	struct btrfs_raid_bio *rbio = bio->bi_private; + +	if (err) +		fail_bio_stripe(rbio, bio); +	else +		set_bio_pages_uptodate(bio); + +	bio_put(bio); + +	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) +		return; + +	err = 0; +	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) +		goto cleanup; + +	/* +	 * this will normally call finish_rmw to start our write +	 * but if there are any failed stripes we'll reconstruct +	 * from parity first +	 */ +	validate_rbio_for_rmw(rbio); +	return; + +cleanup: + +	rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ +	rbio->work.flags = 0; +	rbio->work.func = rmw_work; + +	btrfs_queue_worker(&rbio->fs_info->rmw_workers, +			   &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ +	rbio->work.flags = 0; +	rbio->work.func = read_rebuild_work; + +	btrfs_queue_worker(&rbio->fs_info->rmw_workers, +			   &rbio->work); +} + +/* + * the stripe must be locked by the caller.  It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ +	int bios_to_read = 0; +	struct btrfs_bio *bbio = rbio->bbio; +	struct bio_list bio_list; +	int ret; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	int pagenr; +	int stripe; +	struct bio *bio; + +	bio_list_init(&bio_list); + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		goto cleanup; + +	index_rbio_pages(rbio); + +	atomic_set(&rbio->bbio->error, 0); +	/* +	 * build a list of bios to read all the missing parts of this +	 * stripe +	 */ +	for (stripe = 0; stripe < rbio->nr_data; stripe++) { +		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +			struct page *page; +			/* +			 * we want to find all the pages missing from +			 * the rbio and read them from the disk.  If +			 * page_in_rbio finds a page in the bio list +			 * we don't need to read it off the stripe. +			 */ +			page = page_in_rbio(rbio, stripe, pagenr, 1); +			if (page) +				continue; + +			page = rbio_stripe_page(rbio, stripe, pagenr); +			/* +			 * the bio cache may have handed us an uptodate +			 * page.  If so, be happy and use it +			 */ +			if (PageUptodate(page)) +				continue; + +			ret = rbio_add_io_page(rbio, &bio_list, page, +				       stripe, pagenr, rbio->stripe_len); +			if (ret) +				goto cleanup; +		} +	} + +	bios_to_read = bio_list_size(&bio_list); +	if (!bios_to_read) { +		/* +		 * this can happen if others have merged with +		 * us, it means there is nothing left to read. +		 * But if there are missing devices it may not be +		 * safe to do the full stripe write yet. +		 */ +		goto finish; +	} + +	/* +	 * the bbio may be freed once we submit the last bio.  
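
Every submission path in this file uses the same completion idiom: stripes_pending is set to the number of bios before anything is submitted, each end_io decrements it, and only whoever drops it to zero may look at the accumulated error count or touch the rbio again, since the bbio may be freed as soon as the last bio goes down. A user-space sketch of the idiom with C11 atomics (demo_ names are illustrative, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>

struct demo_pending {
	atomic_int in_flight;	/* bios not yet completed */
	atomic_int errors;	/* failed bios so far */
	int max_errors;		/* tolerated failures for this profile */
};

/* run from each completion; returns true only for the final completer */
static bool demo_complete_one(struct demo_pending *p, bool failed)
{
	if (failed)
		atomic_fetch_add(&p->errors, 1);
	return atomic_fetch_sub(&p->in_flight, 1) == 1;
}

/* the final completer then decides: errors > max_errors ? fail : continue */
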
Make sure +	 * not to touch it after that +	 */ +	atomic_set(&bbio->stripes_pending, bios_to_read); +	while (1) { +		bio = bio_list_pop(&bio_list); +		if (!bio) +			break; + +		bio->bi_private = rbio; +		bio->bi_end_io = raid_rmw_end_io; + +		btrfs_bio_wq_end_io(rbio->fs_info, bio, +				    BTRFS_WQ_ENDIO_RAID56); + +		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); +		submit_bio(READ, bio); +	} +	/* the actual write will happen once the reads are done */ +	return 0; + +cleanup: +	rbio_orig_end_io(rbio, -EIO, 0); +	return -EIO; + +finish: +	validate_rbio_for_rmw(rbio); +	return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ +	int ret; + +	ret = alloc_rbio_parity_pages(rbio); +	if (ret) +		return ret; + +	ret = lock_stripe_add(rbio); +	if (ret == 0) +		finish_rmw(rbio); +	return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ +	int ret; + +	ret = lock_stripe_add(rbio); +	if (ret == 0) +		async_rmw_stripe(rbio); +	return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe.  So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ +	/* head off into rmw land if we don't have a full stripe */ +	if (!rbio_is_full(rbio)) +		return partial_stripe_write(rbio); +	return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list.  When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { +	struct blk_plug_cb cb; +	struct btrfs_fs_info *info; +	struct list_head rbio_list; +	struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, +						 plug_list); +	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, +						 plug_list); +	u64 a_sector = ra->bio_list.head->bi_sector; +	u64 b_sector = rb->bio_list.head->bi_sector; + +	if (a_sector < b_sector) +		return -1; +	if (a_sector > b_sector) +		return 1; +	return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ +	struct btrfs_raid_bio *cur; +	struct btrfs_raid_bio *last = NULL; + +	/* +	 * sort our plug list then try to merge +	 * everything we can in hopes of creating full +	 * stripes. 
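
The sort that follows orders the plugged rbios by the logical sector of their first queued bio, so writes that belong to the same full stripe end up adjacent and can be merged before parity is calculated. In user-space terms the comparator is simply this (demo_ names invented for illustration):

#include <stdint.h>
#include <stdlib.h>

struct demo_plugged {
	uint64_t first_sector;	/* sector of the first bio queued on the rbio */
};

static int demo_plug_cmp(const void *a, const void *b)
{
	const struct demo_plugged *ra = a, *rb = b;

	if (ra->first_sector < rb->first_sector)
		return -1;
	if (ra->first_sector > rb->first_sector)
		return 1;
	return 0;
}

/* e.g. qsort(plugged, nr, sizeof(*plugged), demo_plug_cmp); */
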
+	 */ +	list_sort(NULL, &plug->rbio_list, plug_cmp); +	while (!list_empty(&plug->rbio_list)) { +		cur = list_entry(plug->rbio_list.next, +				 struct btrfs_raid_bio, plug_list); +		list_del_init(&cur->plug_list); + +		if (rbio_is_full(cur)) { +			/* we have a full stripe, send it down */ +			full_stripe_write(cur); +			continue; +		} +		if (last) { +			if (rbio_can_merge(last, cur)) { +				merge_rbio(last, cur); +				__free_raid_bio(cur); +				continue; + +			} +			__raid56_parity_write(last); +		} +		last = cur; +	} +	if (last) { +		__raid56_parity_write(last); +	} +	kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ +	struct btrfs_plug_cb *plug; +	plug = container_of(work, struct btrfs_plug_cb, work); +	run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ +	struct btrfs_plug_cb *plug; +	plug = container_of(cb, struct btrfs_plug_cb, cb); + +	if (from_schedule) { +		plug->work.flags = 0; +		plug->work.func = unplug_work; +		btrfs_queue_worker(&plug->info->rmw_workers, +				   &plug->work); +		return; +	} +	run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +			struct btrfs_bio *bbio, u64 *raid_map, +			u64 stripe_len) +{ +	struct btrfs_raid_bio *rbio; +	struct btrfs_plug_cb *plug = NULL; +	struct blk_plug_cb *cb; + +	rbio = alloc_rbio(root, bbio, raid_map, stripe_len); +	if (IS_ERR(rbio)) { +		kfree(raid_map); +		kfree(bbio); +		return PTR_ERR(rbio); +	} +	bio_list_add(&rbio->bio_list, bio); +	rbio->bio_list_bytes = bio->bi_size; + +	/* +	 * don't plug on full rbios, just get them out the door +	 * as quickly as we can +	 */ +	if (rbio_is_full(rbio)) +		return full_stripe_write(rbio); + +	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, +			       sizeof(*plug)); +	if (cb) { +		plug = container_of(cb, struct btrfs_plug_cb, cb); +		if (!plug->info) { +			plug->info = root->fs_info; +			INIT_LIST_HEAD(&plug->rbio_list); +		} +		list_add_tail(&rbio->plug_list, &plug->rbio_list); +	} else { +		return __raid56_parity_write(rbio); +	} +	return 0; +} + +/* + * all parity reconstruction happens here.  We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. 
+ */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ +	int pagenr, stripe; +	void **pointers; +	int faila = -1, failb = -1; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	struct page *page; +	int err; +	int i; + +	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), +			   GFP_NOFS); +	if (!pointers) { +		err = -ENOMEM; +		goto cleanup_io; +	} + +	faila = rbio->faila; +	failb = rbio->failb; + +	if (rbio->read_rebuild) { +		spin_lock_irq(&rbio->bio_list_lock); +		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +		spin_unlock_irq(&rbio->bio_list_lock); +	} + +	index_rbio_pages(rbio); + +	for (pagenr = 0; pagenr < nr_pages; pagenr++) { +		/* setup our array of pointers with pages +		 * from each stripe +		 */ +		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { +			/* +			 * if we're rebuilding a read, we have to use +			 * pages from the bio list +			 */ +			if (rbio->read_rebuild && +			    (stripe == faila || stripe == failb)) { +				page = page_in_rbio(rbio, stripe, pagenr, 0); +			} else { +				page = rbio_stripe_page(rbio, stripe, pagenr); +			} +			pointers[stripe] = kmap(page); +		} + +		/* all raid6 handling here */ +		if (rbio->raid_map[rbio->bbio->num_stripes - 1] == +		    RAID6_Q_STRIPE) { + +			/* +			 * single failure, rebuild from parity raid5 +			 * style +			 */ +			if (failb < 0) { +				if (faila == rbio->nr_data) { +					/* +					 * Just the P stripe has failed, without +					 * a bad data or Q stripe. +					 * TODO, we should redo the xor here. +					 */ +					err = -EIO; +					goto cleanup; +				} +				/* +				 * a single failure in raid6 is rebuilt +				 * in the pstripe code below +				 */ +				goto pstripe; +			} + +			/* make sure our ps and qs are in order */ +			if (faila > failb) { +				int tmp = failb; +				failb = faila; +				faila = tmp; +			} + +			/* if the q stripe is failed, do a pstripe reconstruction +			 * from the xors. +			 * If both the q stripe and the P stripe are failed, we're +			 * here due to a crc mismatch and we can't give them the +			 * data they want +			 */ +			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { +				if (rbio->raid_map[faila] == RAID5_P_STRIPE) { +					err = -EIO; +					goto cleanup; +				} +				/* +				 * otherwise we have one bad data stripe and +				 * a good P stripe.  raid5! +				 */ +				goto pstripe; +			} + +			if (rbio->raid_map[failb] == RAID5_P_STRIPE) { +				raid6_datap_recov(rbio->bbio->num_stripes, +						  PAGE_SIZE, faila, pointers); +			} else { +				raid6_2data_recov(rbio->bbio->num_stripes, +						  PAGE_SIZE, faila, failb, +						  pointers); +			} +		} else { +			void *p; + +			/* rebuild from P stripe here (raid5 or raid6) */ +			BUG_ON(failb != -1); +pstripe: +			/* Copy parity block into failed block to start with */ +			memcpy(pointers[faila], +			       pointers[rbio->nr_data], +			       PAGE_CACHE_SIZE); + +			/* rearrange the pointer array */ +			p = pointers[faila]; +			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) +				pointers[stripe] = pointers[stripe + 1]; +			pointers[rbio->nr_data - 1] = p; + +			/* xor in the rest */ +			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); +		} +		/* if we're doing this rebuild as part of an rmw, go through +		 * and set all of our private rbio pages in the +		 * failed stripes as uptodate.  This way finish_rmw will +		 * know they can be trusted.  
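
The pstripe path recovers a single lost block exactly the way P was generated: the missing data is the XOR of the parity block and every surviving data block; double failures on RAID6 are instead handled by raid6_2data_recov() and raid6_datap_recov() above. A user-space sketch of the single-failure case, assuming one known-bad data index (the demo_ name is made up):

#include <stddef.h>
#include <string.h>

/* rebuild data[faila] as P xor all surviving data blocks */
static void demo_rebuild_one(unsigned char **data, int nr_data,
			     const unsigned char *p, int faila, size_t len)
{
	size_t i;
	int d;

	memcpy(data[faila], p, len);
	for (d = 0; d < nr_data; d++) {
		if (d == faila)
			continue;
		for (i = 0; i < len; i++)
			data[faila][i] ^= data[d][i];
	}
}
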
If this was a read reconstruction, +		 * other endio functions will fiddle the uptodate bits +		 */ +		if (!rbio->read_rebuild) { +			for (i = 0;  i < nr_pages; i++) { +				if (faila != -1) { +					page = rbio_stripe_page(rbio, faila, i); +					SetPageUptodate(page); +				} +				if (failb != -1) { +					page = rbio_stripe_page(rbio, failb, i); +					SetPageUptodate(page); +				} +			} +		} +		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { +			/* +			 * if we're rebuilding a read, we have to use +			 * pages from the bio list +			 */ +			if (rbio->read_rebuild && +			    (stripe == faila || stripe == failb)) { +				page = page_in_rbio(rbio, stripe, pagenr, 0); +			} else { +				page = rbio_stripe_page(rbio, stripe, pagenr); +			} +			kunmap(page); +		} +	} + +	err = 0; +cleanup: +	kfree(pointers); + +cleanup_io: + +	if (rbio->read_rebuild) { +		if (err == 0) +			cache_rbio_pages(rbio); +		else +			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + +		rbio_orig_end_io(rbio, err, err == 0); +	} else if (err == 0) { +		rbio->faila = -1; +		rbio->failb = -1; +		finish_rmw(rbio); +	} else { +		rbio_orig_end_io(rbio, err, 0); +	} +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ +	struct btrfs_raid_bio *rbio = bio->bi_private; + +	/* +	 * we only read stripe pages off the disk, set them +	 * up to date if there were no errors +	 */ +	if (err) +		fail_bio_stripe(rbio, bio); +	else +		set_bio_pages_uptodate(bio); +	bio_put(bio); + +	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) +		return; + +	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) +		rbio_orig_end_io(rbio, -EIO, 0); +	else +		__raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ +	int bios_to_read = 0; +	struct btrfs_bio *bbio = rbio->bbio; +	struct bio_list bio_list; +	int ret; +	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	int pagenr; +	int stripe; +	struct bio *bio; + +	bio_list_init(&bio_list); + +	ret = alloc_rbio_pages(rbio); +	if (ret) +		goto cleanup; + +	atomic_set(&rbio->bbio->error, 0); + +	/* +	 * read everything that hasn't failed.  Thanks to the +	 * stripe cache, it is possible that some or all of these +	 * pages are going to be uptodate. +	 */ +	for (stripe = 0; stripe < bbio->num_stripes; stripe++) { +		if (rbio->faila == stripe || +		    rbio->failb == stripe) +			continue; + +		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +			struct page *p; + +			/* +			 * the rmw code may have already read this +			 * page in +			 */ +			p = rbio_stripe_page(rbio, stripe, pagenr); +			if (PageUptodate(p)) +				continue; + +			ret = rbio_add_io_page(rbio, &bio_list, +				       rbio_stripe_page(rbio, stripe, pagenr), +				       stripe, pagenr, rbio->stripe_len); +			if (ret < 0) +				goto cleanup; +		} +	} + +	bios_to_read = bio_list_size(&bio_list); +	if (!bios_to_read) { +		/* +		 * we might have no bios to read just because the pages +		 * were up to date, or we might have no bios to read because +		 * the devices were gone. 
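
The read loop above only queues a page when it can actually help the rebuild: pages on a failed stripe are skipped outright, and pages already marked uptodate (for example by the stripe cache or an earlier rmw read) are reused as-is. The decision on its own, as an illustrative sketch:

#include <stdbool.h>

/* should stripe/page be read from disk during recovery? */
static bool demo_needs_read(int stripe, int faila, int failb, bool uptodate)
{
	if (stripe == faila || stripe == failb)
		return false;		/* nothing useful on a failed stripe */
	return !uptodate;		/* valid pages can be reused as-is */
}
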
+		 */ +		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { +			__raid_recover_end_io(rbio); +			goto out; +		} else { +			goto cleanup; +		} +	} + +	/* +	 * the bbio may be freed once we submit the last bio.  Make sure +	 * not to touch it after that +	 */ +	atomic_set(&bbio->stripes_pending, bios_to_read); +	while (1) { +		bio = bio_list_pop(&bio_list); +		if (!bio) +			break; + +		bio->bi_private = rbio; +		bio->bi_end_io = raid_recover_end_io; + +		btrfs_bio_wq_end_io(rbio->fs_info, bio, +				    BTRFS_WQ_ENDIO_RAID56); + +		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); +		submit_bio(READ, bio); +	} +out: +	return 0; + +cleanup: +	if (rbio->read_rebuild) +		rbio_orig_end_io(rbio, -EIO, 0); +	return -EIO; +} + +/* + * the main entry point for reads from the higher layers.  This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, +			  struct btrfs_bio *bbio, u64 *raid_map, +			  u64 stripe_len, int mirror_num) +{ +	struct btrfs_raid_bio *rbio; +	int ret; + +	rbio = alloc_rbio(root, bbio, raid_map, stripe_len); +	if (IS_ERR(rbio)) { +		return PTR_ERR(rbio); +	} + +	rbio->read_rebuild = 1; +	bio_list_add(&rbio->bio_list, bio); +	rbio->bio_list_bytes = bio->bi_size; + +	rbio->faila = find_logical_bio_stripe(rbio, bio); +	if (rbio->faila == -1) { +		BUG(); +		kfree(rbio); +		return -EIO; +	} + +	/* +	 * reconstruct from the q stripe if they are +	 * asking for mirror 3 +	 */ +	if (mirror_num == 3) +		rbio->failb = bbio->num_stripes - 2; + +	ret = lock_stripe_add(rbio); + +	/* +	 * __raid56_parity_recover will end the bio with +	 * any errors it hits.  We don't want to return +	 * its error value up the stack because our caller +	 * will end up calling bio_endio with any nonzero +	 * return +	 */ +	if (ret == 0) +		__raid56_parity_recover(rbio); +	/* +	 * our rbio has been added to the list of +	 * rbios that will be handled after the +	 * currently lock owner is done +	 */ +	return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ +	struct btrfs_raid_bio *rbio; + +	rbio = container_of(work, struct btrfs_raid_bio, work); +	raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ +	struct btrfs_raid_bio *rbio; + +	rbio = container_of(work, struct btrfs_raid_bio, work); +	__raid56_parity_recover(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 00000000000..ea5d73bfdfb --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012 Fusion-io  All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ +	if (map->type & BTRFS_BLOCK_GROUP_RAID5) +		return 1; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID6) +		return 2; +	else +		return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ +	return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\ +			     ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, +				 struct btrfs_bio *bbio, u64 *raid_map, +				 u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +			       struct btrfs_bio *bbio, u64 *raid_map, +			       u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c78b2a3fc33..53c3501fa4c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@  #include "dev-replace.h"  #include "check-integrity.h"  #include "rcu-string.h" +#include "raid56.h"  /*   * This is only the first step towards a full-features scrub. It reads all @@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,  	struct btrfs_device *extent_dev;  	int extent_mirror_num; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) { +		if (num >= nr_data_stripes(map)) { +			return 0; +		} +	} +  	nstripes = length;  	offset = 0;  	do_div(nstripes, map->stripe_len); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 955204ca044..a83d486cc70 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -167,6 +167,9 @@ loop:  	spin_lock_init(&cur_trans->commit_lock);  	spin_lock_init(&cur_trans->delayed_refs.lock); +	atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); +	atomic_set(&cur_trans->delayed_refs.ref_seq, 0); +	init_waitqueue_head(&cur_trans->delayed_refs.wait);  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);  	INIT_LIST_HEAD(&cur_trans->ordered_operations); @@ -637,7 +640,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	if (!list_empty(&trans->new_bgs))  		btrfs_create_pending_block_groups(trans, root); -	while (count < 2) { +	while (count < 1) {  		unsigned long cur = trans->delayed_ref_updates;  		trans->delayed_ref_updates = 0;  		if (cur && @@ -649,6 +652,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		}  		count++;  	} +  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; @@ -744,7 +748,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  	struct extent_state *cached_state = NULL;  	u64 start = 0;  	u64 end; +	struct blk_plug plug; +	blk_start_plug(&plug);  	while (!find_first_extent_bit(dirty_pages, start, &start, &end,  				      mark, &cached_state)) {  		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -758,6 +764,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  	}  	if (err)  		werr = err; +	blk_finish_plug(&plug);  	return werr;  } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72b1cf1b2b5..7992dc4ea4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,6 +25,8 @@  #include <linux/capability.h>  #include <linux/ratelimit.h>  #include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <asm/div64.h>  #include 
"compat.h"  #include "ctree.h"  #include "extent_map.h" @@ -32,6 +34,7 @@  #include "transaction.h"  #include "print-tree.h"  #include "volumes.h" +#include "raid56.h"  #include "async-thread.h"  #include "check-integrity.h"  #include "rcu-string.h" @@ -1465,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		goto out;  	} +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && +	    root->fs_info->fs_devices->rw_devices <= 2) { +		printk(KERN_ERR "btrfs: unable to go below two " +		       "devices on raid5\n"); +		ret = -EINVAL; +		goto out; +	} +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && +	    root->fs_info->fs_devices->rw_devices <= 3) { +		printk(KERN_ERR "btrfs: unable to go below three " +		       "devices on raid6\n"); +		ret = -EINVAL; +		goto out; +	} +  	if (strcmp(device_path, "missing") == 0) {  		struct list_head *devices;  		struct btrfs_device *tmp; @@ -2726,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,  		return 0;  	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | -	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) -		factor = 2; -	else -		factor = 1; -	factor = num_stripes / factor; +	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { +		factor = num_stripes / 2; +	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { +		factor = num_stripes - 1; +	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { +		factor = num_stripes - 2; +	} else { +		factor = num_stripes; +	}  	for (i = 0; i < num_stripes; i++) {  		stripe = btrfs_stripe_nr(chunk, i); @@ -3090,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);  	else  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | -				BTRFS_BLOCK_GROUP_RAID10); +				BTRFS_BLOCK_GROUP_RAID10 | +				BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6);  	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&  	    (!alloc_profile_is_valid(bctl->data.target, 1) || @@ -3130,7 +3154,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	/* allow to reduce meta or sys integrity only if force set */  	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | -			BTRFS_BLOCK_GROUP_RAID10; +			BTRFS_BLOCK_GROUP_RAID10 | +			BTRFS_BLOCK_GROUP_RAID5 | +			BTRFS_BLOCK_GROUP_RAID6;  	do {  		seq = read_seqbegin(&fs_info->profiles_lock); @@ -3204,11 +3230,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		update_ioctl_balance_args(fs_info, 0, bargs);  	} -	if ((ret && ret != -ECANCELED && ret != -ENOSPC) || -	    balance_need_close(fs_info)) { -		__cancel_balance(fs_info); -	} -  	wake_up(&fs_info->balance_wait_q);  	return ret; @@ -3611,8 +3632,46 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.devs_increment	= 1,  		.ncopies	= 1,  	}, +	[BTRFS_RAID_RAID5] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 0, +		.devs_min	= 2, +		.devs_increment	= 1, +		.ncopies	= 2, +	}, +	[BTRFS_RAID_RAID6] = { +		.sub_stripes	= 1, +		.dev_stripes	= 1, +		.devs_max	= 0, +		.devs_min	= 3, +		.devs_increment	= 1, +		.ncopies	= 3, +	},  }; -  + +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ +	/* TODO allow them to set a preferred stripe size */ +	return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ +	u64 features; + +	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) +		return; + +	features = 
btrfs_super_incompat_flags(info->super_copy); +	if (features & BTRFS_FEATURE_INCOMPAT_RAID56) +		return; + +	features |= BTRFS_FEATURE_INCOMPAT_RAID56; +	btrfs_set_super_incompat_flags(info->super_copy, features); +	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); +} +  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			       struct btrfs_root *extent_root,  			       struct map_lookup **map_ret, @@ -3628,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	struct btrfs_device_info *devices_info = NULL;  	u64 total_avail;  	int num_stripes;	/* total number of stripes to allocate */ +	int data_stripes;	/* number of stripes that count for +				   block group size */  	int sub_stripes;	/* sub_stripes info for map */  	int dev_stripes;	/* stripes per dev */  	int devs_max;		/* max devs to use */ @@ -3639,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	u64 max_chunk_size;  	u64 stripe_size;  	u64 num_bytes; +	u64 raid_stripe_len = BTRFS_STRIPE_LEN;  	int ndevs;  	int i;  	int j; @@ -3768,16 +3830,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	stripe_size = devices_info[ndevs-1].max_avail;  	num_stripes = ndevs * dev_stripes; +	/* +	 * this will have to be fixed for RAID1 and RAID10 over +	 * more drives +	 */ +	data_stripes = num_stripes / ncopies; +  	if (stripe_size * ndevs > max_chunk_size * ncopies) {  		stripe_size = max_chunk_size * ncopies;  		do_div(stripe_size, ndevs);  	} - +	if (type & BTRFS_BLOCK_GROUP_RAID5) { +		raid_stripe_len = find_raid56_stripe_len(ndevs - 1, +				 btrfs_super_stripesize(info->super_copy)); +		data_stripes = num_stripes - 1; +	} +	if (type & BTRFS_BLOCK_GROUP_RAID6) { +		raid_stripe_len = find_raid56_stripe_len(ndevs - 2, +				 btrfs_super_stripesize(info->super_copy)); +		data_stripes = num_stripes - 2; +	}  	do_div(stripe_size, dev_stripes);  	/* align to BTRFS_STRIPE_LEN */ -	do_div(stripe_size, BTRFS_STRIPE_LEN); -	stripe_size *= BTRFS_STRIPE_LEN; +	do_div(stripe_size, raid_stripe_len); +	stripe_size *= raid_stripe_len;  	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);  	if (!map) { @@ -3795,14 +3872,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		}  	}  	map->sector_size = extent_root->sectorsize; -	map->stripe_len = BTRFS_STRIPE_LEN; -	map->io_align = BTRFS_STRIPE_LEN; -	map->io_width = BTRFS_STRIPE_LEN; +	map->stripe_len = raid_stripe_len; +	map->io_align = raid_stripe_len; +	map->io_width = raid_stripe_len;  	map->type = type;  	map->sub_stripes = sub_stripes;  	*map_ret = map; -	num_bytes = stripe_size * (num_stripes / ncopies); +	num_bytes = stripe_size * data_stripes;  	*stripe_size_out = stripe_size;  	*num_bytes_out = num_bytes; @@ -3853,6 +3930,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	}  	free_extent_map(em); +	check_raid56_incompat_flag(extent_root->fs_info, type); +  	kfree(devices_info);  	return 0; @@ -4136,6 +4215,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  		ret = map->num_stripes;  	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)  		ret = map->sub_stripes; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID5) +		ret = 2; +	else if (map->type & BTRFS_BLOCK_GROUP_RAID6) +		ret = 3;  	else  		ret = 1;  	free_extent_map(em); @@ -4148,6 +4231,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  	return ret;  } +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, +				    struct btrfs_mapping_tree 
*map_tree, +				    u64 logical) +{ +	struct extent_map *em; +	struct map_lookup *map; +	struct extent_map_tree *em_tree = &map_tree->map_tree; +	unsigned long len = root->sectorsize; + +	read_lock(&em_tree->lock); +	em = lookup_extent_mapping(em_tree, logical, len); +	read_unlock(&em_tree->lock); +	BUG_ON(!em); + +	BUG_ON(em->start > logical || em->start + em->len < logical); +	map = (struct map_lookup *)em->bdev; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) { +		len = map->stripe_len * nr_data_stripes(map); +	} +	free_extent_map(em); +	return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +			   u64 logical, u64 len, int mirror_num) +{ +	struct extent_map *em; +	struct map_lookup *map; +	struct extent_map_tree *em_tree = &map_tree->map_tree; +	int ret = 0; + +	read_lock(&em_tree->lock); +	em = lookup_extent_mapping(em_tree, logical, len); +	read_unlock(&em_tree->lock); +	BUG_ON(!em); + +	BUG_ON(em->start > logical || em->start + em->len < logical); +	map = (struct map_lookup *)em->bdev; +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			 BTRFS_BLOCK_GROUP_RAID6)) +		ret = 1; +	free_extent_map(em); +	return ret; +} +  static int find_live_mirror(struct btrfs_fs_info *fs_info,  			    struct map_lookup *map, int first, int num,  			    int optimal, int dev_replace_is_ongoing) @@ -4185,10 +4314,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,  	return optimal;  } +static inline int parity_smaller(u64 a, u64 b) +{ +	return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ +	struct btrfs_bio_stripe s; +	int i; +	u64 l; +	int again = 1; + +	while (again) { +		again = 0; +		for (i = 0; i < bbio->num_stripes - 1; i++) { +			if (parity_smaller(raid_map[i], raid_map[i+1])) { +				s = bbio->stripes[i]; +				l = raid_map[i]; +				bbio->stripes[i] = bbio->stripes[i+1]; +				raid_map[i] = raid_map[i+1]; +				bbio->stripes[i+1] = s; +				raid_map[i+1] = l; +				again = 1; +			} +		} +	} +} +  static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  			     u64 logical, u64 *length,  			     struct btrfs_bio **bbio_ret, -			     int mirror_num) +			     int mirror_num, u64 **raid_map_ret)  {  	struct extent_map *em;  	struct map_lookup *map; @@ -4200,6 +4358,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	u64 stripe_nr;  	u64 stripe_nr_orig;  	u64 stripe_nr_end; +	u64 stripe_len; +	u64 *raid_map = NULL;  	int stripe_index;  	int i;  	int ret = 0; @@ -4211,6 +4371,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	int num_alloc_stripes;  	int patch_the_first_stripe_for_dev_replace = 0;  	u64 physical_to_patch_in_first_stripe = 0; +	u64 raid56_full_stripe_start = (u64)-1;  	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, *length); @@ -4227,29 +4388,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	map = (struct map_lookup *)em->bdev;  	offset = logical - em->start; +	if (mirror_num > map->num_stripes) +		mirror_num = 0; + +	stripe_len = map->stripe_len;  	stripe_nr = offset;  	/*  	 * stripe_nr counts the total number of stripes we have to stride  	 * to get to this block  	 */ -	do_div(stripe_nr, map->stripe_len); +	do_div(stripe_nr, stripe_len); -	stripe_offset = stripe_nr * map->stripe_len; +	stripe_offset = stripe_nr * stripe_len;  	BUG_ON(offset < stripe_offset);  	/* stripe_offset is the offset of this 
block in its stripe*/  	stripe_offset = offset - stripe_offset; -	if (rw & REQ_DISCARD) +	/* if we're here for raid56, we need to know the stripe aligned start */ +	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { +		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); +		raid56_full_stripe_start = offset; + +		/* allow a write of a full stripe, but make sure we don't +		 * allow straddling of stripes +		 */ +		do_div(raid56_full_stripe_start, full_stripe_len); +		raid56_full_stripe_start *= full_stripe_len; +	} + +	if (rw & REQ_DISCARD) { +		/* we don't discard raid56 yet */ +		if (map->type & +		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { +			ret = -EOPNOTSUPP; +			goto out; +		}  		*length = min_t(u64, em->len - offset, *length); -	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { -		/* we limit the length of each bio to what fits in a stripe */ -		*length = min_t(u64, em->len - offset, -				map->stripe_len - stripe_offset); +	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { +		u64 max_len; +		/* For writes to RAID[56], allow a full stripeset across all disks. +		   For other RAID types and for RAID[56] reads, just allow a single +		   stripe (on a single disk). */ +		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && +		    (rw & REQ_WRITE)) { +			max_len = stripe_len * nr_data_stripes(map) - +				(offset - raid56_full_stripe_start); +		} else { +			/* we limit the length of each bio to what fits in a stripe */ +			max_len = stripe_len - stripe_offset; +		} +		*length = min_t(u64, em->len - offset, max_len);  	} else {  		*length = em->len - offset;  	} +	/* This is for when we're called from btrfs_merge_bio_hook() and all +	   it cares about is the length */  	if (!bbio_ret)  		goto out; @@ -4282,7 +4477,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		u64 physical_of_found = 0;  		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, -			     logical, &tmp_length, &tmp_bbio, 0); +			     logical, &tmp_length, &tmp_bbio, 0, NULL);  		if (ret) {  			WARN_ON(tmp_bbio != NULL);  			goto out; @@ -4348,6 +4543,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	do_div(stripe_nr_end, map->stripe_len);  	stripe_end_offset = stripe_nr_end * map->stripe_len -  			    (offset + *length); +  	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  		if (rw & REQ_DISCARD)  			num_stripes = min_t(u64, map->num_stripes, @@ -4398,6 +4594,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  					      dev_replace_is_ongoing);  			mirror_num = stripe_index - old_stripe_index + 1;  		} + +	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6)) { +		u64 tmp; + +		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) +		    && raid_map_ret) { +			int i, rot; + +			/* push stripe_nr back to the start of the full stripe */ +			stripe_nr = raid56_full_stripe_start; +			do_div(stripe_nr, stripe_len); + +			stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + +			/* RAID[56] write or recovery. 
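
The full-stripe bookkeeping here is plain modular arithmetic: a full stripe spans stripe_len * nr_data_stripes(map) bytes, raid56_full_stripe_start is the chunk-relative offset rounded down to that size, and a RAID5/6 write is capped so it never crosses into the next full stripe. A quick user-space check of that math, assuming a 64KiB stripe_len and three data stripes:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;		/* per-device stripe */
	int nr_data = 3;				/* e.g. 5-device RAID6 */
	uint64_t full = stripe_len * nr_data;		/* 192KiB full stripe */
	uint64_t offset = 300 * 1024;			/* write starts here */

	uint64_t full_stripe_start = offset - (offset % full);
	uint64_t max_len = full - (offset - full_stripe_start);

	assert(full_stripe_start == 192 * 1024);
	assert(max_len == 84 * 1024);	/* room left before the next full stripe */
	return 0;
}
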
Return all stripes */ +			num_stripes = map->num_stripes; +			max_errors = nr_parity_stripes(map); + +			raid_map = kmalloc(sizeof(u64) * num_stripes, +					   GFP_NOFS); +			if (!raid_map) { +				ret = -ENOMEM; +				goto out; +			} + +			/* Work out the disk rotation on this stripe-set */ +			tmp = stripe_nr; +			rot = do_div(tmp, num_stripes); + +			/* Fill in the logical address of each stripe */ +			tmp = stripe_nr * nr_data_stripes(map); +			for (i = 0; i < nr_data_stripes(map); i++) +				raid_map[(i+rot) % num_stripes] = +					em->start + (tmp + i) * map->stripe_len; + +			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; +			if (map->type & BTRFS_BLOCK_GROUP_RAID6) +				raid_map[(i+rot+1) % num_stripes] = +					RAID6_Q_STRIPE; + +			*length = map->stripe_len; +			stripe_index = 0; +			stripe_offset = 0; +		} else { +			/* +			 * Mirror #0 or #1 means the original data block. +			 * Mirror #2 is RAID5 parity block. +			 * Mirror #3 is RAID6 Q block. +			 */ +			stripe_index = do_div(stripe_nr, nr_data_stripes(map)); +			if (mirror_num > 1) +				stripe_index = nr_data_stripes(map) + +						mirror_num - 2; + +			/* We distribute the parity blocks across stripes */ +			tmp = stripe_nr + stripe_index; +			stripe_index = do_div(tmp, map->num_stripes); +		}  	} else {  		/*  		 * after this do_div call, stripe_nr is the number of stripes @@ -4506,8 +4761,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {  		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |  				 BTRFS_BLOCK_GROUP_RAID10 | +				 BTRFS_BLOCK_GROUP_RAID5 |  				 BTRFS_BLOCK_GROUP_DUP)) {  			max_errors = 1; +		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { +			max_errors = 2;  		}  	} @@ -4608,6 +4866,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;  		bbio->mirror_num = map->num_stripes + 1;  	} +	if (raid_map) { +		sort_parity_stripes(bbio, raid_map); +		*raid_map_ret = raid_map; +	}  out:  	if (dev_replace_is_ongoing)  		btrfs_dev_replace_unlock(dev_replace); @@ -4620,7 +4882,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		      struct btrfs_bio **bbio_ret, int mirror_num)  {  	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, -				 mirror_num); +				 mirror_num, NULL);  }  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -4634,6 +4896,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	u64 bytenr;  	u64 length;  	u64 stripe_nr; +	u64 rmap_len;  	int i, j, nr = 0;  	read_lock(&em_tree->lock); @@ -4644,10 +4907,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	map = (struct map_lookup *)em->bdev;  	length = em->len; +	rmap_len = map->stripe_len; +  	if (map->type & BTRFS_BLOCK_GROUP_RAID10)  		do_div(length, map->num_stripes / map->sub_stripes);  	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)  		do_div(length, map->num_stripes); +	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | +			      BTRFS_BLOCK_GROUP_RAID6)) { +		do_div(length, nr_data_stripes(map)); +		rmap_len = map->stripe_len * nr_data_stripes(map); +	}  	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);  	BUG_ON(!buf); /* -ENOMEM */ @@ -4667,8 +4937,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  			do_div(stripe_nr, map->sub_stripes);  		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  			stripe_nr = stripe_nr * map->num_stripes + i; -		} -		bytenr = chunk_start + stripe_nr * map->stripe_len; +		} /* 
else if RAID[56], multiply by nr_data_stripes(). +		   * Alternatively, just use rmap_len below instead of +		   * map->stripe_len */ + +		bytenr = chunk_start + stripe_nr * rmap_len;  		WARN_ON(nr >= map->num_stripes);  		for (j = 0; j < nr; j++) {  			if (buf[j] == bytenr) @@ -4682,7 +4955,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	*logical = buf;  	*naddrs = nr; -	*stripe_len = map->stripe_len; +	*stripe_len = rmap_len;  	free_extent_map(em);  	return 0; @@ -4756,7 +5029,7 @@ static void btrfs_end_bio(struct bio *bio, int err)  		bio->bi_bdev = (struct block_device *)  					(unsigned long)bbio->mirror_num;  		/* only send an error to the higher layers if it is -		 * beyond the tolerance of the multi-bio +		 * beyond the tolerance of the btrfs bio  		 */  		if (atomic_read(&bbio->error) > bbio->max_errors) {  			err = -EIO; @@ -4790,13 +5063,18 @@ struct async_sched {   * This will add one bio to the pending list for a device and make sure   * the work struct is scheduled.   */ -static noinline void schedule_bio(struct btrfs_root *root, +noinline void btrfs_schedule_bio(struct btrfs_root *root,  				 struct btrfs_device *device,  				 int rw, struct bio *bio)  {  	int should_queue = 1;  	struct btrfs_pending_bios *pending_bios; +	if (device->missing || !device->bdev) { +		bio_endio(bio, -EIO); +		return; +	} +  	/* don't bother with additional async steps for reads, right now */  	if (!(rw & REQ_WRITE)) {  		bio_get(bio); @@ -4894,7 +5172,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,  #endif  	bio->bi_bdev = dev->bdev;  	if (async) -		schedule_bio(root, dev, rw, bio); +		btrfs_schedule_bio(root, dev, rw, bio);  	else  		btrfsic_submit_bio(rw, bio);  } @@ -4953,6 +5231,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	u64 logical = (u64)bio->bi_sector << 9;  	u64 length = 0;  	u64 map_length; +	u64 *raid_map = NULL;  	int ret;  	int dev_nr = 0;  	int total_devs = 1; @@ -4961,12 +5240,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	length = bio->bi_size;  	map_length = length; -	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, -			      mirror_num); -	if (ret) +	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, +			      mirror_num, &raid_map); +	if (ret) /* -ENOMEM */  		return ret;  	total_devs = bbio->num_stripes; +	bbio->orig_bio = first_bio; +	bbio->private = first_bio->bi_private; +	bbio->end_io = first_bio->bi_end_io; +	atomic_set(&bbio->stripes_pending, bbio->num_stripes); + +	if (raid_map) { +		/* In this case, map_length has been set to the length of +		   a single stripe; not the whole write */ +		if (rw & WRITE) { +			return raid56_parity_write(root, bio, bbio, +						   raid_map, map_length); +		} else { +			return raid56_parity_recover(root, bio, bbio, +						     raid_map, map_length, +						     mirror_num); +		} +	} +  	if (map_length < length) {  		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "  		       "len %llu\n", (unsigned long long)logical, @@ -4975,11 +5272,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		BUG();  	} -	bbio->orig_bio = first_bio; -	bbio->private = first_bio->bi_private; -	bbio->end_io = first_bio->bi_end_io; -	atomic_set(&bbio->stripes_pending, bbio->num_stripes); -  	while (dev_nr < total_devs) {  		dev = bbio->stripes[dev_nr].dev;  		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h 
index 12bb84166a5..062d8604d35 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,  void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,  					      struct btrfs_device *tgtdev);  int btrfs_scratch_superblock(struct btrfs_device *device); - +void btrfs_schedule_bio(struct btrfs_root *root, +			struct btrfs_device *device, +			int rw, struct bio *bio); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +			   u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, +				    struct btrfs_mapping_tree *map_tree, +				    u64 logical);  static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 				      int index)  {
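
To tie the mapping path together: for a RAID5/6 write or recovery, __btrfs_map_block() fills raid_map with the logical address held by each on-disk stripe slot, placing P (and Q) after the data and rotating the whole set by one slot per full stripe; sort_parity_stripes() then uses those values to push the parity stripes to the end of the bbio. A user-space sketch of that layout, with arbitrary sentinel values standing in for the P/Q markers:

#include <stdint.h>

#define DEMO_P_STRIPE ((uint64_t)-2)
#define DEMO_Q_STRIPE ((uint64_t)-1)

/*
 * fill map[num_stripes] for one full stripe: data addresses first,
 * then P (and Q when there are two parity stripes), rotated by one
 * slot for every full stripe below this one.
 */
static void demo_fill_raid_map(uint64_t *map, uint64_t chunk_start,
			       uint64_t stripe_len, uint64_t full_stripe_nr,
			       int num_stripes, int nr_data)
{
	int rot = full_stripe_nr % num_stripes;
	int i;

	for (i = 0; i < nr_data; i++)
		map[(i + rot) % num_stripes] =
			chunk_start + (full_stripe_nr * nr_data + i) * stripe_len;

	map[(i + rot) % num_stripes] = DEMO_P_STRIPE;	/* i == nr_data here */
	if (num_stripes - nr_data == 2)
		map[(i + rot + 1) % num_stripes] = DEMO_Q_STRIPE;
}
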