diff options
Diffstat (limited to 'drivers/md/raid10.c')
| -rw-r--r-- | drivers/md/raid10.c | 97 | 
1 file changed, 75 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03..77b562d18a9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -38,21 +38,36 @@   *    near_copies (stored in low byte of layout)   *    far_copies (stored in second byte of layout)   *    far_offset (stored in bit 16 of layout ) + *    use_far_sets (stored in bit 17 of layout )   * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize.  Each device + * is divided into far_copies sections.   In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive).  The starting device for each section is offset + * near_copies from the starting device of the previous section.  Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive.  near_copies and far_copies must be at least one, and their product + * is at most raid_disks.   *   * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true.  
In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size.  The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array.  This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + *    A B C D    A B C D E + *      ...         ... + *    D A B C    E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + *    [A B] [C D]    [A B] [C D E] + *    |...| |...|    |...| | ... | + *    [B A] [D C]    [B A] [E C D]   */  /* @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)  	sector_t stripe;  	int dev;  	int slot = 0; +	int last_far_set_start, last_far_set_size; + +	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; +	last_far_set_start *= geo->far_set_size; + +	last_far_set_size = geo->far_set_size; +	last_far_set_size += (geo->raid_disks % geo->far_set_size);  	/* now calculate first sector/dev */  	chunk = r10bio->sector >> geo->chunk_shift; @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)  	/* and calculate all the others */  	for (n = 0; n < geo->near_copies; n++) {  		int d = dev; +		int set;  		sector_t s = sector; -		r10bio->devs[slot].addr = sector;  		r10bio->devs[slot].devnum = d; +		r10bio->devs[slot].addr = s;  		slot++;  		for (f = 1; f < geo->far_copies; f++) { +			set = d / geo->far_set_size;  			d += geo->near_copies; -			if (d >= geo->raid_disks) -				d -= geo->raid_disks; + +			if ((geo->raid_disks % geo->far_set_size) && +			    (d > last_far_set_start)) { +				d -= last_far_set_start; +				d %= last_far_set_size; +				d += last_far_set_start; +			} else { +				d %= geo->far_set_size; +				d += geo->far_set_size * set; +			}  			s += 
geo->stride;  			r10bio->devs[slot].devnum = d;  			r10bio->devs[slot].addr = s; @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)  	 * or recovery, so reshape isn't happening  	 */  	struct geom *geo = &conf->geo; +	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; +	int far_set_size = geo->far_set_size; +	int last_far_set_start; + +	if (geo->raid_disks % geo->far_set_size) { +		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; +		last_far_set_start *= geo->far_set_size; + +		if (dev >= last_far_set_start) { +			far_set_size = geo->far_set_size; +			far_set_size += (geo->raid_disks % geo->far_set_size); +			far_set_start = last_far_set_start; +		} +	}  	offset = sector & geo->chunk_mask;  	if (geo->far_offset) { @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)  		chunk = sector >> geo->chunk_shift;  		fc = sector_div(chunk, geo->far_copies);  		dev -= fc * geo->near_copies; -		if (dev < 0) -			dev += geo->raid_disks; +		if (dev < far_set_start) +			dev += far_set_size;  	} else {  		while (sector >= geo->stride) {  			sector -= geo->stride; -			if (dev < geo->near_copies) -				dev += geo->raid_disks - geo->near_copies; +			if (dev < (geo->near_copies + far_set_start)) +				dev += far_set_size - geo->near_copies;  			else  				dev -= geo->near_copies;  		} @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)  		bio_list_merge(&conf->pending_bio_list, &plug->pending);  		conf->pending_count += plug->pending_cnt;  		spin_unlock_irq(&conf->device_lock); +		wake_up(&conf->wait_barrier);  		md_wakeup_thread(mddev->thread);  		kfree(plug);  		return; @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)  	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);  	const unsigned long do_discard = (bio->bi_rw  					  & (REQ_DISCARD | REQ_SECURE)); +	const unsigned long 
do_same = (bio->bi_rw & REQ_WRITE_SAME);  	unsigned long flags;  	struct md_rdev *blocked_rdev;  	struct blk_plug_cb *cb; @@ -1460,7 +1508,8 @@ retry_write:  							      rdev));  			mbio->bi_bdev = rdev->bdev;  			mbio->bi_end_io	= raid10_end_write_request; -			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; +			mbio->bi_rw = +				WRITE | do_sync | do_fua | do_discard | do_same;  			mbio->bi_private = r10_bio;  			atomic_inc(&r10_bio->remaining); @@ -1502,7 +1551,8 @@ retry_write:  						   r10_bio, rdev));  			mbio->bi_bdev = rdev->bdev;  			mbio->bi_end_io	= raid10_end_write_request; -			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; +			mbio->bi_rw = +				WRITE | do_sync | do_fua | do_discard | do_same;  			mbio->bi_private = r10_bio;  			atomic_inc(&r10_bio->remaining); @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)  		disks = mddev->raid_disks + mddev->delta_disks;  		break;  	} -	if (layout >> 17) +	if (layout >> 18)  		return -1;  	if (chunk < (PAGE_SIZE >> 9) ||  	    !is_power_of_2(chunk)) @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)  	geo->near_copies = nc;  	geo->far_copies = fc;  	geo->far_offset = fo; +	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;  	geo->chunk_mask = chunk - 1;  	geo->chunk_shift = ffz(~chunk);  	return nc*fc; @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)  	if (mddev->queue) {  		blk_queue_max_discard_sectors(mddev->queue,  					      mddev->chunk_sectors); +		blk_queue_max_write_same_sectors(mddev->queue, +						 mddev->chunk_sectors);  		blk_queue_io_min(mddev->queue, chunk_size);  		if (conf->geo.raid_disks % conf->geo.near_copies)  			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);  |