| author | John W. Linville <linville@tuxdriver.com> | 2012-09-07 15:07:55 -0400 |
|---|---|---|
| committer | John W. Linville <linville@tuxdriver.com> | 2012-09-07 15:07:55 -0400 |
| commit | fac805f8c198092de9a2842efd7f5022e2937b18 (patch) | |
| tree | 7557809c373f97a343c427d8fded0696060394ce /drivers/md/raid1.c | |
| parent | 2461c7d60f9f3821274e4acf9019cba8b82c94b5 (diff) | |
| parent | f10723841e624c0726c70356b31d91befed01dd6 (diff) | |
| download | olio-linux-3.10-fac805f8c198092de9a2842efd7f5022e2937b18.tar.xz | olio-linux-3.10-fac805f8c198092de9a2842efd7f5022e2937b18.zip |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless
Diffstat (limited to 'drivers/md/raid1.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/md/raid1.c | 167 |

1 file changed, 122 insertions, 45 deletions
```diff
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d686..9f7f8bee844 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context.  So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks * 2)
-			disk -= conf->raid_disks * 2;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
+			best_disk = disk;
+			break;
+		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
 			best_disk = disk;
 			break;
 		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -2173,8 +2247,7 @@ static void raid1d(struct mddev *mddev)
 	blk_start_plug(&plug);
 	for (;;) {
 
-		if (atomic_read(&mddev->plug_cnt) == 0)
-			flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -2371,6 +2444,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 				bio->bi_rw = READ;
 				bio->bi_end_io = end_sync_read;
 				read_targets++;
+			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+				/*
+				 * The device is suitable for reading (InSync),
+				 * but has bad block(s) here. Let's try to correct them,
+				 * if we are doing resync or repair. Otherwise, leave
+				 * this device alone for this sync request.
+				 */
+				bio->bi_rw = WRITE;
+				bio->bi_end_io = end_sync_write;
+				write_targets++;
 			}
 		}
 		if (bio->bi_end_io) {
@@ -2428,7 +2513,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		sector_t rv = max_sector - sector_nr;
+		sector_t rv;
+		if (min_bad > 0)
+			max_sector = sector_nr + min_bad;
+		rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
 		return rv;
@@ -2521,7 +2609,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2529,7 +2617,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2572,6 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2585,7 +2674,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		disk = conf->mirrors + i;
@@ -2611,19 +2699,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
@@ -2798,7 +2876,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2841,7 +2919,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2880,7 +2958,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
```
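The heart of this merge for raid1.c is the reworked read_balance() policy: sequential reads stick to the mirror they started on, an otherwise idle disk wins outright, and when every disk is busy the fallback is the least-loaded member if any disk is an SSD, or the shortest seek distance if the array is all-rotational. Below is a minimal user-space sketch of that selection logic; every name in it (struct disk_state, pick_read_disk, and so on) is a hypothetical stand-in rather than kernel API, and RCU, bad-block handling, the seq_start initialization guard, and the choose_first resync path are deliberately omitted.

```c
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>

struct disk_state {
    long head_position;   /* sector where the last IO left the head */
    long next_seq_sect;   /* sector right after the previous read */
    long seq_start;       /* first sector of the current sequential run */
    unsigned int pending; /* in-flight requests (nr_pending in the kernel) */
    bool nonrot;          /* non-rotational (SSD) */
};

/* Pick a disk for a read at 'sector', mirroring the three-tier policy:
 * sequential affinity first, then any idle disk, then least-pending
 * (if an SSD is present) or shortest seek distance (all rotational).
 */
int pick_read_disk(struct disk_state *d, int ndisks,
                   long sector, long opt_iosize)
{
    int best = -1, best_dist_disk = -1, best_pending_disk = -1;
    long best_dist = LONG_MAX;
    unsigned int min_pending = UINT_MAX;
    bool has_nonrot = false, choose_next_idle = false;

    for (int i = 0; i < ndisks; i++) {
        long dist = labs(sector - d[i].head_position);

        has_nonrot |= d[i].nonrot;

        /* Sequential read: stay on the same disk, unless it is an SSD
         * whose current run already exceeds the optimal IO size; then
         * keep it as a fallback but hunt for an idle disk instead. */
        if (d[i].next_seq_sect == sector || dist == 0) {
            best = i;
            if (d[i].nonrot && opt_iosize > 0 &&
                d[i].next_seq_sect - opt_iosize >= d[i].seq_start) {
                choose_next_idle = true;
                continue;
            }
            break;
        }

        /* A completely idle disk always wins. */
        if (d[i].pending == 0) {
            best = i;
            break;
        }

        /* Once we are hunting for an idle disk, nothing else matters. */
        if (choose_next_idle)
            continue;

        if (d[i].pending < min_pending) {
            min_pending = d[i].pending;
            best_pending_disk = i;
        }
        if (dist < best_dist) {
            best_dist = dist;
            best_dist_disk = i;
        }
    }

    /* Fallback when no disk won outright: least-pending if any member
     * is an SSD, shortest seek distance if the array is all-rotational. */
    if (best == -1)
        best = has_nonrot ? best_pending_disk : best_dist_disk;
    return best;
}
```

The choose_next_idle flag captures the SSD-specific tweak from the diff: once a sequential run on a non-rotational disk has grown past the device's optimal IO size (bdev_io_opt() in the real code), only a fully idle disk may take over, and the pending/seek-distance bookkeeping is skipped so a rotational member's head position cannot steal the stream mid-run.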