-rw-r--r--  drivers/md/raid5.c | 168
-rw-r--r--  drivers/md/raid5.h |   1
2 files changed, 166 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81c02d63440..74dcf19cfe6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
+			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				tx = async_copy_data(1, wbi, dev->page,
-					dev->sector, tx);
+				if (wbi->bi_rw & REQ_DISCARD) {
+					memset(page_address(dev->page), 0,
+						STRIPE_SECTORS << 9);
+					set_bit(R5_Discard, &dev->flags);
+				} else
+					tx = async_copy_data(1, wbi, dev->page,
+						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (pd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
 	struct async_submit_ctl submit;
 	struct page **blocks = percpu->scribble;
-	int count;
+	int count, i;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (sh->pd_idx == i || sh->qd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[sh->pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		memset(page_address(sh->dev[sh->qd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
+
 	count = set_syndrome_sources(blocks, sh);
 
 	atomic_inc(&sh->count);
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev,
 		release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t logical_sector, last_sector;
+	struct stripe_head *sh;
+	int remaining;
+	int stripe_sectors;
+
+	if (mddev->reshape_position != MaxSector)
+		/* Skip discard while reshape is happening */
+		return;
+
+	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	bi->bi_next = NULL;
+	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+	stripe_sectors = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+					       stripe_sectors);
+	sector_div(last_sector, stripe_sectors);
+
+	logical_sector *= conf->chunk_sectors;
+	last_sector *= conf->chunk_sectors;
+
+	for (; logical_sector < last_sector;
+	     logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
+		int d;
+	again:
+		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+		prepare_to_wait(&conf->wait_for_overlap, &w,
+				TASK_UNINTERRUPTIBLE);
+		spin_lock_irq(&sh->stripe_lock);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			if (sh->dev[d].towrite || sh->dev[d].toread) {
+				set_bit(R5_Overlap, &sh->dev[d].flags);
+				spin_unlock_irq(&sh->stripe_lock);
+				release_stripe(sh);
+				schedule();
+				goto again;
+			}
+		}
+		finish_wait(&conf->wait_for_overlap, &w);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			sh->dev[d].towrite = bi;
+			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+			raid5_inc_bi_active_stripes(bi);
+		}
+		spin_unlock_irq(&sh->stripe_lock);
+		if (conf->mddev->bitmap) {
+			for (d = 0;
+			     d < conf->raid_disks - conf->max_degraded;
+			     d++)
+				bitmap_startwrite(mddev->bitmap,
+						  sh->sector,
+						  STRIPE_SECTORS,
+						  0);
+			sh->bm_seq = conf->seq_flush + 1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		clear_bit(STRIPE_DELAYED, &sh->state);
+		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			atomic_inc(&conf->preread_active_stripes);
+		release_stripe_plug(mddev, sh);
+	}
+
+	remaining = raid5_dec_bi_active_stripes(bi);
+	if (remaining == 0) {
+		md_write_end(mddev);
+		bio_endio(bi, 0);
+	}
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	     chunk_aligned_read(mddev,bi))
 		return;
 
+	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+		make_discard_request(mddev, bi);
+		return;
+	}
+
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
@@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
+		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev)
 		blk_queue_io_min(mddev->queue, chunk_size);
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
+		/*
+		 * We can only discard a whole stripe. It doesn't make sense to
+		 * discard the data disks but write the parity disk.
+		 */
+		stripe = stripe * PAGE_SIZE;
+		mddev->queue->limits.discard_alignment = stripe;
+		mddev->queue->limits.discard_granularity = stripe;
+		/*
+		 * unaligned part of discard request will be ignored, so can't
+		 * guarantee discard_zeroes_data
+		 */
+		mddev->queue->limits.discard_zeroes_data = 0;
 
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
+			/*
+			 * discard_zeroes_data is required, otherwise data
+			 * could be lost. Consider a scenario: discard a stripe
+			 * (the stripe could be inconsistent if
+			 * discard_zeroes_data is 0); write one disk of the
+			 * stripe (the stripe could be inconsistent again
+			 * depending on which disks are used to calculate
+			 * parity); the disk is broken; the stripe data of this
+			 * disk is lost.
+			 */
+			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+			    !bdev_get_queue(rdev->bdev)->
+						limits.discard_zeroes_data)
+				discard_supported = false;
 		}
+
+		if (discard_supported &&
+		   mddev->queue->limits.max_discard_sectors >= stripe &&
+		   mddev->queue->limits.discard_granularity >= stripe)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
 	}
 
 	return 0;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc24901ed..18b2c4a8a1f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -298,6 +298,7 @@ enum r5dev_flags {
 	R5_WantReplace, /* We need to update the replacement, we have read
 			 * data in, and now is a good time to write it out.
 			 */
+	R5_Discard,	/* Discard the stripe */
 };
 
 /*
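The core of the patch is the stripe-rounding arithmetic in make_discard_request(): the discard range is shrunk to whole stripes, and the unaligned head and tail are silently dropped, which is exactly why the queue reports discard_zeroes_data = 0. The standalone C sketch below re-derives that arithmetic in userspace; the geometry (chunk size, disk count) and the example request are made-up assumptions, not values from the patch.

/*
 * Userspace re-derivation of the rounding in make_discard_request().
 * All geometry below is an assumed example (RAID5 over 4 disks with
 * 64KiB chunks); only the arithmetic mirrors the patch.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t chunk_sectors = 128;		/* 64KiB chunk, assumed */
	int raid_disks = 4, max_degraded = 1;	/* RAID5, 4 disks, assumed */

	/* Data sectors covered by one full stripe. */
	sector_t stripe_sectors = chunk_sectors * (raid_disks - max_degraded);

	/* Example discard request: 1000 sectors starting at sector 300. */
	sector_t first = 300, last = 300 + 1000;

	/* Round the start up and the end down to full-stripe boundaries,
	 * yielding stripe numbers; the partial stripes at both ends are
	 * simply not discarded (hence discard_zeroes_data = 0). */
	first = (first + stripe_sectors - 1) / stripe_sectors;	/* DIV_ROUND_UP_SECTOR_T */
	last /= stripe_sectors;					/* sector_div */

	/* One full stripe advances every data disk by chunk_sectors,
	 * so stripe numbers become per-device sector offsets. */
	first *= chunk_sectors;
	last *= chunk_sectors;

	/* Prints "per-device discard range: [128, 384)": the request
	 * [300, 1300) fully covers stripes 1 and 2 only. */
	printf("per-device discard range: [%llu, %llu)\n", first, last);
	return 0;
}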
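On the setup side in run(), discard is advertised only when it is safe: discard_alignment and discard_granularity are set to a full stripe of data (data disks x PAGE_SIZE), since discarding less than a stripe would still force a parity rewrite, and QUEUE_FLAG_DISCARD is raised only if every member device both supports discard and guarantees discard_zeroes_data; without that guarantee a discarded stripe could read back with parity that no longer matches the data, and a later rebuild of a failed disk would reconstruct garbage. Once the array is assembled, the advertised limits can be inspected from userspace via the usual queue sysfs attributes, e.g. /sys/block/md0/queue/discard_granularity (md0 being just an example device name).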