Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                            |  11
-rw-r--r--  drivers/md/dm-bufio.c                         |   2
-rw-r--r--  drivers/md/dm-cache-metadata.c                |  64
-rw-r--r--  drivers/md/dm-cache-metadata.h                |   2
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c          |   7
-rw-r--r--  drivers/md/dm-cache-policy-internal.h         |   2
-rw-r--r--  drivers/md/dm-cache-policy-mq.c               |   8
-rw-r--r--  drivers/md/dm-cache-policy.c                  |   8
-rw-r--r--  drivers/md/dm-cache-policy.h                  |   2
-rw-r--r--  drivers/md/dm-cache-target.c                  | 169
-rw-r--r--  drivers/md/dm-raid.c                          | 123
-rw-r--r--  drivers/md/dm-thin.c                          |  11
-rw-r--r--  drivers/md/dm-verity.c                        |  39
-rw-r--r--  drivers/md/md.c                               |  25
-rw-r--r--  drivers/md/md.h                               |   4
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c  |  46
-rw-r--r--  drivers/md/raid0.c                            |  13
-rw-r--r--  drivers/md/raid1.c                            |   8
-rw-r--r--  drivers/md/raid10.c                           |  97
-rw-r--r--  drivers/md/raid10.h                           |   5
-rw-r--r--  drivers/md/raid5.c                            | 154
-rw-r--r--  drivers/md/raid5.h                            |   5
22 files changed, 557 insertions, 248 deletions
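
Several of the dm-raid.c hunks below translate between the dm-raid "raid10_format" table parameter and MD's packed RAID10 layout word (near_copies in bits 0-7, far_copies in bits 8-15, far_offset in bit 16, use_far_sets in bit 17). As a standalone illustration only (a user-space rework of the helpers added by the patch, not the kernel code itself), the following sketch shows how the near/far/offset formats round-trip through that encoding:

/*
 * User-space sketch of the MD RAID10 layout word handled by the dm-raid
 * hunks below.  Bit positions and helper logic mirror the patch; the
 * program itself is illustrative and not part of the kernel.
 */
#include <stdio.h>
#include <string.h>

static const char *layout_to_format(int layout)
{
	if ((layout & 0x10000) && (layout & 0x20000))
		return "offset";
	if ((layout & 0xFF) > 1)
		return "near";
	return "far";
}

static unsigned layout_to_copies(int layout)
{
	if ((layout & 0xFF) > 1)
		return layout & 0xFF;
	return (layout >> 8) & 0xFF;
}

static int format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;
	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;
	return (f << 8) | n;
}

int main(void)
{
	const char *formats[] = { "near", "far", "offset" };

	for (int i = 0; i < 3; i++) {
		int layout = format_to_layout(formats[i], 2);

		printf("%-6s copies=2 -> layout=0x%05x -> %s, %u copies\n",
		       formats[i], (unsigned)layout,
		       layout_to_format(layout), layout_to_copies(layout));
	}
	return 0;
}
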
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e30b490055a..4d8d90b4fe7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -154,17 +154,6 @@ config MD_RAID456  	  If unsure, say Y. -config MULTICORE_RAID456 -	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" -	depends on MD_RAID456 -	depends on SMP -	depends on EXPERIMENTAL -	---help--- -	  Enable the raid456 module to dispatch per-stripe raid operations to a -	  thread pool. - -	  If unsure, say N. -  config MD_MULTIPATH  	tristate "Multipath I/O support"  	depends on BLK_DEV_MD diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 3c955e10a61..c6083132c4b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,  {  	struct blk_plug plug; +	BUG_ON(dm_bufio_in_request()); +  	blk_start_plug(&plug);  	dm_bufio_lock(c); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index fbd3625f274..83e995fece8 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -83,6 +83,8 @@ struct cache_disk_superblock {  	__le32 read_misses;  	__le32 write_hits;  	__le32 write_misses; + +	__le32 policy_version[CACHE_POLICY_VERSION_SIZE];  } __packed;  struct dm_cache_metadata { @@ -109,6 +111,7 @@ struct dm_cache_metadata {  	bool clean_when_opened:1;  	char policy_name[CACHE_POLICY_NAME_SIZE]; +	unsigned policy_version[CACHE_POLICY_VERSION_SIZE];  	size_t policy_hint_size;  	struct dm_cache_statistics stats;  }; @@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)  	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));  	disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);  	disk_super->version = cpu_to_le32(CACHE_VERSION); -	memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); +	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); +	memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));  	disk_super->policy_hint_size = 0;  	r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, @@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)  	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);  	disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);  	disk_super->cache_blocks = cpu_to_le32(0); -	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));  	disk_super->read_hits = cpu_to_le32(0);  	disk_super->read_misses = cpu_to_le32(0); @@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,  	cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);  	cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));  	strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); +	cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); +	cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); +	cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);  	cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);  	cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); @@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,  	disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));  	disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));  	strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); +	
disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); +	disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); +	disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);  	disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);  	disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -854,18 +863,43 @@ struct thunk {  	bool hints_valid;  }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, +			     struct dm_cache_policy *policy) +{ +	const char *policy_name = dm_cache_policy_get_name(policy); +	const unsigned *policy_version = dm_cache_policy_get_version(policy); +	size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + +	/* +	 * Ensure policy names match. +	 */ +	if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) +		return false; + +	/* +	 * Ensure policy major versions match. +	 */ +	if (cmd->policy_version[0] != policy_version[0]) +		return false; + +	/* +	 * Ensure policy hint sizes match. +	 */ +	if (cmd->policy_hint_size != policy_hint_size) +		return false; + +	return true; +} +  static bool hints_array_initialized(struct dm_cache_metadata *cmd)  {  	return cmd->hint_root && cmd->policy_hint_size;  }  static bool hints_array_available(struct dm_cache_metadata *cmd, -				  const char *policy_name) +				  struct dm_cache_policy *policy)  { -	bool policy_names_match = !strncmp(cmd->policy_name, policy_name, -					   sizeof(cmd->policy_name)); - -	return cmd->clean_when_opened && policy_names_match && +	return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&  		hints_array_initialized(cmd);  } @@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf)  	return r;  } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, +			   struct dm_cache_policy *policy,  			   load_mapping_fn fn, void *context)  {  	struct thunk thunk; @@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam  	thunk.cmd = cmd;  	thunk.respect_dirty_flags = cmd->clean_when_opened; -	thunk.hints_valid = hints_array_available(cmd, policy_name); +	thunk.hints_valid = hints_array_available(cmd, policy);  	return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);  } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, +			   struct dm_cache_policy *policy,  			   load_mapping_fn fn, void *context)  {  	int r;  	down_read(&cmd->root_lock); -	r = __load_mappings(cmd, policy_name, fn, context); +	r = __load_mappings(cmd, policy, fn, context);  	up_read(&cmd->root_lock);  	return r; @@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty  		/* nothing to be done */  		return 0; -	value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); +	value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? 
M_DIRTY : 0));  	__dm_bless_for_disk(&value);  	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), @@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po  	__le32 value;  	size_t hint_size;  	const char *policy_name = dm_cache_policy_get_name(policy); +	const unsigned *policy_version = dm_cache_policy_get_version(policy);  	if (!policy_name[0] ||  	    (strlen(policy_name) > sizeof(cmd->policy_name) - 1))  		return -EINVAL; -	if (strcmp(cmd->policy_name, policy_name)) { +	if (!policy_unchanged(cmd, policy)) {  		strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); +		memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));  		hint_size = dm_cache_policy_get_hint_size(policy);  		if (!hint_size) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0ee..f45cef21f3d 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,  			       dm_cblock_t cblock, bool dirty,  			       uint32_t hint, bool hint_valid);  int dm_cache_load_mappings(struct dm_cache_metadata *cmd, -			   const char *policy_name, +			   struct dm_cache_policy *policy,  			   load_mapping_fn fn,  			   void *context); diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index cc05d70b3cb..b04d1f904d0 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -17,7 +17,6 @@  /*----------------------------------------------------------------*/  #define DM_MSG_PREFIX "cache cleaner" -#define CLEANER_VERSION "1.0.0"  /* Cache entry struct. */  struct wb_cache_entry { @@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,  static struct dm_cache_policy_type wb_policy_type = {  	.name = "cleaner", +	.version = {1, 0, 0},  	.hint_size = 0,  	.owner = THIS_MODULE,  	.create = wb_create @@ -446,7 +446,10 @@ static int __init wb_init(void)  	if (r < 0)  		DMERR("register failed %d", r);  	else -		DMINFO("version " CLEANER_VERSION " loaded"); +		DMINFO("version %u.%u.%u loaded", +		       wb_policy_type.version[0], +		       wb_policy_type.version[1], +		       wb_policy_type.version[2]);  	return r;  } diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 52a75beeced..0928abdc49f 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p);   */  const char *dm_cache_policy_get_name(struct dm_cache_policy *p); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); +  size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);  /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 96415325507..dc112a7137f 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -14,7 +14,6 @@  #include <linux/vmalloc.h>  #define DM_MSG_PREFIX "cache-policy-mq" -#define MQ_VERSION	"1.0.0"  static struct kmem_cache *mq_entry_cache; @@ -1133,6 +1132,7 @@ bad_cache_alloc:  static struct dm_cache_policy_type mq_policy_type = {  	.name = "mq", +	.version = {1, 0, 0},  	.hint_size = 4,  	.owner = THIS_MODULE,  	.create = mq_create @@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = {  static struct 
dm_cache_policy_type default_policy_type = {  	.name = "default", +	.version = {1, 0, 0},  	.hint_size = 4,  	.owner = THIS_MODULE,  	.create = mq_create @@ -1164,7 +1165,10 @@ static int __init mq_init(void)  	r = dm_cache_policy_register(&default_policy_type);  	if (!r) { -		DMINFO("version " MQ_VERSION " loaded"); +		DMINFO("version %u.%u.%u loaded", +		       mq_policy_type.version[0], +		       mq_policy_type.version[1], +		       mq_policy_type.version[2]);  		return 0;  	} diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 2cbf5fdaac5..21c03c570c0 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)  }  EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ +	struct dm_cache_policy_type *t = p->private; + +	return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); +  size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)  {  	struct dm_cache_policy_type *t = p->private; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index f0f51b26054..558bdfdabf5 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -196,6 +196,7 @@ struct dm_cache_policy {   * We maintain a little register of the different policy types.   */  #define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3  struct dm_cache_policy_type {  	/* For use by the register code only. */ @@ -206,6 +207,7 @@ struct dm_cache_policy_type {  	 * what gets passed on the target line to select your policy.  	 */  	char name[CACHE_POLICY_NAME_SIZE]; +	unsigned version[CACHE_POLICY_VERSION_SIZE];  	/*  	 * Policies may store a hint for each each cache block. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c3..66120bd46d1 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -142,6 +142,7 @@ struct cache {  	spinlock_t lock;  	struct bio_list deferred_bios;  	struct bio_list deferred_flush_bios; +	struct bio_list deferred_writethrough_bios;  	struct list_head quiesced_migrations;  	struct list_head completed_migrations;  	struct list_head need_commit_migrations; @@ -158,7 +159,7 @@ struct cache {  	/*  	 * origin_blocks entries, discarded if set.  	 
*/ -	sector_t discard_block_size; /* a power of 2 times sectors per block */ +	uint32_t discard_block_size; /* a power of 2 times sectors per block */  	dm_dblock_t discard_nr_blocks;  	unsigned long *discard_bitset; @@ -199,6 +200,11 @@ struct per_bio_data {  	bool tick:1;  	unsigned req_nr:2;  	struct dm_deferred_entry *all_io_entry; + +	/* writethrough fields */ +	struct cache *cache; +	dm_cblock_t cblock; +	bio_end_io_t *saved_bi_end_io;  };  struct dm_cache_migration { @@ -412,17 +418,24 @@ static bool block_size_is_power_of_two(struct cache *cache)  	return cache->sectors_per_block_shift >= 0;  } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ +	do_div(b, n); + +	return b; +} +  static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)  { -	sector_t discard_blocks = cache->discard_block_size; +	uint32_t discard_blocks = cache->discard_block_size;  	dm_block_t b = from_oblock(oblock);  	if (!block_size_is_power_of_two(cache)) -		(void) sector_div(discard_blocks, cache->sectors_per_block); +		discard_blocks = discard_blocks / cache->sectors_per_block;  	else  		discard_blocks >>= cache->sectors_per_block_shift; -	(void) sector_div(b, discard_blocks); +	b = block_div(b, discard_blocks);  	return to_dblock(b);  } @@ -609,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio)  	spin_unlock_irqrestore(&cache->lock, flags);  } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ +	unsigned long flags; + +	spin_lock_irqsave(&cache->lock, flags); +	bio_list_add(&cache->deferred_writethrough_bios, bio); +	spin_unlock_irqrestore(&cache->lock, flags); + +	wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ +	struct per_bio_data *pb = get_per_bio_data(bio); +	bio->bi_end_io = pb->saved_bi_end_io; + +	if (err) { +		bio_endio(bio, err); +		return; +	} + +	remap_to_cache(pb->cache, bio, pb->cblock); + +	/* +	 * We can't issue this bio directly, since we're in interrupt +	 * context.  So it get's put on a bio list for processing by the +	 * worker thread. +	 */ +	defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices.  In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. + */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, +				       dm_oblock_t oblock, dm_cblock_t cblock) +{ +	struct per_bio_data *pb = get_per_bio_data(bio); + +	pb->cache = cache; +	pb->cblock = cblock; +	pb->saved_bi_end_io = bio->bi_end_io; +	bio->bi_end_io = writethrough_endio; + +	remap_to_origin_clear_discard(pb->cache, bio, oblock); +} +  /*----------------------------------------------------------------   * Migration processing   * @@ -1002,7 +1065,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio)  	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);  	dm_block_t b; -	(void) sector_div(end_block, cache->discard_block_size); +	end_block = block_div(end_block, cache->discard_block_size);  	for (b = start_block; b < end_block; b++)  		set_discard(cache, to_dblock(b)); @@ -1070,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,  		inc_hit_counter(cache, bio);  		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -		if (is_writethrough_io(cache, bio, lookup_result.cblock)) { -			/* -			 * No need to mark anything dirty in write through mode. 
-			 */ -			pb->req_nr == 0 ? -				remap_to_cache(cache, bio, lookup_result.cblock) : -				remap_to_origin_clear_discard(cache, bio, block); -		} else +		if (is_writethrough_io(cache, bio, lookup_result.cblock)) +			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); +		else  			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);  		issue(cache, bio); @@ -1086,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,  	case POLICY_MISS:  		inc_miss_counter(cache, bio);  		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - -		if (pb->req_nr != 0) { -			/* -			 * This is a duplicate writethrough io that is no -			 * longer needed because the block has been demoted. -			 */ -			bio_endio(bio, 0); -		} else { -			remap_to_origin_clear_discard(cache, bio, block); -			issue(cache, bio); -		} +		remap_to_origin_clear_discard(cache, bio, block); +		issue(cache, bio);  		break;  	case POLICY_NEW: @@ -1217,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)  		submit_bios ? generic_make_request(bio) : bio_io_error(bio);  } +static void process_deferred_writethrough_bios(struct cache *cache) +{ +	unsigned long flags; +	struct bio_list bios; +	struct bio *bio; + +	bio_list_init(&bios); + +	spin_lock_irqsave(&cache->lock, flags); +	bio_list_merge(&bios, &cache->deferred_writethrough_bios); +	bio_list_init(&cache->deferred_writethrough_bios); +	spin_unlock_irqrestore(&cache->lock, flags); + +	while ((bio = bio_list_pop(&bios))) +		generic_make_request(bio); +} +  static void writeback_some_dirty_blocks(struct cache *cache)  {  	int r = 0; @@ -1313,6 +1379,7 @@ static int more_work(struct cache *cache)  	else  		return !bio_list_empty(&cache->deferred_bios) ||  			!bio_list_empty(&cache->deferred_flush_bios) || +			!bio_list_empty(&cache->deferred_writethrough_bios) ||  			!list_empty(&cache->quiesced_migrations) ||  			!list_empty(&cache->completed_migrations) ||  			!list_empty(&cache->need_commit_migrations); @@ -1331,6 +1398,8 @@ static void do_worker(struct work_struct *ws)  		writeback_some_dirty_blocks(cache); +		process_deferred_writethrough_bios(cache); +  		if (commit_if_needed(cache)) {  			process_deferred_flush_bios(cache, false); @@ -1756,8 +1825,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,  	}  	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); -	if (r) +	if (r) { +		*error = "Error setting cache policy's config values";  		dm_cache_policy_destroy(cache->policy); +		cache->policy = NULL; +	}  	return r;  } @@ -1793,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,  #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); -  static int cache_create(struct cache_args *ca, struct cache **result)  {  	int r = 0; @@ -1821,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)  	memcpy(&cache->features, &ca->features, sizeof(cache->features)); -	if (cache->features.write_through) -		ti->num_write_bios = cache_num_write_bios; -  	cache->callbacks.congested_fn = cache_is_congested;  	dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1835,7 +1902,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)  	/* FIXME: factor out this whole section */  	origin_blocks = cache->origin_sectors = ca->origin_sectors; -	(void) sector_div(origin_blocks, ca->block_size); +	origin_blocks 
= block_div(origin_blocks, ca->block_size);  	cache->origin_blocks = to_oblock(origin_blocks);  	cache->sectors_per_block = ca->block_size; @@ -1848,7 +1915,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)  		dm_block_t cache_size = ca->cache_sectors;  		cache->sectors_per_block_shift = -1; -		(void) sector_div(cache_size, ca->block_size); +		cache_size = block_div(cache_size, ca->block_size);  		cache->cache_size = to_cblock(cache_size);  	} else {  		cache->sectors_per_block_shift = __ffs(ca->block_size); @@ -1873,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)  	spin_lock_init(&cache->lock);  	bio_list_init(&cache->deferred_bios);  	bio_list_init(&cache->deferred_flush_bios); +	bio_list_init(&cache->deferred_writethrough_bios);  	INIT_LIST_HEAD(&cache->quiesced_migrations);  	INIT_LIST_HEAD(&cache->completed_migrations);  	INIT_LIST_HEAD(&cache->need_commit_migrations); @@ -2002,6 +2070,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)  		goto out;  	r = cache_create(ca, &cache); +	if (r) +		goto out;  	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);  	if (r) { @@ -2016,20 +2086,6 @@ out:  	return r;  } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ -	int r; -	struct cache *cache = ti->private; -	dm_oblock_t block = get_bio_block(cache, bio); -	dm_cblock_t cblock; - -	r = policy_lookup(cache->policy, block, &cblock); -	if (r < 0) -		return 2;	/* assume the worst */ - -	return (!r && !is_dirty(cache, cblock)) ? 2 : 1; -} -  static int cache_map(struct dm_target *ti, struct bio *bio)  {  	struct cache *cache = ti->private; @@ -2097,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)  		inc_hit_counter(cache, bio);  		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -		if (is_writethrough_io(cache, bio, lookup_result.cblock)) { -			/* -			 * No need to mark anything dirty in write through mode. -			 */ -			pb->req_nr == 0 ? 
-				remap_to_cache(cache, bio, lookup_result.cblock) : -				remap_to_origin_clear_discard(cache, bio, block); -			cell_defer(cache, cell, false); -		} else { +		if (is_writethrough_io(cache, bio, lookup_result.cblock)) +			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); +		else  			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); -			cell_defer(cache, cell, false); -		} + +		cell_defer(cache, cell, false);  		break;  	case POLICY_MISS: @@ -2319,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti)  	}  	if (!cache->loaded_mappings) { -		r = dm_cache_load_mappings(cache->cmd, -					   dm_cache_policy_get_name(cache->policy), +		r = dm_cache_load_mappings(cache->cmd, cache->policy,  					   load_mapping, cache);  		if (r) {  			DMERR("could not load cache mappings"); @@ -2535,7 +2584,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)  static struct target_type cache_target = {  	.name = "cache", -	.version = {1, 0, 0}, +	.version = {1, 1, 0},  	.module = THIS_MODULE,  	.ctr = cache_ctr,  	.dtr = cache_dtr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a01d1e4c78..311e3d35b27 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -91,15 +91,44 @@ static struct raid_type {  	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}  }; +static char *raid10_md_layout_to_format(int layout) +{ +	/* +	 * Bit 16 and 17 stand for "offset" and "use_far_sets" +	 * Refer to MD's raid10.c for details +	 */ +	if ((layout & 0x10000) && (layout & 0x20000)) +		return "offset"; + +	if ((layout & 0xFF) > 1) +		return "near"; + +	return "far"; +} +  static unsigned raid10_md_layout_to_copies(int layout)  { -	return layout & 0xFF; +	if ((layout & 0xFF) > 1) +		return layout & 0xFF; +	return (layout >> 8) & 0xFF;  }  static int raid10_format_to_md_layout(char *format, unsigned copies)  { -	/* 1 "far" copy, and 'copies' "near" copies */ -	return (1 << 8) | (copies & 0xFF); +	unsigned n = 1, f = 1; + +	if (!strcmp("near", format)) +		n = copies; +	else +		f = copies; + +	if (!strcmp("offset", format)) +		return 0x30000 | (f << 8) | n; + +	if (!strcmp("far", format)) +		return 0x20000 | (f << 8) | n; + +	return (f << 8) | n;  }  static struct raid_type *get_raid_type(char *name) @@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)  {  	unsigned i, rebuild_cnt = 0;  	unsigned rebuilds_per_group, copies, d; +	unsigned group_size, last_group_start;  	for (i = 0; i < rs->md.raid_disks; i++)  		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || @@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)  		 * as long as the failed devices occur in different mirror  		 * groups (i.e. different stripes).  		 * -		 * Right now, we only allow for "near" copies.  When other -		 * formats are added, we will have to check those too. -		 *  		 * When checking "near" format, make sure no adjacent devices  		 * have failed beyond what can be handled.  
In addition to the  		 * simple case where the number of devices is a multiple of the @@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)  		 *          A    A    B    B    C  		 *          C    D    D    E    E  		 */ -		for (i = 0; i < rs->md.raid_disks * copies; i++) { -			if (!(i % copies)) +		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { +			for (i = 0; i < rs->md.raid_disks * copies; i++) { +				if (!(i % copies)) +					rebuilds_per_group = 0; +				d = i % rs->md.raid_disks; +				if ((!rs->dev[d].rdev.sb_page || +				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) && +				    (++rebuilds_per_group >= copies)) +					goto too_many; +			} +			break; +		} + +		/* +		 * When checking "far" and "offset" formats, we need to ensure +		 * that the device that holds its copy is not also dead or +		 * being rebuilt.  (Note that "far" and "offset" formats only +		 * support two copies right now.  These formats also only ever +		 * use the 'use_far_sets' variant.) +		 * +		 * This check is somewhat complicated by the need to account +		 * for arrays that are not a multiple of (far) copies.  This +		 * results in the need to treat the last (potentially larger) +		 * set differently. +		 */ +		group_size = (rs->md.raid_disks / copies); +		last_group_start = (rs->md.raid_disks / group_size) - 1; +		last_group_start *= group_size; +		for (i = 0; i < rs->md.raid_disks; i++) { +			if (!(i % copies) && !(i > last_group_start))  				rebuilds_per_group = 0; -			d = i % rs->md.raid_disks; -			if ((!rs->dev[d].rdev.sb_page || -			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) && +			if ((!rs->dev[i].rdev.sb_page || +			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&  			    (++rebuilds_per_group >= copies)) -				goto too_many; +					goto too_many;  		}  		break;  	default: @@ -433,7 +487,7 @@ too_many:   *   * RAID10-only options:   *    [raid10_copies <# copies>]        Number of copies.  (Default: 2) - *    [raid10_format <near>]            Layout algorithm.  (Default: near) + *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)   */  static int parse_raid_params(struct raid_set *rs, char **argv,  			     unsigned num_raid_params) @@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,  				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";  				return -EINVAL;  			} -			if (strcmp("near", argv[i])) { +			if (strcmp("near", argv[i]) && +			    strcmp("far", argv[i]) && +			    strcmp("offset", argv[i])) {  				rs->ti->error = "Invalid 'raid10_format' value given";  				return -EINVAL;  			} @@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,  			return -EINVAL;  		} +		/* +		 * If the format is not "near", we only support +		 * two copies at the moment. 
+		 */ +		if (strcmp("near", raid10_format) && (raid10_copies > 2)) { +			rs->ti->error = "Too many copies for given RAID10 format."; +			return -EINVAL; +		} +  		/* (Len * #mirrors) / #devices */  		sectors_per_dev = rs->ti->len * raid10_copies;  		sector_div(sectors_per_dev, rs->md.raid_disks); @@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)  	/*  	 * Reshaping is not currently allowed  	 */ -	if ((le32_to_cpu(sb->level) != mddev->level) || -	    (le32_to_cpu(sb->layout) != mddev->layout) || -	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { -		DMERR("Reshaping arrays not yet supported."); +	if (le32_to_cpu(sb->level) != mddev->level) { +		DMERR("Reshaping arrays not yet supported. (RAID level change)"); +		return -EINVAL; +	} +	if (le32_to_cpu(sb->layout) != mddev->layout) { +		DMERR("Reshaping arrays not yet supported. (RAID layout change)"); +		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); +		DMERR("  Old layout: %s w/ %d copies", +		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)), +		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); +		DMERR("  New layout: %s w/ %d copies", +		      raid10_md_layout_to_format(mddev->layout), +		      raid10_md_layout_to_copies(mddev->layout)); +		return -EINVAL; +	} +	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { +		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");  		return -EINVAL;  	}  	/* We can only change the number of devices in RAID1 right now */  	if ((rs->raid_type->level != 1) &&  	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { -		DMERR("Reshaping arrays not yet supported."); +		DMERR("Reshaping arrays not yet supported. (device count change)");  		return -EINVAL;  	} @@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,  			       raid10_md_layout_to_copies(rs->md.layout));  		if (rs->print_flags & DMPF_RAID10_FORMAT) -			DMEMIT(" raid10_format near"); +			DMEMIT(" raid10_format %s", +			       raid10_md_layout_to_format(rs->md.layout));  		DMEMIT(" %d", rs->md.raid_disks);  		for (i = 0; i < rs->md.raid_disks; i++) { @@ -1418,6 +1497,10 @@ static struct target_type raid_target = {  static int __init dm_raid_init(void)  { +	DMINFO("Loading target version %u.%u.%u", +	       raid_target.version[0], +	       raid_target.version[1], +	       raid_target.version[2]);  	return dm_register_target(&raid_target);  } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 009339d6282..004ad1652b7 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt)  	return q && blk_queue_discard(q);  } +static bool is_factor(sector_t block_size, uint32_t n) +{ +	return !sector_div(block_size, n); +} +  /*   * If discard_passdown was enabled verify that the data device   * supports discards.  Disable discard_passdown if not. 
@@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)  	else if (data_limits->discard_granularity > block_size)  		reason = "discard granularity larger than a block"; -	else if (block_size & (data_limits->discard_granularity - 1)) +	else if (!is_factor(block_size, data_limits->discard_granularity))  		reason = "discard granularity not a factor of block size";  	if (reason) { @@ -2544,7 +2549,7 @@ static struct target_type pool_target = {  	.name = "thin-pool",  	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |  		    DM_TARGET_IMMUTABLE, -	.version = {1, 6, 1}, +	.version = {1, 7, 0},  	.module = THIS_MODULE,  	.ctr = pool_ctr,  	.dtr = pool_dtr, @@ -2831,7 +2836,7 @@ static int thin_iterate_devices(struct dm_target *ti,  static struct target_type thin_target = {  	.name = "thin", -	.version = {1, 7, 1}, +	.version = {1, 8, 0},  	.module	= THIS_MODULE,  	.ctr = thin_ctr,  	.dtr = thin_dtr, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3..a746f1d21c6 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -93,6 +93,13 @@ struct dm_verity_io {  	 */  }; +struct dm_verity_prefetch_work { +	struct work_struct work; +	struct dm_verity *v; +	sector_t block; +	unsigned n_blocks; +}; +  static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)  {  	return (struct shash_desc *)(io + 1); @@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error)   * The root buffer is not prefetched, it is assumed that it will be cached   * all the time.   */ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +static void verity_prefetch_io(struct work_struct *work)  { +	struct dm_verity_prefetch_work *pw = +		container_of(work, struct dm_verity_prefetch_work, work); +	struct dm_verity *v = pw->v;  	int i;  	for (i = v->levels - 2; i >= 0; i--) {  		sector_t hash_block_start;  		sector_t hash_block_end; -		verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); -		verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); +		verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); +		verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);  		if (!i) {  			unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); @@ -452,6 +462,25 @@ no_prefetch_cluster:  		dm_bufio_prefetch(v->bufio, hash_block_start,  				  hash_block_end - hash_block_start + 1);  	} + +	kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ +	struct dm_verity_prefetch_work *pw; + +	pw = kmalloc(sizeof(struct dm_verity_prefetch_work), +		GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + +	if (!pw) +		return; + +	INIT_WORK(&pw->work, verity_prefetch_io); +	pw->v = v; +	pw->block = io->block; +	pw->n_blocks = io->n_blocks; +	queue_work(v->verify_wq, &pw->work);  }  /* @@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)  	memcpy(io->io_vec, bio_iovec(bio),  	       io->io_vec_size * sizeof(struct bio_vec)); -	verity_prefetch_io(v, io); +	verity_submit_prefetch(v, io);  	generic_make_request(bio); @@ -858,7 +887,7 @@ bad:  static struct target_type verity_target = {  	.name		= "verity", -	.version	= {1, 1, 1}, +	.version	= {1, 2, 0},  	.module		= THIS_MODULE,  	.ctr		= verity_ctr,  	.dtr		= verity_dtr, diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db3d1b271f..aeceedfc530 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,6 
+307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)  		bio_io_error(bio);  		return;  	} +	if (mddev->ro == 1 && unlikely(rw == WRITE)) { +		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); +		return; +	}  	smp_rmb(); /* Ensure implications of  'active' are visible */  	rcu_read_lock();  	if (mddev->suspended) { @@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)  		} else if (!sectors)  			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -  				rdev->data_offset; +		if (!my_mddev->pers->resize) +			/* Cannot change size for RAID0 or Linear etc */ +			return -EINVAL;  	}  	if (sectors < my_mddev->dev_sectors)  		return -EINVAL; /* component must fit device */ @@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,  			mddev->ro = 0;  			sysfs_notify_dirent_safe(mddev->sysfs_state);  			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -			md_wakeup_thread(mddev->thread); +			/* mddev_unlock will wake thread */ +			/* If a device failed while we were read-only, we +			 * need to make sure the metadata is updated now. +			 */ +			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { +				mddev_unlock(mddev); +				wait_event(mddev->sb_wait, +					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) && +					   !test_bit(MD_CHANGE_PENDING, &mddev->flags)); +				mddev_lock(mddev); +			}  		} else {  			err = -EROFS;  			goto abort_unlock; @@ -7646,10 +7663,8 @@ static int remove_and_add_spares(struct mddev *mddev)  				removed++;  			}  		} -	if (removed) -		sysfs_notify(&mddev->kobj, NULL, -			     "degraded"); - +	if (removed && mddev->kobj.sd) +		sysfs_notify(&mddev->kobj, NULL, "degraded");  	rdev_for_each(rdev, mddev) {  		if (rdev->raid_disk >= 0 && diff --git a/drivers/md/md.h b/drivers/md/md.h index eca59c3074e..d90fb1a879e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev)  static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)  {  	char nm[20]; -	if (!test_bit(Replacement, &rdev->flags)) { +	if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {  		sprintf(nm, "rd%d", rdev->raid_disk);  		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);  	} else @@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)  static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)  {  	char nm[20]; -	if (!test_bit(Replacement, &rdev->flags)) { +	if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {  		sprintf(nm, "rd%d", rdev->raid_disk);  		sysfs_remove_link(&mddev->kobj, nm);  	} diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index c4f28133ef8..b88757cd0d1 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -139,15 +139,8 @@ struct child {  	struct btree_node *n;  }; -static struct dm_btree_value_type le64_type = { -	.context = NULL, -	.size = sizeof(__le64), -	.inc = NULL, -	.dec = NULL, -	.equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct btree_node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, +		      struct btree_node *parent,  		      unsigned index, struct child *result)  {  	int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent,  	result->n = dm_block_data(result->block);  	if (inc) -		
inc_children(info->tm, result->n, &le64_type); +		inc_children(info->tm, result->n, vt);  	*((__le64 *) value_ptr(parent, index)) =  		cpu_to_le64(dm_block_location(result->block)); @@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,  }  static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, -		      unsigned left_index) +		      struct dm_btree_value_type *vt, unsigned left_index)  {  	int r;  	struct btree_node *parent; @@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,  	parent = dm_block_data(shadow_current(s)); -	r = init_child(info, parent, left_index, &left); +	r = init_child(info, vt, parent, left_index, &left);  	if (r)  		return r; -	r = init_child(info, parent, left_index + 1, &right); +	r = init_child(info, vt, parent, left_index + 1, &right);  	if (r) {  		exit_child(info, &left);  		return r; @@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,  }  static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, -		      unsigned left_index) +		      struct dm_btree_value_type *vt, unsigned left_index)  {  	int r;  	struct btree_node *parent = dm_block_data(shadow_current(s)); @@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,  	/*  	 * FIXME: fill out an array?  	 */ -	r = init_child(info, parent, left_index, &left); +	r = init_child(info, vt, parent, left_index, &left);  	if (r)  		return r; -	r = init_child(info, parent, left_index + 1, ¢er); +	r = init_child(info, vt, parent, left_index + 1, ¢er);  	if (r) {  		exit_child(info, &left);  		return r;  	} -	r = init_child(info, parent, left_index + 2, &right); +	r = init_child(info, vt, parent, left_index + 2, &right);  	if (r) {  		exit_child(info, &left);  		exit_child(info, ¢er); @@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm,  }  static int rebalance_children(struct shadow_spine *s, -			      struct dm_btree_info *info, uint64_t key) +			      struct dm_btree_info *info, +			      struct dm_btree_value_type *vt, uint64_t key)  {  	int i, r, has_left_sibling, has_right_sibling;  	uint32_t child_entries; @@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s,  	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);  	if (!has_left_sibling) -		r = rebalance2(s, info, i); +		r = rebalance2(s, info, vt, i);  	else if (!has_right_sibling) -		r = rebalance2(s, info, i - 1); +		r = rebalance2(s, info, vt, i - 1);  	else -		r = rebalance3(s, info, i - 1); +		r = rebalance3(s, info, vt, i - 1);  	return r;  } @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,  		if (le32_to_cpu(n->header.flags) & LEAF_NODE)  			return do_leaf(n, key, index); -		r = rebalance_children(s, info, key); +		r = rebalance_children(s, info, vt, key);  		if (r)  			break; @@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,  	return r;  } +static struct dm_btree_value_type le64_type = { +	.context = NULL, +	.size = sizeof(__le64), +	.inc = NULL, +	.dec = NULL, +	.equal = NULL +}; +  int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,  		    uint64_t *keys, dm_block_t *new_root)  { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 24b359717a7..0505452de8d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, 
struct r0conf **private_conf)  			rdev1->new_raid_disk = j;  		} -		if (j < 0 || j >= mddev->raid_disks) { +		if (j < 0) { +			printk(KERN_ERR +			       "md/raid0:%s: remove inactive devices before converting to RAID0\n", +			       mdname(mddev)); +			goto abort; +		} +		if (j >= mddev->raid_disks) {  			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "  			       "aborting!\n", mdname(mddev), j);  			goto abort; @@ -289,7 +295,7 @@ abort:  	kfree(conf->strip_zone);  	kfree(conf->devlist);  	kfree(conf); -	*private_conf = NULL; +	*private_conf = ERR_PTR(err);  	return err;  } @@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks  		  "%s does not support generic reshape\n", __func__);  	rdev_for_each(rdev, mddev) -		array_sectors += rdev->sectors; +		array_sectors += (rdev->sectors & +				  ~(sector_t)(mddev->chunk_sectors-1));  	return array_sectors;  } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010..fd86b372692 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)  		bio_list_merge(&conf->pending_bio_list, &plug->pending);  		conf->pending_count += plug->pending_cnt;  		spin_unlock_irq(&conf->device_lock); +		wake_up(&conf->wait_barrier);  		md_wakeup_thread(mddev->thread);  		kfree(plug);  		return; @@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)  	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));  	const unsigned long do_discard = (bio->bi_rw  					  & (REQ_DISCARD | REQ_SECURE)); +	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);  	struct md_rdev *blocked_rdev;  	struct blk_plug_cb *cb;  	struct raid1_plug_cb *plug = NULL; @@ -1301,7 +1303,8 @@ read_again:  				   conf->mirrors[i].rdev->data_offset);  		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;  		mbio->bi_end_io	= raid1_end_write_request; -		mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; +		mbio->bi_rw = +			WRITE | do_flush_fua | do_sync | do_discard | do_same;  		mbio->bi_private = r1_bio;  		atomic_inc(&r1_bio->remaining); @@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)  	if (IS_ERR(conf))  		return PTR_ERR(conf); +	if (mddev->queue) +		blk_queue_max_write_same_sectors(mddev->queue, +						 mddev->chunk_sectors);  	rdev_for_each(rdev, mddev) {  		if (!mddev->gendisk)  			continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03..77b562d18a9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -38,21 +38,36 @@   *    near_copies (stored in low byte of layout)   *    far_copies (stored in second byte of layout)   *    far_offset (stored in bit 16 of layout ) + *    use_far_sets (stored in bit 17 of layout )   * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize.  Each device + * is divided into far_copies sections.   
In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive).  The starting device for each section is offset + * near_copies from the starting device of the previous section.  Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive.  near_copies and far_copies must be at least one, and their product + * is at most raid_disks.   *   * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true.  In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size.  The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array.  This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + *    A B C D    A B C D E + *      ...         ... + *    D A B C    E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + *    [A B] [C D]    [A B] [C D E] + *    |...| |...|    |...| | ... | + *    [B A] [D C]    [B A] [E C D]   */  /* @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)  	sector_t stripe;  	int dev;  	int slot = 0; +	int last_far_set_start, last_far_set_size; + +	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; +	last_far_set_start *= geo->far_set_size; + +	last_far_set_size = geo->far_set_size; +	last_far_set_size += (geo->raid_disks % geo->far_set_size);  	/* now calculate first sector/dev */  	chunk = r10bio->sector >> geo->chunk_shift; @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)  	/* and calculate all the others */  	for (n = 0; n < geo->near_copies; n++) {  		int d = dev; +		int set;  		sector_t s = sector; -		r10bio->devs[slot].addr = sector;  		r10bio->devs[slot].devnum = d; +		r10bio->devs[slot].addr = s;  		slot++;  		for (f = 1; f < geo->far_copies; f++) { +			set = d / geo->far_set_size;  			d += geo->near_copies; -			if (d >= geo->raid_disks) -				d -= geo->raid_disks; + +			if ((geo->raid_disks % geo->far_set_size) && +			    (d > last_far_set_start)) { +				d -= last_far_set_start; +				d %= last_far_set_size; +				d += last_far_set_start; +			} else { +				d %= geo->far_set_size; +				d += geo->far_set_size * set; +			}  			s += geo->stride;  			r10bio->devs[slot].devnum = d;  			r10bio->devs[slot].addr = s; @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)  	 * or recovery, so reshape isn't happening  	 */  	struct geom *geo = &conf->geo; +	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; +	int far_set_size = geo->far_set_size; +	int last_far_set_start; + +	if (geo->raid_disks % geo->far_set_size) { +		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; +		last_far_set_start *= geo->far_set_size; + +		if (dev >= last_far_set_start) { +			far_set_size = geo->far_set_size; +			far_set_size += 
(geo->raid_disks % geo->far_set_size); +			far_set_start = last_far_set_start; +		} +	}  	offset = sector & geo->chunk_mask;  	if (geo->far_offset) { @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)  		chunk = sector >> geo->chunk_shift;  		fc = sector_div(chunk, geo->far_copies);  		dev -= fc * geo->near_copies; -		if (dev < 0) -			dev += geo->raid_disks; +		if (dev < far_set_start) +			dev += far_set_size;  	} else {  		while (sector >= geo->stride) {  			sector -= geo->stride; -			if (dev < geo->near_copies) -				dev += geo->raid_disks - geo->near_copies; +			if (dev < (geo->near_copies + far_set_start)) +				dev += far_set_size - geo->near_copies;  			else  				dev -= geo->near_copies;  		} @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)  		bio_list_merge(&conf->pending_bio_list, &plug->pending);  		conf->pending_count += plug->pending_cnt;  		spin_unlock_irq(&conf->device_lock); +		wake_up(&conf->wait_barrier);  		md_wakeup_thread(mddev->thread);  		kfree(plug);  		return; @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)  	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);  	const unsigned long do_discard = (bio->bi_rw  					  & (REQ_DISCARD | REQ_SECURE)); +	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);  	unsigned long flags;  	struct md_rdev *blocked_rdev;  	struct blk_plug_cb *cb; @@ -1460,7 +1508,8 @@ retry_write:  							      rdev));  			mbio->bi_bdev = rdev->bdev;  			mbio->bi_end_io	= raid10_end_write_request; -			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; +			mbio->bi_rw = +				WRITE | do_sync | do_fua | do_discard | do_same;  			mbio->bi_private = r10_bio;  			atomic_inc(&r10_bio->remaining); @@ -1502,7 +1551,8 @@ retry_write:  						   r10_bio, rdev));  			mbio->bi_bdev = rdev->bdev;  			mbio->bi_end_io	= raid10_end_write_request; -			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; +			mbio->bi_rw = +				WRITE | do_sync | do_fua | do_discard | do_same;  			mbio->bi_private = r10_bio;  			atomic_inc(&r10_bio->remaining); @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)  		disks = mddev->raid_disks + mddev->delta_disks;  		break;  	} -	if (layout >> 17) +	if (layout >> 18)  		return -1;  	if (chunk < (PAGE_SIZE >> 9) ||  	    !is_power_of_2(chunk)) @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)  	geo->near_copies = nc;  	geo->far_copies = fc;  	geo->far_offset = fo; +	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;  	geo->chunk_mask = chunk - 1;  	geo->chunk_shift = ffz(~chunk);  	return nc*fc; @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)  	if (mddev->queue) {  		blk_queue_max_discard_sectors(mddev->queue,  					      mddev->chunk_sectors); +		blk_queue_max_write_same_sectors(mddev->queue, +						 mddev->chunk_sectors);  		blk_queue_io_min(mddev->queue, chunk_size);  		if (conf->geo.raid_disks % conf->geo.near_copies)  			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1054cf60234..157d69e83ff 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,6 +33,11 @@ struct r10conf {  					       * far_offset, in which case it is  					       * 1 stripe.  					       
*/ +		int             far_set_size; /* The number of devices in a set, +					       * where a 'set' are devices that +					       * contain far/offset copies of +					       * each other. +					       */  		int		chunk_shift; /* shift from chunks to sectors */  		sector_t	chunk_mask;  	} prev, geo; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5af2d270908..24909eb13fe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -671,9 +671,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)  			bi->bi_next = NULL;  			if (rrdev)  				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); -			trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), -					      bi, disk_devt(conf->mddev->gendisk), -					      sh->dev[i].sector); + +			if (conf->mddev->gendisk) +				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), +						      bi, disk_devt(conf->mddev->gendisk), +						      sh->dev[i].sector);  			generic_make_request(bi);  		}  		if (rrdev) { @@ -701,9 +703,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)  			rbi->bi_io_vec[0].bv_offset = 0;  			rbi->bi_size = STRIPE_SIZE;  			rbi->bi_next = NULL; -			trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), -					      rbi, disk_devt(conf->mddev->gendisk), -					      sh->dev[i].sector); +			if (conf->mddev->gendisk) +				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), +						      rbi, disk_devt(conf->mddev->gendisk), +						      sh->dev[i].sector);  			generic_make_request(rbi);  		}  		if (!rdev && !rrdev) { @@ -1403,7 +1406,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu  			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);  } -static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)  {  	int overlap_clear = 0, i, disks = sh->disks;  	struct dma_async_tx_descriptor *tx = NULL; @@ -1468,36 +1471,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)  	put_cpu();  } -#ifdef CONFIG_MULTICORE_RAID456 -static void async_run_ops(void *param, async_cookie_t cookie) -{ -	struct stripe_head *sh = param; -	unsigned long ops_request = sh->ops.request; - -	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); -	wake_up(&sh->ops.wait_for_ops); - -	__raid_run_ops(sh, ops_request); -	release_stripe(sh); -} - -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) -{ -	/* since handle_stripe can be called outside of raid5d context -	 * we need to ensure sh->ops.request is de-staged before another -	 * request arrives -	 */ -	wait_event(sh->ops.wait_for_ops, -		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); -	sh->ops.request = ops_request; - -	atomic_inc(&sh->count); -	async_schedule(async_run_ops, sh); -} -#else -#define raid_run_ops __raid_run_ops -#endif -  static int grow_one_stripe(struct r5conf *conf)  {  	struct stripe_head *sh; @@ -1506,9 +1479,6 @@ static int grow_one_stripe(struct r5conf *conf)  		return 0;  	sh->raid_conf = conf; -	#ifdef CONFIG_MULTICORE_RAID456 -	init_waitqueue_head(&sh->ops.wait_for_ops); -	#endif  	spin_lock_init(&sh->stripe_lock); @@ -1627,9 +1597,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)  			break;  		nsh->raid_conf = conf; -		#ifdef CONFIG_MULTICORE_RAID456 -		init_waitqueue_head(&nsh->ops.wait_for_ops); -		#endif  		spin_lock_init(&nsh->stripe_lock);  		list_add(&nsh->lru, &newstripes); @@ -2316,17 +2283,6 @@ 
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,  	int level = conf->level;  	if (rcw) { -		/* if we are not expanding this is a proper write request, and -		 * there will be bios with new data to be drained into the -		 * stripe cache -		 */ -		if (!expand) { -			sh->reconstruct_state = reconstruct_state_drain_run; -			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); -		} else -			sh->reconstruct_state = reconstruct_state_run; - -		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);  		for (i = disks; i--; ) {  			struct r5dev *dev = &sh->dev[i]; @@ -2339,6 +2295,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,  				s->locked++;  			}  		} +		/* if we are not expanding this is a proper write request, and +		 * there will be bios with new data to be drained into the +		 * stripe cache +		 */ +		if (!expand) { +			if (!s->locked) +				/* False alarm, nothing to do */ +				return; +			sh->reconstruct_state = reconstruct_state_drain_run; +			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); +		} else +			sh->reconstruct_state = reconstruct_state_run; + +		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); +  		if (s->locked + conf->max_degraded == disks)  			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))  				atomic_inc(&conf->pending_full_writes); @@ -2347,11 +2318,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,  		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||  			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); -		sh->reconstruct_state = reconstruct_state_prexor_drain_run; -		set_bit(STRIPE_OP_PREXOR, &s->ops_request); -		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); -		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); -  		for (i = disks; i--; ) {  			struct r5dev *dev = &sh->dev[i];  			if (i == pd_idx) @@ -2366,6 +2332,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,  				s->locked++;  			}  		} +		if (!s->locked) +			/* False alarm - nothing to do */ +			return; +		sh->reconstruct_state = reconstruct_state_prexor_drain_run; +		set_bit(STRIPE_OP_PREXOR, &s->ops_request); +		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); +		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);  	}  	/* keep the parity disk(s) locked while asynchronous operations @@ -2600,6 +2573,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,  	int i;  	clear_bit(STRIPE_SYNCING, &sh->state); +	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) +		wake_up(&conf->wait_for_overlap);  	s->syncing = 0;  	s->replacing = 0;  	/* There is nothing more to do for sync/check/repair. 
@@ -2773,6 +2748,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 {
 	int i;
 	struct r5dev *dev;
+	int discard_pending = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -2801,9 +2777,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
-			}
-		} else if (test_bit(R5_Discard, &sh->dev[i].flags))
-			clear_bit(R5_Discard, &sh->dev[i].flags);
+			} else if (test_bit(R5_Discard, &dev->flags))
+				discard_pending = 1;
+		}
+	if (!discard_pending &&
+	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+		if (sh->qd_idx >= 0) {
+			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+		}
+		/* now that discard is done we can proceed with any sync */
+		clear_bit(STRIPE_DISCARD, &sh->state);
+		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+			set_bit(STRIPE_HANDLE, &sh->state);
+
+	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2862,8 +2852,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	set_bit(STRIPE_HANDLE, &sh->state);
 	if (rmw < rcw && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
-		blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
-				  (unsigned long long)sh->sector, rmw);
+		if (conf->mddev->queue)
+			blk_add_trace_msg(conf->mddev->queue,
+					  "raid5 rmw %llu %d",
+					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
@@ -2913,7 +2905,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 				}
 			}
 		}
-		if (rcw)
+		if (rcw && conf->mddev->queue)
 			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
 					  (unsigned long long)sh->sector,
 					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3453,9 +3445,15 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		spin_lock(&sh->stripe_lock);
+		/* Cannot process 'sync' concurrently with 'discard' */
+		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+			set_bit(STRIPE_SYNCING, &sh->state);
+			clear_bit(STRIPE_INSYNC, &sh->state);
+		}
+		spin_unlock(&sh->stripe_lock);
 	}
 	clear_bit(STRIPE_DELAYED, &sh->state);
@@ -3615,6 +3613,8 @@ static void handle_stripe(struct stripe_head *sh)
 	    test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+			wake_up(&conf->wait_for_overlap);
 	}
 
 	/* If the failed drives are just a ReadError, then we might need
@@ -4018,9 +4018,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		atomic_inc(&conf->active_aligned_reads);
 		spin_unlock_irq(&conf->device_lock);
-		trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
-				      align_bi, disk_devt(mddev->gendisk),
-				      raid_bio->bi_sector);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+					      align_bi, disk_devt(mddev->gendisk),
+					      raid_bio->bi_sector);
 		generic_make_request(align_bi);
 		return 1;
 	} else {
@@ -4114,7 +4115,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 		}
 		spin_unlock_irq(&conf->device_lock);
 	}
-	trace_block_unplug(mddev->queue, cnt, !from_schedule);
+	if (mddev->queue)
+		trace_block_unplug(mddev->queue, cnt, !from_schedule);
 	kfree(cb);
 }
@@ -4177,6 +4179,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
+		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+		if (test_bit(STRIPE_SYNCING, &sh->state)) {
+			release_stripe(sh);
+			schedule();
+			goto again;
+		}
+		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		spin_lock_irq(&sh->stripe_lock);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4189,6 +4198,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				goto again;
 			}
 		}
+		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1f..b0b663b119a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,10 +221,6 @@ struct stripe_head {
 	struct stripe_operations {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
-		#ifdef CONFIG_MULTICORE_RAID456
-		unsigned long	     request;
-		wait_queue_head_t    wait_for_ops;
-		#endif
 	} ops;
 	struct r5dev {
 		/* rreq and rvec are used for the replacement device when
@@ -323,6 +319,7 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_DISCARD,
 };
 
 /*
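Taken together, the make_discard_request(), handle_stripe() and handle_stripe_clean_event() hunks, plus the new STRIPE_DISCARD flag in raid5.h, make discard and resync mutually exclusive on a stripe: a discard backs off while STRIPE_SYNCING is set, a sync is not started while STRIPE_DISCARD is set, and the decision is serialised on the stripe lock. The fragment below is a rough user-space illustration of that hand-off; the demo_* names and the pthread mutex standing in for stripe_lock are assumptions made for the sketch, not the driver's code.

/* Illustrative sketch: mutual exclusion between 'sync' and 'discard' on one
 * stripe, with a pthread mutex playing the role of sh->stripe_lock. */
#include <pthread.h>
#include <stdbool.h>

struct demo_stripe {
	pthread_mutex_t lock;      /* stands in for sh->stripe_lock */
	bool            syncing;   /* STRIPE_SYNCING                */
	bool            discard;   /* STRIPE_DISCARD                */
};

static struct demo_stripe stripe = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
};

/* handle_stripe() path: only start a sync if no discard is in flight. */
static bool demo_try_start_sync(struct demo_stripe *sh)
{
	bool started = false;

	pthread_mutex_lock(&sh->lock);
	if (!sh->discard) {
		sh->syncing = true;
		started = true;
	}
	pthread_mutex_unlock(&sh->lock);
	return started;
}

/* make_discard_request() path: back off (caller retries) while a sync runs. */
static bool demo_try_start_discard(struct demo_stripe *sh)
{
	bool started = false;

	pthread_mutex_lock(&sh->lock);
	if (!sh->syncing) {
		sh->discard = true;
		started = true;
	}
	pthread_mutex_unlock(&sh->lock);
	return started;
}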