summaryrefslogtreecommitdiff
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-04-01 17:08:13 -0700
committerTejun Heo <tj@kernel.org>2013-04-01 18:45:36 -0700
commit229641a6f1f09e27a1f12fba38980f33f4c92975 (patch)
tree234a6f8aea0910de3242af0bbe6d7494fcf81847 /drivers/md/raid10.c
parentd55262c4d164759a8debe772da6c9b16059dec47 (diff)
parent07961ac7c0ee8b546658717034fe692fd12eefa9 (diff)
downloadolio-linux-3.10-229641a6f1f09e27a1f12fba38980f33f4c92975.tar.xz
olio-linux-3.10-229641a6f1f09e27a1f12fba38980f33f4c92975.zip
Merge tag 'v3.9-rc5' into wq/for-3.10
Writeback conversion to workqueue will be based on top of wq/for-3.10 branch to take advantage of custom attrs and NUMA support for unbound workqueues. Mainline currently contains two commits which result in non-trivial merge conflicts with wq/for-3.10 and because block/for-3.10/core is based on v3.9-rc3 which contains one of the conflicting commits, we need a pre-merge-window merge anyway. Let's pull v3.9-rc5 into wq/for-3.10 so that the block tree doesn't suffer from workqueue merge conflicts. The two conflicts and their resolutions: * e68035fb65 ("workqueue: convert to idr_alloc()") in mainline changes worker_pool_assign_id() to use idr_alloc() instead of the old idr interface. worker_pool_assign_id() goes through multiple locking changes in wq/for-3.10 causing the following conflict. static int worker_pool_assign_id(struct worker_pool *pool) { int ret; <<<<<<< HEAD lockdep_assert_held(&wq_pool_mutex); do { if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) return -ENOMEM; ret = idr_get_new(&worker_pool_idr, pool, &pool->id); } while (ret == -EAGAIN); ======= mutex_lock(&worker_pool_idr_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); if (ret >= 0) pool->id = ret; mutex_unlock(&worker_pool_idr_mutex); >>>>>>> c67bf5361e7e66a0ff1f4caf95f89347d55dfb89 return ret < 0 ? ret : 0; } We want locking from the former and idr_alloc() usage from the latter, which can be combined to the following. static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } * eb2834285c ("workqueue: fix possible pool stall bug in wq_unbind_fn()") updated wq_unbind_fn() such that it has single larger for_each_std_worker_pool() loop instead of two separate loops with a schedule() call inbetween. wq/for-3.10 renamed pool->assoc_mutex to pool->manager_mutex causing the following conflict (earlier function body and comments omitted for brevity). static void wq_unbind_fn(struct work_struct *work) { ... spin_unlock_irq(&pool->lock); <<<<<<< HEAD mutex_unlock(&pool->manager_mutex); } ======= mutex_unlock(&pool->assoc_mutex); >>>>>>> c67bf5361e7e66a0ff1f4caf95f89347d55dfb89 schedule(); <<<<<<< HEAD for_each_cpu_worker_pool(pool, cpu) ======= >>>>>>> c67bf5361e7e66a0ff1f4caf95f89347d55dfb89 atomic_set(&pool->nr_running, 0); spin_lock_irq(&pool->lock); wake_up_worker(pool); spin_unlock_irq(&pool->lock); } } The resolution is mostly trivial. We want the control flow of the latter with the rename of the former. static void wq_unbind_fn(struct work_struct *work) { ... spin_unlock_irq(&pool->lock); mutex_unlock(&pool->manager_mutex); schedule(); atomic_set(&pool->nr_running, 0); spin_lock_irq(&pool->lock); wake_up_worker(pool); spin_unlock_irq(&pool->lock); } } Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--drivers/md/raid10.c97
1 files changed, 75 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03..77b562d18a9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
*
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive. near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ * A B C D A B C D E
+ * ... ...
+ * D A B C E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ * [A B] [C D] [A B] [C D E]
+ * |...| |...| |...| | ... |
+ * [B A] [D C] [B A] [E C D]
*/
/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
+ int set;
sector_t s = sector;
- r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
d += geo->near_copies;
- if (d >= geo->raid_disks)
- d -= geo->raid_disks;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
- if (dev < 0)
- dev += geo->raid_disks;
+ if (dev < far_set_start)
+ dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
- if (dev < geo->near_copies)
- dev += geo->raid_disks - geo->near_copies;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 17)
+ if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);