|  |  |  |
|---|---|---|
| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
| commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
| tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /drivers/md | |
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'drivers/md')
54 files changed, 28483 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig new file mode 100644 index 00000000000..ac43f98062f --- /dev/null +++ b/drivers/md/Kconfig @@ -0,0 +1,240 @@ +# +# Block device driver configuration +# + +menu "Multi-device support (RAID and LVM)" + +config MD +	bool "Multiple devices driver support (RAID and LVM)" +	help +	  Support multiple physical spindles through a single logical device. +	  Required for RAID and logical volume management. + +config BLK_DEV_MD +	tristate "RAID support" +	depends on MD +	---help--- +	  This driver lets you combine several hard disk partitions into one +	  logical block device. This can be used to simply append one +	  partition to another one or to combine several redundant hard disks +	  into a RAID1/4/5 device so as to provide protection against hard +	  disk failures. This is called "Software RAID" since the combining of +	  the partitions is done by the kernel. "Hardware RAID" means that the +	  combining is done by a dedicated controller; if you have such a +	  controller, you do not need to say Y here. + +	  More information about Software RAID on Linux is contained in the +	  Software RAID mini-HOWTO, available from +	  <http://www.tldp.org/docs.html#howto>. There you will also learn +	  where to get the supporting user space utilities raidtools. + +	  If unsure, say N. + +config MD_LINEAR +	tristate "Linear (append) mode" +	depends on BLK_DEV_MD +	---help--- +	  If you say Y here, then your multiple devices driver will be able to +	  use the so-called linear mode, i.e. it will combine the hard disk +	  partitions by simply appending one to the other. + +	  To compile this as a module, choose M here: the module +	  will be called linear. + +	  If unsure, say Y. + +config MD_RAID0 +	tristate "RAID-0 (striping) mode" +	depends on BLK_DEV_MD +	---help--- +	  If you say Y here, then your multiple devices driver will be able to +	  use the so-called raid0 mode, i.e. it will combine the hard disk +	  partitions into one logical device in such a fashion as to fill them +	  up evenly, one chunk here and one chunk there. This will increase +	  the throughput rate if the partitions reside on distinct disks. + +	  Information about Software RAID on Linux is contained in the +	  Software-RAID mini-HOWTO, available from +	  <http://www.tldp.org/docs.html#howto>. There you will also +	  learn where to get the supporting user space utilities raidtools. + +	  To compile this as a module, choose M here: the module +	  will be called raid0. + +	  If unsure, say Y. + +config MD_RAID1 +	tristate "RAID-1 (mirroring) mode" +	depends on BLK_DEV_MD +	---help--- +	  A RAID-1 set consists of several disk drives which are exact copies +	  of each other.  In the event of a mirror failure, the RAID driver +	  will continue to use the operational mirrors in the set, providing +	  an error free MD (multiple device) to the higher levels of the +	  kernel.  In a set with N drives, the available space is the capacity +	  of a single drive, and the set protects against a failure of (N - 1) +	  drives. + +	  Information about Software RAID on Linux is contained in the +	  Software-RAID mini-HOWTO, available from +	  <http://www.tldp.org/docs.html#howto>.  There you will also +	  learn where to get the supporting user space utilities raidtools. + +	  If you want to use such a RAID-1 set, say Y.  To compile this code +	  as a module, choose M here: the module will be called raid1. + +	  If unsure, say Y. 
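Concretely, the RAID-1 arithmetic in the help text above means that a mirror set built from four 200 MB drives still exposes only 200 MB of usable space, but it keeps serving data as long as any one copy survives, i.e. it tolerates up to three failed drives.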
+ +config MD_RAID10 +	tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" +	depends on BLK_DEV_MD && EXPERIMENTAL +	---help--- +	  RAID-10 provides a combination of striping (RAID-0) and +	  mirroring (RAID-1) with easier configuration and more flexable +	  layout. +	  Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to +	  be the same size (or at least, only as much as the smallest device +	  will be used). +	  RAID-10 provides a variety of layouts that provide different levels +	  of redundancy and performance. + +	  RAID-10 requires mdadm-1.7.0 or later, available at: + +	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ + +	  If unsure, say Y. + +config MD_RAID5 +	tristate "RAID-4/RAID-5 mode" +	depends on BLK_DEV_MD +	---help--- +	  A RAID-5 set of N drives with a capacity of C MB per drive provides +	  the capacity of C * (N - 1) MB, and protects against a failure +	  of a single drive. For a given sector (row) number, (N - 1) drives +	  contain data sectors, and one drive contains the parity protection. +	  For a RAID-4 set, the parity blocks are present on a single drive, +	  while a RAID-5 set distributes the parity across the drives in one +	  of the available parity distribution methods. + +	  Information about Software RAID on Linux is contained in the +	  Software-RAID mini-HOWTO, available from +	  <http://www.tldp.org/docs.html#howto>. There you will also +	  learn where to get the supporting user space utilities raidtools. + +	  If you want to use such a RAID-4/RAID-5 set, say Y.  To +	  compile this code as a module, choose M here: the module +	  will be called raid5. + +	  If unsure, say Y. + +config MD_RAID6 +	tristate "RAID-6 mode" +	depends on BLK_DEV_MD +	---help--- +	  A RAID-6 set of N drives with a capacity of C MB per drive +	  provides the capacity of C * (N - 2) MB, and protects +	  against a failure of any two drives. For a given sector +	  (row) number, (N - 2) drives contain data sectors, and two +	  drives contains two independent redundancy syndromes.  Like +	  RAID-5, RAID-6 distributes the syndromes across the drives +	  in one of the available parity distribution methods. + +	  RAID-6 requires mdadm-1.5.0 or later, available at: + +	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ + +	  If you want to use such a RAID-6 set, say Y.  To compile +	  this code as a module, choose M here: the module will be +	  called raid6. + +	  If unsure, say Y. + +config MD_MULTIPATH +	tristate "Multipath I/O support" +	depends on BLK_DEV_MD +	help +	  Multipath-IO is the ability of certain devices to address the same +	  physical disk over multiple 'IO paths'. The code ensures that such +	  paths can be defined and handled at runtime, and ensures that a +	  transparent failover to the backup path(s) happens if a IO errors +	  arrives on the primary path. + +	  If unsure, say N. + +config MD_FAULTY +	tristate "Faulty test module for MD" +	depends on BLK_DEV_MD +	help +	  The "faulty" module allows for a block device that occasionally returns +	  read or write errors.  It is useful for testing. + +	  In unsure, say N. + +config BLK_DEV_DM +	tristate "Device mapper support" +	depends on MD +	---help--- +	  Device-mapper is a low level volume manager.  It works by allowing +	  people to specify mappings for ranges of logical sectors.  Various +	  mapping types are available, in addition people may write their own +	  modules containing custom mappings if they wish. + +	  Higher level volume managers such as LVM2 use this driver. 
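Putting numbers on the RAID-4/5 and RAID-6 capacity rules above: six drives of 400 MB each yield (6 - 1) × 400 = 2000 MB of usable space under RAID-5 while surviving any single drive failure, and (6 - 2) × 400 = 1600 MB under RAID-6 while surviving any two drive failures.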
+ +	  To compile this as a module, choose M here: the module will be +	  called dm-mod. + +	  If unsure, say N. + +config DM_CRYPT +	tristate "Crypt target support" +	depends on BLK_DEV_DM && EXPERIMENTAL +	select CRYPTO +	---help--- +	  This device-mapper target allows you to create a device that +	  transparently encrypts the data on it. You'll need to activate +	  the ciphers you're going to use in the cryptoapi configuration. + +	  Information on how to use dm-crypt can be found on + +	  <http://www.saout.de/misc/dm-crypt/> + +	  To compile this code as a module, choose M here: the module will +	  be called dm-crypt. + +	  If unsure, say N. + +config DM_SNAPSHOT +       tristate "Snapshot target (EXPERIMENTAL)" +       depends on BLK_DEV_DM && EXPERIMENTAL +       ---help--- +         Allow volume managers to take writeable snapshots of a device. + +config DM_MIRROR +       tristate "Mirror target (EXPERIMENTAL)" +       depends on BLK_DEV_DM && EXPERIMENTAL +       ---help--- +         Allow volume managers to mirror logical volumes, also +         needed for live data migration tools such as 'pvmove'. + +config DM_ZERO +	tristate "Zero target (EXPERIMENTAL)" +	depends on BLK_DEV_DM && EXPERIMENTAL +	---help--- +	  A target that discards writes, and returns all zeroes for +	  reads.  Useful in some recovery situations. + +config DM_MULTIPATH +	tristate "Multipath target (EXPERIMENTAL)" +	depends on BLK_DEV_DM && EXPERIMENTAL +	---help--- +	  Allow volume managers to support multipath hardware. + +config DM_MULTIPATH_EMC +	tristate "EMC CX/AX multipath support (EXPERIMENTAL)" +	depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL +	---help--- +	  Multipath support for EMC CX/AX series hardware. + +endmenu + diff --git a/drivers/md/Makefile b/drivers/md/Makefile new file mode 100644 index 00000000000..90de9c146a5 --- /dev/null +++ b/drivers/md/Makefile @@ -0,0 +1,107 @@ +# +# Makefile for the kernel software RAID and LVM drivers. +# + +dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ +		   dm-ioctl.o dm-io.o kcopyd.o +dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o +dm-snapshot-objs := dm-snap.o dm-exception-store.o +dm-mirror-objs	:= dm-log.o dm-raid1.o +raid6-objs	:= raid6main.o raid6algos.o raid6recov.o raid6tables.o \ +		   raid6int1.o raid6int2.o raid6int4.o \ +		   raid6int8.o raid6int16.o raid6int32.o \ +		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \ +		   raid6altivec8.o \ +		   raid6mmx.o raid6sse1.o raid6sse2.o +hostprogs-y	:= mktables + +# Note: link order is important.  All raid personalities +# and xor.o must come before md.o, as they each initialise  +# themselves, and md.o may use the personalities when it  +# auto-initialised. 
+ +obj-$(CONFIG_MD_LINEAR)		+= linear.o +obj-$(CONFIG_MD_RAID0)		+= raid0.o +obj-$(CONFIG_MD_RAID1)		+= raid1.o +obj-$(CONFIG_MD_RAID10)		+= raid10.o +obj-$(CONFIG_MD_RAID5)		+= raid5.o xor.o +obj-$(CONFIG_MD_RAID6)		+= raid6.o xor.o +obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o +obj-$(CONFIG_MD_FAULTY)		+= faulty.o +obj-$(CONFIG_BLK_DEV_MD)	+= md.o +obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o +obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o +obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_EMC)	+= dm-emc.o +obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o +obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o +obj-$(CONFIG_DM_ZERO)		+= dm-zero.o + +quiet_cmd_unroll = UNROLL  $@ +      cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ +                   < $< > $@ || ( rm -f $@ && exit 1 ) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec -mabi=altivec +endif + +targets += raid6int1.c +$(obj)/raid6int1.c:   UNROLL := 1 +$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +targets += raid6int2.c +$(obj)/raid6int2.c:   UNROLL := 2 +$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +targets += raid6int4.c +$(obj)/raid6int4.c:   UNROLL := 4 +$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +targets += raid6int8.c +$(obj)/raid6int8.c:   UNROLL := 8 +$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +targets += raid6int16.c +$(obj)/raid6int16.c:  UNROLL := 16 +$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +targets += raid6int32.c +$(obj)/raid6int32.c:  UNROLL := 32 +$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +CFLAGS_raid6altivec1.o += $(altivec_flags) +targets += raid6altivec1.c +$(obj)/raid6altivec1.c:   UNROLL := 1 +$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +CFLAGS_raid6altivec2.o += $(altivec_flags) +targets += raid6altivec2.c +$(obj)/raid6altivec2.c:   UNROLL := 2 +$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +CFLAGS_raid6altivec4.o += $(altivec_flags) +targets += raid6altivec4.c +$(obj)/raid6altivec4.c:   UNROLL := 4 +$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +CFLAGS_raid6altivec8.o += $(altivec_flags) +targets += raid6altivec8.c +$(obj)/raid6altivec8.c:   UNROLL := 8 +$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +	$(call if_changed,unroll) + +quiet_cmd_mktable = TABLE   $@ +      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + +targets += raid6tables.c +$(obj)/raid6tables.c: $(obj)/mktables FORCE +	$(call if_changed,mktable) diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h new file mode 100644 index 00000000000..bc021e1fd4d --- /dev/null +++ b/drivers/md/dm-bio-list.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2004 Red Hat UK Ltd. + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_BIO_LIST_H +#define DM_BIO_LIST_H + +#include <linux/bio.h> + +struct bio_list { +	struct bio *head; +	struct bio *tail; +}; + +static inline void bio_list_init(struct bio_list *bl) +{ +	bl->head = bl->tail = NULL; +} + +static inline void bio_list_add(struct bio_list *bl, struct bio *bio) +{ +	bio->bi_next = NULL; + +	if (bl->tail) +		bl->tail->bi_next = bio; +	else +		bl->head = bio; + +	bl->tail = bio; +} + +static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) +{ +	if (bl->tail) +		bl->tail->bi_next = bl2->head; +	else +		bl->head = bl2->head; + +	bl->tail = bl2->tail; +} + +static inline struct bio *bio_list_pop(struct bio_list *bl) +{ +	struct bio *bio = bl->head; + +	if (bio) { +		bl->head = bl->head->bi_next; +		if (!bl->head) +			bl->tail = NULL; + +		bio->bi_next = NULL; +	} + +	return bio; +} + +static inline struct bio *bio_list_get(struct bio_list *bl) +{ +	struct bio *bio = bl->head; + +	bl->head = bl->tail = NULL; + +	return bio; +} + +#endif diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h new file mode 100644 index 00000000000..d3ec217847d --- /dev/null +++ b/drivers/md/dm-bio-record.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_RECORD_H +#define DM_BIO_RECORD_H + +#include <linux/bio.h> + +/* + * There are lots of mutable fields in the bio struct that get + * changed by the lower levels of the block layer.  Some targets, + * such as multipath, may wish to resubmit a bio on error.  The + * functions in this file help the target record and restore the + * original bio state. + */ +struct dm_bio_details { +	sector_t bi_sector; +	struct block_device *bi_bdev; +	unsigned int bi_size; +	unsigned short bi_idx; +	unsigned long bi_flags; +}; + +static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) +{ +	bd->bi_sector = bio->bi_sector; +	bd->bi_bdev = bio->bi_bdev; +	bd->bi_size = bio->bi_size; +	bd->bi_idx = bio->bi_idx; +	bd->bi_flags = bio->bi_flags; +} + +static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) +{ +	bio->bi_sector = bd->bi_sector; +	bio->bi_bdev = bd->bi_bdev; +	bio->bi_size = bd->bi_size; +	bio->bi_idx = bd->bi_idx; +	bio->bi_flags = bd->bi_flags; +} + +#endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c new file mode 100644 index 00000000000..77619a56e2b --- /dev/null +++ b/drivers/md/dm-crypt.c @@ -0,0 +1,977 @@ +/* + * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> + * + * This file is released under the GPL. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/workqueue.h> +#include <asm/atomic.h> +#include <asm/scatterlist.h> +#include <asm/page.h> + +#include "dm.h" + +#define PFX	"crypt: " + +/* + * per bio private data + */ +struct crypt_io { +	struct dm_target *target; +	struct bio *bio; +	struct bio *first_clone; +	struct work_struct work; +	atomic_t pending; +	int error; +}; + +/* + * context holding the current state of a multi-part conversion + */ +struct convert_context { +	struct bio *bio_in; +	struct bio *bio_out; +	unsigned int offset_in; +	unsigned int offset_out; +	unsigned int idx_in; +	unsigned int idx_out; +	sector_t sector; +	int write; +}; + +struct crypt_config; + +struct crypt_iv_operations { +	int (*ctr)(struct crypt_config *cc, struct dm_target *ti, +	           const char *opts); +	void (*dtr)(struct crypt_config *cc); +	const char *(*status)(struct crypt_config *cc); +	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); +}; + +/* + * Crypt: maps a linear range of a block device + * and encrypts / decrypts at the same time. + */ +struct crypt_config { +	struct dm_dev *dev; +	sector_t start; + +	/* +	 * pool for per bio private data and +	 * for encryption buffer pages +	 */ +	mempool_t *io_pool; +	mempool_t *page_pool; + +	/* +	 * crypto related data +	 */ +	struct crypt_iv_operations *iv_gen_ops; +	char *iv_mode; +	void *iv_gen_private; +	sector_t iv_offset; +	unsigned int iv_size; + +	struct crypto_tfm *tfm; +	unsigned int key_size; +	u8 key[0]; +}; + +#define MIN_IOS        256 +#define MIN_POOL_PAGES 32 +#define MIN_BIO_PAGES  8 + +static kmem_cache_t *_crypt_io_pool; + +/* + * Mempool alloc and free functions for the page + */ +static void *mempool_alloc_page(unsigned int __nocast gfp_mask, void *data) +{ +	return alloc_page(gfp_mask); +} + +static void mempool_free_page(void *page, void *data) +{ +	__free_page(page); +} + + +/* + * Different IV generation algorithms: + * + * plain: the initial vector is the 32-bit low-endian version of the sector + *        number, padded with zeros if neccessary. + * + * ess_iv: "encrypted sector|salt initial vector", the sector number is + *         encrypted with the bulk cipher using a salt as key. The salt + *         should be derived from the bulk cipher's key via hashing. 
+ * + * plumb: unimplemented, see: + * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 + */ + +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +{ +	memset(iv, 0, cc->iv_size); +	*(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + +	return 0; +} + +static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, +	                      const char *opts) +{ +	struct crypto_tfm *essiv_tfm; +	struct crypto_tfm *hash_tfm; +	struct scatterlist sg; +	unsigned int saltsize; +	u8 *salt; + +	if (opts == NULL) { +		ti->error = PFX "Digest algorithm missing for ESSIV mode"; +		return -EINVAL; +	} + +	/* Hash the cipher key with the given hash algorithm */ +	hash_tfm = crypto_alloc_tfm(opts, 0); +	if (hash_tfm == NULL) { +		ti->error = PFX "Error initializing ESSIV hash"; +		return -EINVAL; +	} + +	if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { +		ti->error = PFX "Expected digest algorithm for ESSIV hash"; +		crypto_free_tfm(hash_tfm); +		return -EINVAL; +	} + +	saltsize = crypto_tfm_alg_digestsize(hash_tfm); +	salt = kmalloc(saltsize, GFP_KERNEL); +	if (salt == NULL) { +		ti->error = PFX "Error kmallocing salt storage in ESSIV"; +		crypto_free_tfm(hash_tfm); +		return -ENOMEM; +	} + +	sg.page = virt_to_page(cc->key); +	sg.offset = offset_in_page(cc->key); +	sg.length = cc->key_size; +	crypto_digest_digest(hash_tfm, &sg, 1, salt); +	crypto_free_tfm(hash_tfm); + +	/* Setup the essiv_tfm with the given salt */ +	essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm), +	                             CRYPTO_TFM_MODE_ECB); +	if (essiv_tfm == NULL) { +		ti->error = PFX "Error allocating crypto tfm for ESSIV"; +		kfree(salt); +		return -EINVAL; +	} +	if (crypto_tfm_alg_blocksize(essiv_tfm) +	    != crypto_tfm_alg_ivsize(cc->tfm)) { +		ti->error = PFX "Block size of ESSIV cipher does " +			        "not match IV size of block cipher"; +		crypto_free_tfm(essiv_tfm); +		kfree(salt); +		return -EINVAL; +	} +	if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { +		ti->error = PFX "Failed to set key for ESSIV cipher"; +		crypto_free_tfm(essiv_tfm); +		kfree(salt); +		return -EINVAL; +	} +	kfree(salt); + +	cc->iv_gen_private = (void *)essiv_tfm; +	return 0; +} + +static void crypt_iv_essiv_dtr(struct crypt_config *cc) +{ +	crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private); +	cc->iv_gen_private = NULL; +} + +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +{ +	struct scatterlist sg = { NULL, }; + +	memset(iv, 0, cc->iv_size); +	*(u64 *)iv = cpu_to_le64(sector); + +	sg.page = virt_to_page(iv); +	sg.offset = offset_in_page(iv); +	sg.length = cc->iv_size; +	crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private, +	                      &sg, &sg, cc->iv_size); + +	return 0; +} + +static struct crypt_iv_operations crypt_iv_plain_ops = { +	.generator = crypt_iv_plain_gen +}; + +static struct crypt_iv_operations crypt_iv_essiv_ops = { +	.ctr       = crypt_iv_essiv_ctr, +	.dtr       = crypt_iv_essiv_dtr, +	.generator = crypt_iv_essiv_gen +}; + + +static inline int +crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, +                          struct scatterlist *in, unsigned int length, +                          int write, sector_t sector) +{ +	u8 iv[cc->iv_size]; +	int r; + +	if (cc->iv_gen_ops) { +		r = cc->iv_gen_ops->generator(cc, iv, sector); +		if (r < 0) +			return r; + +		if (write) +			r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv); +		else +			r = 
crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv); +	} else { +		if (write) +			r = crypto_cipher_encrypt(cc->tfm, out, in, length); +		else +			r = crypto_cipher_decrypt(cc->tfm, out, in, length); +	} + +	return r; +} + +static void +crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, +                   struct bio *bio_out, struct bio *bio_in, +                   sector_t sector, int write) +{ +	ctx->bio_in = bio_in; +	ctx->bio_out = bio_out; +	ctx->offset_in = 0; +	ctx->offset_out = 0; +	ctx->idx_in = bio_in ? bio_in->bi_idx : 0; +	ctx->idx_out = bio_out ? bio_out->bi_idx : 0; +	ctx->sector = sector + cc->iv_offset; +	ctx->write = write; +} + +/* + * Encrypt / decrypt data from one bio to another one (can be the same one) + */ +static int crypt_convert(struct crypt_config *cc, +                         struct convert_context *ctx) +{ +	int r = 0; + +	while(ctx->idx_in < ctx->bio_in->bi_vcnt && +	      ctx->idx_out < ctx->bio_out->bi_vcnt) { +		struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); +		struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); +		struct scatterlist sg_in = { +			.page = bv_in->bv_page, +			.offset = bv_in->bv_offset + ctx->offset_in, +			.length = 1 << SECTOR_SHIFT +		}; +		struct scatterlist sg_out = { +			.page = bv_out->bv_page, +			.offset = bv_out->bv_offset + ctx->offset_out, +			.length = 1 << SECTOR_SHIFT +		}; + +		ctx->offset_in += sg_in.length; +		if (ctx->offset_in >= bv_in->bv_len) { +			ctx->offset_in = 0; +			ctx->idx_in++; +		} + +		ctx->offset_out += sg_out.length; +		if (ctx->offset_out >= bv_out->bv_len) { +			ctx->offset_out = 0; +			ctx->idx_out++; +		} + +		r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, +		                              ctx->write, ctx->sector); +		if (r < 0) +			break; + +		ctx->sector++; +	} + +	return r; +} + +/* + * Generate a new unfragmented bio with the given size + * This should never violate the device limitations + * May return a smaller bio when running out of pages + */ +static struct bio * +crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, +                   struct bio *base_bio, unsigned int *bio_vec_idx) +{ +	struct bio *bio; +	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; +	int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; +	unsigned long flags = current->flags; +	unsigned int i; + +	/* +	 * Tell VM to act less aggressively and fail earlier. +	 * This is not necessary but increases throughput. +	 * FIXME: Is this really intelligent? 
+	 */ +	current->flags &= ~PF_MEMALLOC; + +	if (base_bio) +		bio = bio_clone(base_bio, GFP_NOIO); +	else +		bio = bio_alloc(GFP_NOIO, nr_iovecs); +	if (!bio) { +		if (flags & PF_MEMALLOC) +			current->flags |= PF_MEMALLOC; +		return NULL; +	} + +	/* if the last bio was not complete, continue where that one ended */ +	bio->bi_idx = *bio_vec_idx; +	bio->bi_vcnt = *bio_vec_idx; +	bio->bi_size = 0; +	bio->bi_flags &= ~(1 << BIO_SEG_VALID); + +	/* bio->bi_idx pages have already been allocated */ +	size -= bio->bi_idx * PAGE_SIZE; + +	for(i = bio->bi_idx; i < nr_iovecs; i++) { +		struct bio_vec *bv = bio_iovec_idx(bio, i); + +		bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); +		if (!bv->bv_page) +			break; + +		/* +		 * if additional pages cannot be allocated without waiting, +		 * return a partially allocated bio, the caller will then try +		 * to allocate additional bios while submitting this partial bio +		 */ +		if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) +			gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; + +		bv->bv_offset = 0; +		if (size > PAGE_SIZE) +			bv->bv_len = PAGE_SIZE; +		else +			bv->bv_len = size; + +		bio->bi_size += bv->bv_len; +		bio->bi_vcnt++; +		size -= bv->bv_len; +	} + +	if (flags & PF_MEMALLOC) +		current->flags |= PF_MEMALLOC; + +	if (!bio->bi_size) { +		bio_put(bio); +		return NULL; +	} + +	/* +	 * Remember the last bio_vec allocated to be able +	 * to correctly continue after the splitting. +	 */ +	*bio_vec_idx = bio->bi_vcnt; + +	return bio; +} + +static void crypt_free_buffer_pages(struct crypt_config *cc, +                                    struct bio *bio, unsigned int bytes) +{ +	unsigned int i, start, end; +	struct bio_vec *bv; + +	/* +	 * This is ugly, but Jens Axboe thinks that using bi_idx in the +	 * endio function is too dangerous at the moment, so I calculate the +	 * correct position using bi_vcnt and bi_size. +	 * The bv_offset and bv_len fields might already be modified but we +	 * know that we always allocated whole pages. +	 * A fix to the bi_idx issue in the kernel is in the works, so +	 * we will hopefully be able to revert to the cleaner solution soon. +	 */ +	i = bio->bi_vcnt - 1; +	bv = bio_iovec_idx(bio, i); +	end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; +	start = end - bytes; + +	start >>= PAGE_SHIFT; +	if (!bio->bi_size) +		end = bio->bi_vcnt; +	else +		end >>= PAGE_SHIFT; + +	for(i = start; i < end; i++) { +		bv = bio_iovec_idx(bio, i); +		BUG_ON(!bv->bv_page); +		mempool_free(bv->bv_page, cc->page_pool); +		bv->bv_page = NULL; +	} +} + +/* + * One of the bios was finished. Check for completion of + * the whole request and correctly clean up the buffer. + */ +static void dec_pending(struct crypt_io *io, int error) +{ +	struct crypt_config *cc = (struct crypt_config *) io->target->private; + +	if (error < 0) +		io->error = error; + +	if (!atomic_dec_and_test(&io->pending)) +		return; + +	if (io->first_clone) +		bio_put(io->first_clone); + +	bio_endio(io->bio, io->bio->bi_size, io->error); + +	mempool_free(io, cc->io_pool); +} + +/* + * kcryptd: + * + * Needed because it would be very unwise to do decryption in an + * interrupt context, so bios returning from read requests get + * queued here. 
+ */ +static struct workqueue_struct *_kcryptd_workqueue; + +static void kcryptd_do_work(void *data) +{ +	struct crypt_io *io = (struct crypt_io *) data; +	struct crypt_config *cc = (struct crypt_config *) io->target->private; +	struct convert_context ctx; +	int r; + +	crypt_convert_init(cc, &ctx, io->bio, io->bio, +	                   io->bio->bi_sector - io->target->begin, 0); +	r = crypt_convert(cc, &ctx); + +	dec_pending(io, r); +} + +static void kcryptd_queue_io(struct crypt_io *io) +{ +	INIT_WORK(&io->work, kcryptd_do_work, io); +	queue_work(_kcryptd_workqueue, &io->work); +} + +/* + * Decode key from its hex representation + */ +static int crypt_decode_key(u8 *key, char *hex, unsigned int size) +{ +	char buffer[3]; +	char *endp; +	unsigned int i; + +	buffer[2] = '\0'; + +	for(i = 0; i < size; i++) { +		buffer[0] = *hex++; +		buffer[1] = *hex++; + +		key[i] = (u8)simple_strtoul(buffer, &endp, 16); + +		if (endp != &buffer[2]) +			return -EINVAL; +	} + +	if (*hex != '\0') +		return -EINVAL; + +	return 0; +} + +/* + * Encode key into its hex representation + */ +static void crypt_encode_key(char *hex, u8 *key, unsigned int size) +{ +	unsigned int i; + +	for(i = 0; i < size; i++) { +		sprintf(hex, "%02x", *key); +		hex += 2; +		key++; +	} +} + +/* + * Construct an encryption mapping: + * <cipher> <key> <iv_offset> <dev_path> <start> + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	struct crypt_config *cc; +	struct crypto_tfm *tfm; +	char *tmp; +	char *cipher; +	char *chainmode; +	char *ivmode; +	char *ivopts; +	unsigned int crypto_flags; +	unsigned int key_size; + +	if (argc != 5) { +		ti->error = PFX "Not enough arguments"; +		return -EINVAL; +	} + +	tmp = argv[0]; +	cipher = strsep(&tmp, "-"); +	chainmode = strsep(&tmp, "-"); +	ivopts = strsep(&tmp, "-"); +	ivmode = strsep(&ivopts, ":"); + +	if (tmp) +		DMWARN(PFX "Unexpected additional cipher options"); + +	key_size = strlen(argv[1]) >> 1; + +	cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); +	if (cc == NULL) { +		ti->error = +			PFX "Cannot allocate transparent encryption context"; +		return -ENOMEM; +	} + +	cc->key_size = key_size; +	if ((!key_size && strcmp(argv[1], "-") != 0) || +	    (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { +		ti->error = PFX "Error decoding key"; +		goto bad1; +	} + +	/* Compatiblity mode for old dm-crypt cipher strings */ +	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { +		chainmode = "cbc"; +		ivmode = "plain"; +	} + +	/* Choose crypto_flags according to chainmode */ +	if (strcmp(chainmode, "cbc") == 0) +		crypto_flags = CRYPTO_TFM_MODE_CBC; +	else if (strcmp(chainmode, "ecb") == 0) +		crypto_flags = CRYPTO_TFM_MODE_ECB; +	else { +		ti->error = PFX "Unknown chaining mode"; +		goto bad1; +	} + +	if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { +		ti->error = PFX "This chaining mode requires an IV mechanism"; +		goto bad1; +	} + +	tfm = crypto_alloc_tfm(cipher, crypto_flags); +	if (!tfm) { +		ti->error = PFX "Error allocating crypto tfm"; +		goto bad1; +	} +	if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) { +		ti->error = PFX "Expected cipher algorithm"; +		goto bad2; +	} + +	cc->tfm = tfm; + +	/* +	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>". 
+	 * See comments at iv code +	 */ + +	if (ivmode == NULL) +		cc->iv_gen_ops = NULL; +	else if (strcmp(ivmode, "plain") == 0) +		cc->iv_gen_ops = &crypt_iv_plain_ops; +	else if (strcmp(ivmode, "essiv") == 0) +		cc->iv_gen_ops = &crypt_iv_essiv_ops; +	else { +		ti->error = PFX "Invalid IV mode"; +		goto bad2; +	} + +	if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && +	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) +		goto bad2; + +	if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) +		/* at least a 64 bit sector number should fit in our buffer */ +		cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), +		                  (unsigned int)(sizeof(u64) / sizeof(u8))); +	else { +		cc->iv_size = 0; +		if (cc->iv_gen_ops) { +			DMWARN(PFX "Selected cipher does not support IVs"); +			if (cc->iv_gen_ops->dtr) +				cc->iv_gen_ops->dtr(cc); +			cc->iv_gen_ops = NULL; +		} +	} + +	cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, +				     mempool_free_slab, _crypt_io_pool); +	if (!cc->io_pool) { +		ti->error = PFX "Cannot allocate crypt io mempool"; +		goto bad3; +	} + +	cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page, +				       mempool_free_page, NULL); +	if (!cc->page_pool) { +		ti->error = PFX "Cannot allocate page mempool"; +		goto bad4; +	} + +	if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { +		ti->error = PFX "Error setting key"; +		goto bad5; +	} + +	if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) { +		ti->error = PFX "Invalid iv_offset sector"; +		goto bad5; +	} + +	if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) { +		ti->error = PFX "Invalid device sector"; +		goto bad5; +	} + +	if (dm_get_device(ti, argv[3], cc->start, ti->len, +	                  dm_table_get_mode(ti->table), &cc->dev)) { +		ti->error = PFX "Device lookup failed"; +		goto bad5; +	} + +	if (ivmode && cc->iv_gen_ops) { +		if (ivopts) +			*(ivopts - 1) = ':'; +		cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); +		if (!cc->iv_mode) { +			ti->error = PFX "Error kmallocing iv_mode string"; +			goto bad5; +		} +		strcpy(cc->iv_mode, ivmode); +	} else +		cc->iv_mode = NULL; + +	ti->private = cc; +	return 0; + +bad5: +	mempool_destroy(cc->page_pool); +bad4: +	mempool_destroy(cc->io_pool); +bad3: +	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) +		cc->iv_gen_ops->dtr(cc); +bad2: +	crypto_free_tfm(tfm); +bad1: +	kfree(cc); +	return -EINVAL; +} + +static void crypt_dtr(struct dm_target *ti) +{ +	struct crypt_config *cc = (struct crypt_config *) ti->private; + +	mempool_destroy(cc->page_pool); +	mempool_destroy(cc->io_pool); + +	if (cc->iv_mode) +		kfree(cc->iv_mode); +	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) +		cc->iv_gen_ops->dtr(cc); +	crypto_free_tfm(cc->tfm); +	dm_put_device(ti, cc->dev); +	kfree(cc); +} + +static int crypt_endio(struct bio *bio, unsigned int done, int error) +{ +	struct crypt_io *io = (struct crypt_io *) bio->bi_private; +	struct crypt_config *cc = (struct crypt_config *) io->target->private; + +	if (bio_data_dir(bio) == WRITE) { +		/* +		 * free the processed pages, even if +		 * it's only a partially completed write +		 */ +		crypt_free_buffer_pages(cc, bio, done); +	} + +	if (bio->bi_size) +		return 1; + +	bio_put(bio); + +	/* +	 * successful reads are decrypted by the worker thread +	 */ +	if ((bio_data_dir(bio) == READ) +	    && bio_flagged(bio, BIO_UPTODATE)) { +		kcryptd_queue_io(io); +		return 0; +	} + +	dec_pending(io, error); +	return error; +} + +static inline struct bio * +crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct 
bio *bio, +            sector_t sector, unsigned int *bvec_idx, +            struct convert_context *ctx) +{ +	struct bio *clone; + +	if (bio_data_dir(bio) == WRITE) { +		clone = crypt_alloc_buffer(cc, bio->bi_size, +                                 io->first_clone, bvec_idx); +		if (clone) { +			ctx->bio_out = clone; +			if (crypt_convert(cc, ctx) < 0) { +				crypt_free_buffer_pages(cc, clone, +				                        clone->bi_size); +				bio_put(clone); +				return NULL; +			} +		} +	} else { +		/* +		 * The block layer might modify the bvec array, so always +		 * copy the required bvecs because we need the original +		 * one in order to decrypt the whole bio data *afterwards*. +		 */ +		clone = bio_alloc(GFP_NOIO, bio_segments(bio)); +		if (clone) { +			clone->bi_idx = 0; +			clone->bi_vcnt = bio_segments(bio); +			clone->bi_size = bio->bi_size; +			memcpy(clone->bi_io_vec, bio_iovec(bio), +			       sizeof(struct bio_vec) * clone->bi_vcnt); +		} +	} + +	if (!clone) +		return NULL; + +	clone->bi_private = io; +	clone->bi_end_io = crypt_endio; +	clone->bi_bdev = cc->dev->bdev; +	clone->bi_sector = cc->start + sector; +	clone->bi_rw = bio->bi_rw; + +	return clone; +} + +static int crypt_map(struct dm_target *ti, struct bio *bio, +		     union map_info *map_context) +{ +	struct crypt_config *cc = (struct crypt_config *) ti->private; +	struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); +	struct convert_context ctx; +	struct bio *clone; +	unsigned int remaining = bio->bi_size; +	sector_t sector = bio->bi_sector - ti->begin; +	unsigned int bvec_idx = 0; + +	io->target = ti; +	io->bio = bio; +	io->first_clone = NULL; +	io->error = 0; +	atomic_set(&io->pending, 1); /* hold a reference */ + +	if (bio_data_dir(bio) == WRITE) +		crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); + +	/* +	 * The allocated buffers can be smaller than the whole bio, +	 * so repeat the whole process until all the data can be handled. 
+	 */ +	while (remaining) { +		clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); +		if (!clone) +			goto cleanup; + +		if (!io->first_clone) { +			/* +			 * hold a reference to the first clone, because it +			 * holds the bio_vec array and that can't be freed +			 * before all other clones are released +			 */ +			bio_get(clone); +			io->first_clone = clone; +		} +		atomic_inc(&io->pending); + +		remaining -= clone->bi_size; +		sector += bio_sectors(clone); + +		generic_make_request(clone); + +		/* out of memory -> run queues */ +		if (remaining) +			blk_congestion_wait(bio_data_dir(clone), HZ/100); +	} + +	/* drop reference, clones could have returned before we reach this */ +	dec_pending(io, 0); +	return 0; + +cleanup: +	if (io->first_clone) { +		dec_pending(io, -ENOMEM); +		return 0; +	} + +	/* if no bio has been dispatched yet, we can directly return the error */ +	mempool_free(io, cc->io_pool); +	return -ENOMEM; +} + +static int crypt_status(struct dm_target *ti, status_type_t type, +			char *result, unsigned int maxlen) +{ +	struct crypt_config *cc = (struct crypt_config *) ti->private; +	const char *cipher; +	const char *chainmode = NULL; +	unsigned int sz = 0; + +	switch (type) { +	case STATUSTYPE_INFO: +		result[0] = '\0'; +		break; + +	case STATUSTYPE_TABLE: +		cipher = crypto_tfm_alg_name(cc->tfm); + +		switch(cc->tfm->crt_cipher.cit_mode) { +		case CRYPTO_TFM_MODE_CBC: +			chainmode = "cbc"; +			break; +		case CRYPTO_TFM_MODE_ECB: +			chainmode = "ecb"; +			break; +		default: +			BUG(); +		} + +		if (cc->iv_mode) +			DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); +		else +			DMEMIT("%s-%s ", cipher, chainmode); + +		if (cc->key_size > 0) { +			if ((maxlen - sz) < ((cc->key_size << 1) + 1)) +				return -ENOMEM; + +			crypt_encode_key(result + sz, cc->key, cc->key_size); +			sz += cc->key_size << 1; +		} else { +			if (sz >= maxlen) +				return -ENOMEM; +			result[sz++] = '-'; +		} + +		DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT, +		       cc->iv_offset, cc->dev->name, cc->start); +		break; +	} +	return 0; +} + +static struct target_type crypt_target = { +	.name   = "crypt", +	.version= {1, 1, 0}, +	.module = THIS_MODULE, +	.ctr    = crypt_ctr, +	.dtr    = crypt_dtr, +	.map    = crypt_map, +	.status = crypt_status, +}; + +static int __init dm_crypt_init(void) +{ +	int r; + +	_crypt_io_pool = kmem_cache_create("dm-crypt_io", +	                                   sizeof(struct crypt_io), +	                                   0, 0, NULL, NULL); +	if (!_crypt_io_pool) +		return -ENOMEM; + +	_kcryptd_workqueue = create_workqueue("kcryptd"); +	if (!_kcryptd_workqueue) { +		r = -ENOMEM; +		DMERR(PFX "couldn't create kcryptd"); +		goto bad1; +	} + +	r = dm_register_target(&crypt_target); +	if (r < 0) { +		DMERR(PFX "register failed %d", r); +		goto bad2; +	} + +	return 0; + +bad2: +	destroy_workqueue(_kcryptd_workqueue); +bad1: +	kmem_cache_destroy(_crypt_io_pool); +	return r; +} + +static void __exit dm_crypt_exit(void) +{ +	int r = dm_unregister_target(&crypt_target); + +	if (r < 0) +		DMERR(PFX "unregister failed %d", r); + +	destroy_workqueue(_kcryptd_workqueue); +	kmem_cache_destroy(_crypt_io_pool); +} + +module_init(dm_crypt_init); +module_exit(dm_crypt_exit); + +MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c new file mode 100644 index 00000000000..70065866459 --- /dev/null +++ b/drivers/md/dm-emc.c 
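The dm-bio-record.h helpers introduced earlier exist so that a target which may need to resubmit a bio (multipath being the stated example) can snapshot the mutable bio fields before mapping and put them back on error. Below is a minimal sketch of that pattern; the retry_ctx container and the function names are hypothetical, and only struct dm_bio_details, dm_bio_record() and dm_bio_restore() are taken from this patch.

```c
#include <linux/bio.h>
#include <linux/blkdev.h>
#include "dm-bio-record.h"

/* Hypothetical bookkeeping for code that may resubmit a bio after an error. */
struct retry_ctx {
	struct dm_bio_details details;	/* saved bi_sector, bi_bdev, bi_size, ... */
	struct block_device *alt_bdev;	/* where to retry on failure */
};

static void retry_submit(struct retry_ctx *rc, struct bio *bio,
			 struct block_device *dest)
{
	dm_bio_record(&rc->details, bio);	/* remember the original state */
	bio->bi_bdev = dest;			/* remap and send it down */
	generic_make_request(bio);
}

static void retry_after_error(struct retry_ctx *rc, struct bio *bio)
{
	/*
	 * The lower layers may have advanced bi_idx and shrunk bi_size;
	 * restore the recorded fields before sending the bio elsewhere.
	 */
	dm_bio_restore(&rc->details, bio);
	bio->bi_bdev = rc->alt_bdev;
	generic_make_request(bio);
}
```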
@@ -0,0 +1,359 @@ +/* + * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath support for EMC CLARiiON AX/CX-series hardware. + */ + +#include "dm.h" +#include "dm-hw-handler.h" +#include <scsi/scsi.h> +#include <scsi/scsi_cmnd.h> + +struct emc_handler { +	spinlock_t lock; + +	/* Whether we should send the short trespass command (FC-series) +	 * or the long version (default for AX/CX CLARiiON arrays). */ +	unsigned short_trespass; +	/* Whether or not to honor SCSI reservations when initiating a +	 * switch-over. Default: Don't. */ +	unsigned hr; + +	unsigned char sense[SCSI_SENSE_BUFFERSIZE]; +}; + +#define TRESPASS_PAGE 0x22 +#define EMC_FAILOVER_TIMEOUT (60 * HZ) + +/* Code borrowed from dm-lsi-rdac by Mike Christie */ + +static inline void free_bio(struct bio *bio) +{ +	__free_page(bio->bi_io_vec[0].bv_page); +	bio_put(bio); +} + +static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) +{ +	struct path *path = bio->bi_private; + +	if (bio->bi_size) +		return 1; + +	/* We also need to look at the sense keys here whether or not to +	 * switch to the next PG etc. +	 * +	 * For now simple logic: either it works or it doesn't. +	 */ +	if (error) +		dm_pg_init_complete(path, MP_FAIL_PATH); +	else +		dm_pg_init_complete(path, 0); + +	/* request is freed in block layer */ +	free_bio(bio); + +	return 0; +} + +static struct bio *get_failover_bio(struct path *path, unsigned data_size) +{ +	struct bio *bio; +	struct page *page; + +	bio = bio_alloc(GFP_ATOMIC, 1); +	if (!bio) { +		DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); +		return NULL; +	} + +	bio->bi_rw |= (1 << BIO_RW); +	bio->bi_bdev = path->dev->bdev; +	bio->bi_sector = 0; +	bio->bi_private = path; +	bio->bi_end_io = emc_endio; + +	page = alloc_page(GFP_ATOMIC); +	if (!page) { +		DMERR("dm-emc: get_failover_bio: alloc_page() failed."); +		bio_put(bio); +		return NULL; +	} + +	if (bio_add_page(bio, page, data_size, 0) != data_size) { +		DMERR("dm-emc: get_failover_bio: alloc_page() failed."); +		__free_page(page); +		bio_put(bio); +		return NULL; +	} + +	return bio; +} + +static struct request *get_failover_req(struct emc_handler *h, +					struct bio *bio, struct path *path) +{ +	struct request *rq; +	struct block_device *bdev = bio->bi_bdev; +	struct request_queue *q = bdev_get_queue(bdev); + +	/* FIXME: Figure out why it fails with GFP_ATOMIC. */ +	rq = blk_get_request(q, WRITE, __GFP_WAIT); +	if (!rq) { +		DMERR("dm-emc: get_failover_req: blk_get_request failed"); +		return NULL; +	} + +	rq->bio = rq->biotail = bio; +	blk_rq_bio_prep(q, rq, bio); + +	rq->rq_disk = bdev->bd_contains->bd_disk; + +	/* bio backed don't set data */ +	rq->buffer = rq->data = NULL; +	/* rq data_len used for pc cmd's request_bufflen */ +	rq->data_len = bio->bi_size; + +	rq->sense = h->sense; +	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); +	rq->sense_len = 0; + +	memset(&rq->cmd, 0, BLK_MAX_CDB); + +	rq->timeout = EMC_FAILOVER_TIMEOUT; +	rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + +	return rq; +} + +static struct request *emc_trespass_get(struct emc_handler *h, +					struct path *path) +{ +	struct bio *bio; +	struct request *rq; +	unsigned char *page22; +	unsigned char long_trespass_pg[] = { +		0, 0, 0, 0, +		TRESPASS_PAGE,        /* Page code */ +		0x09,                 /* Page length - 2 */ +		h->hr ? 
0x01 : 0x81,  /* Trespass code + Honor reservation bit */ +		0xff, 0xff,           /* Trespass target */ +		0, 0, 0, 0, 0, 0      /* Reserved bytes / unknown */ +		}; +	unsigned char short_trespass_pg[] = { +		0, 0, 0, 0, +		TRESPASS_PAGE,        /* Page code */ +		0x02,                 /* Page length - 2 */ +		h->hr ? 0x01 : 0x81,  /* Trespass code + Honor reservation bit */ +		0xff,                 /* Trespass target */ +		}; +	unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : +				sizeof(long_trespass_pg); + +	/* get bio backing */ +	if (data_size > PAGE_SIZE) +		/* this should never happen */ +		return NULL; + +	bio = get_failover_bio(path, data_size); +	if (!bio) { +		DMERR("dm-emc: emc_trespass_get: no bio"); +		return NULL; +	} + +	page22 = (unsigned char *)bio_data(bio); +	memset(page22, 0, data_size); + +	memcpy(page22, h->short_trespass ? +		short_trespass_pg : long_trespass_pg, data_size); + +	/* get request for block layer packet command */ +	rq = get_failover_req(h, bio, path); +	if (!rq) { +		DMERR("dm-emc: emc_trespass_get: no rq"); +		free_bio(bio); +		return NULL; +	} + +	/* Prepare the command. */ +	rq->cmd[0] = MODE_SELECT; +	rq->cmd[1] = 0x10; +	rq->cmd[4] = data_size; +	rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); + +	return rq; +} + +static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, +			struct path *path) +{ +	struct request *rq; +	struct request_queue *q = bdev_get_queue(path->dev->bdev); + +	/* +	 * We can either blindly init the pg (then look at the sense), +	 * or we can send some commands to get the state here (then +	 * possibly send the fo cmnd), or we can also have the +	 * initial state passed into us and then get an update here. +	 */ +	if (!q) { +		DMINFO("dm-emc: emc_pg_init: no queue"); +		goto fail_path; +	} + +	/* FIXME: The request should be pre-allocated. 
*/ +	rq = emc_trespass_get(hwh->context, path); +	if (!rq) { +		DMERR("dm-emc: emc_pg_init: no rq"); +		goto fail_path; +	} + +	DMINFO("dm-emc: emc_pg_init: sending switch-over command"); +	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); +	return; + +fail_path: +	dm_pg_init_complete(path, MP_FAIL_PATH); +} + +static struct emc_handler *alloc_emc_handler(void) +{ +	struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); + +	if (h) +		spin_lock_init(&h->lock); + +	return h; +} + +static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) +{ +	struct emc_handler *h; +	unsigned hr, short_trespass; + +	if (argc == 0) { +		/* No arguments: use defaults */ +		hr = 0; +		short_trespass = 0; +	} else if (argc != 2) { +		DMWARN("dm-emc hwhandler: incorrect number of arguments"); +		return -EINVAL; +	} else { +		if ((sscanf(argv[0], "%u", &short_trespass) != 1) +			|| (short_trespass > 1)) { +			DMWARN("dm-emc: invalid trespass mode selected"); +			return -EINVAL; +		} + +		if ((sscanf(argv[1], "%u", &hr) != 1) +			|| (hr > 1)) { +			DMWARN("dm-emc: invalid honor reservation flag selected"); +			return -EINVAL; +		} +	} + +	h = alloc_emc_handler(); +	if (!h) +		return -ENOMEM; + +	memset(h, 0, sizeof(*h)); + +	hwh->context = h; + +	if ((h->short_trespass = short_trespass)) +		DMWARN("dm-emc: short trespass command will be send"); +	else +		DMWARN("dm-emc: long trespass command will be send"); + +	if ((h->hr = hr)) +		DMWARN("dm-emc: honor reservation bit will be set"); +	else +		DMWARN("dm-emc: honor reservation bit will not be set (default)"); + +	return 0; +} + +static void emc_destroy(struct hw_handler *hwh) +{ +	struct emc_handler *h = (struct emc_handler *) hwh->context; + +	kfree(h); +	hwh->context = NULL; +} + +static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) +{ +	/* FIXME: Patch from axboe still missing */ +#if 0 +	int sense; + +	if (bio->bi_error & BIO_SENSE) { +		sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ + +		if (sense == 0x020403) { +			/* LUN Not Ready - Manual Intervention Required +			 * indicates this is a passive path. +			 * +			 * FIXME: However, if this is seen and EVPD C0 +			 * indicates that this is due to a NDU in +			 * progress, we should set FAIL_PATH too. +			 * This indicates we might have to do a SCSI +			 * inquiry in the end_io path. Ugh. */ +			return MP_BYPASS_PG | MP_RETRY_IO; +		} else if (sense == 0x052501) { +			/* An array based copy is in progress. Do not +			 * fail the path, do not bypass to another PG, +			 * do not retry. Fail the IO immediately. +			 * (Actually this is the same conclusion as in +			 * the default handler, but lets make sure.) */ +			return 0; +		} else if (sense == 0x062900) { +			/* Unit Attention Code. This is the first IO +			 * to the new path, so just retry. 
*/ +			return MP_RETRY_IO; +		} +	} +#endif + +	/* Try default handler */ +	return dm_scsi_err_handler(hwh, bio); +} + +static struct hw_handler_type emc_hwh = { +	.name = "emc", +	.module = THIS_MODULE, +	.create = emc_create, +	.destroy = emc_destroy, +	.pg_init = emc_pg_init, +	.error = emc_error, +}; + +static int __init dm_emc_init(void) +{ +	int r = dm_register_hw_handler(&emc_hwh); + +	if (r < 0) +		DMERR("emc: register failed %d", r); + +	DMINFO("dm-emc version 0.0.3 loaded"); + +	return r; +} + +static void __exit dm_emc_exit(void) +{ +	int r = dm_unregister_hw_handler(&emc_hwh); + +	if (r < 0) +		DMERR("emc: unregister failed %d", r); +} + +module_init(dm_emc_init); +module_exit(dm_emc_exit); + +MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); +MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c new file mode 100644 index 00000000000..17212b4201a --- /dev/null +++ b/drivers/md/dm-exception-store.c @@ -0,0 +1,648 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-snap.h" +#include "dm-io.h" +#include "kcopyd.h" + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device.  The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store.  It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable.  It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format.  The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +struct disk_header { +	uint32_t magic; + +	/* +	 * Is this snapshot valid.  There is no way of recovering +	 * an invalid snapshot. +	 */ +	uint32_t valid; + +	/* +	 * Simple, incrementing version. no backward +	 * compatibility. +	 */ +	uint32_t version; + +	/* In sectors */ +	uint32_t chunk_size; +}; + +struct disk_exception { +	uint64_t old_chunk; +	uint64_t new_chunk; +}; + +struct commit_callback { +	void (*callback)(void *, int success); +	void *context; +}; + +/* + * The top level structure for a persistent exception store. 
+ */ +struct pstore { +	struct dm_snapshot *snap;	/* up pointer to my snapshot */ +	int version; +	int valid; +	uint32_t chunk_size; +	uint32_t exceptions_per_area; + +	/* +	 * Now that we have an asynchronous kcopyd there is no +	 * need for large chunk sizes, so it wont hurt to have a +	 * whole chunks worth of metadata in memory at once. +	 */ +	void *area; + +	/* +	 * Used to keep track of which metadata area the data in +	 * 'chunk' refers to. +	 */ +	uint32_t current_area; + +	/* +	 * The next free chunk for an exception. +	 */ +	uint32_t next_free; + +	/* +	 * The index of next free exception in the current +	 * metadata area. +	 */ +	uint32_t current_committed; + +	atomic_t pending_count; +	uint32_t callback_count; +	struct commit_callback *callbacks; +}; + +static inline unsigned int sectors_to_pages(unsigned int sectors) +{ +	return sectors / (PAGE_SIZE >> 9); +} + +static int alloc_area(struct pstore *ps) +{ +	int r = -ENOMEM; +	size_t len; + +	len = ps->chunk_size << SECTOR_SHIFT; + +	/* +	 * Allocate the chunk_size block of memory that will hold +	 * a single metadata area. +	 */ +	ps->area = vmalloc(len); +	if (!ps->area) +		return r; + +	return 0; +} + +static void free_area(struct pstore *ps) +{ +	vfree(ps->area); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) +{ +	struct io_region where; +	unsigned long bits; + +	where.bdev = ps->snap->cow->bdev; +	where.sector = ps->chunk_size * chunk; +	where.count = ps->chunk_size; + +	return dm_io_sync_vm(1, &where, rw, ps->area, &bits); +} + +/* + * Read or write a metadata area.  Remembering to skip the first + * chunk which holds the header. + */ +static int area_io(struct pstore *ps, uint32_t area, int rw) +{ +	int r; +	uint32_t chunk; + +	/* convert a metadata area index to a chunk index */ +	chunk = 1 + ((ps->exceptions_per_area + 1) * area); + +	r = chunk_io(ps, chunk, rw); +	if (r) +		return r; + +	ps->current_area = area; +	return 0; +} + +static int zero_area(struct pstore *ps, uint32_t area) +{ +	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); +	return area_io(ps, area, WRITE); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ +	int r; +	struct disk_header *dh; + +	r = chunk_io(ps, 0, READ); +	if (r) +		return r; + +	dh = (struct disk_header *) ps->area; + +	if (le32_to_cpu(dh->magic) == 0) { +		*new_snapshot = 1; + +	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { +		*new_snapshot = 0; +		ps->valid = le32_to_cpu(dh->valid); +		ps->version = le32_to_cpu(dh->version); +		ps->chunk_size = le32_to_cpu(dh->chunk_size); + +	} else { +		DMWARN("Invalid/corrupt snapshot"); +		r = -ENXIO; +	} + +	return r; +} + +static int write_header(struct pstore *ps) +{ +	struct disk_header *dh; + +	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + +	dh = (struct disk_header *) ps->area; +	dh->magic = cpu_to_le32(SNAP_MAGIC); +	dh->valid = cpu_to_le32(ps->valid); +	dh->version = cpu_to_le32(ps->version); +	dh->chunk_size = cpu_to_le32(ps->chunk_size); + +	return chunk_io(ps, 0, WRITE); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. 
+ */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ +	if (index >= ps->exceptions_per_area) +		return NULL; + +	return ((struct disk_exception *) ps->area) + index; +} + +static int read_exception(struct pstore *ps, +			  uint32_t index, struct disk_exception *result) +{ +	struct disk_exception *e; + +	e = get_exception(ps, index); +	if (!e) +		return -EINVAL; + +	/* copy it */ +	result->old_chunk = le64_to_cpu(e->old_chunk); +	result->new_chunk = le64_to_cpu(e->new_chunk); + +	return 0; +} + +static int write_exception(struct pstore *ps, +			   uint32_t index, struct disk_exception *de) +{ +	struct disk_exception *e; + +	e = get_exception(ps, index); +	if (!e) +		return -EINVAL; + +	/* copy it */ +	e->old_chunk = cpu_to_le64(de->old_chunk); +	e->new_chunk = cpu_to_le64(de->new_chunk); + +	return 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, int *full) +{ +	int r; +	unsigned int i; +	struct disk_exception de; + +	/* presume the area is full */ +	*full = 1; + +	for (i = 0; i < ps->exceptions_per_area; i++) { +		r = read_exception(ps, i, &de); + +		if (r) +			return r; + +		/* +		 * If the new_chunk is pointing at the start of +		 * the COW device, where the first metadata area +		 * is we know that we've hit the end of the +		 * exceptions.  Therefore the area is not full. +		 */ +		if (de.new_chunk == 0LL) { +			ps->current_committed = i; +			*full = 0; +			break; +		} + +		/* +		 * Keep track of the start of the free chunks. +		 */ +		if (ps->next_free <= de.new_chunk) +			ps->next_free = de.new_chunk + 1; + +		/* +		 * Otherwise we add the exception to the snapshot. +		 */ +		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); +		if (r) +			return r; +	} + +	return 0; +} + +static int read_exceptions(struct pstore *ps) +{ +	uint32_t area; +	int r, full = 1; + +	/* +	 * Keeping reading chunks and inserting exceptions until +	 * we find a partially full area. +	 */ +	for (area = 0; full; area++) { +		r = area_io(ps, area, READ); +		if (r) +			return r; + +		r = insert_exceptions(ps, &full); +		if (r) +			return r; +	} + +	return 0; +} + +static inline struct pstore *get_info(struct exception_store *store) +{ +	return (struct pstore *) store->context; +} + +static void persistent_fraction_full(struct exception_store *store, +				     sector_t *numerator, sector_t *denominator) +{ +	*numerator = get_info(store)->next_free * store->snap->chunk_size; +	*denominator = get_dev_size(store->snap->cow->bdev); +} + +static void persistent_destroy(struct exception_store *store) +{ +	struct pstore *ps = get_info(store); + +	dm_io_put(sectors_to_pages(ps->chunk_size)); +	vfree(ps->callbacks); +	free_area(ps); +	kfree(ps); +} + +static int persistent_read_metadata(struct exception_store *store) +{ +	int r, new_snapshot; +	struct pstore *ps = get_info(store); + +	/* +	 * Read the snapshot header. +	 */ +	r = read_header(ps, &new_snapshot); +	if (r) +		return r; + +	/* +	 * Do we need to setup a new snapshot ? +	 */ +	if (new_snapshot) { +		r = write_header(ps); +		if (r) { +			DMWARN("write_header failed"); +			return r; +		} + +		r = zero_area(ps, 0); +		if (r) { +			DMWARN("zero_area(0) failed"); +			return r; +		} + +	} else { +		/* +		 * Sanity checks. 
+		 */ +		if (!ps->valid) { +			DMWARN("snapshot is marked invalid"); +			return -EINVAL; +		} + +		if (ps->version != SNAPSHOT_DISK_VERSION) { +			DMWARN("unable to handle snapshot disk version %d", +			       ps->version); +			return -EINVAL; +		} + +		/* +		 * Read the metadata. +		 */ +		r = read_exceptions(ps); +		if (r) +			return r; +	} + +	return 0; +} + +static int persistent_prepare(struct exception_store *store, +			      struct exception *e) +{ +	struct pstore *ps = get_info(store); +	uint32_t stride; +	sector_t size = get_dev_size(store->snap->cow->bdev); + +	/* Is there enough room ? */ +	if (size < ((ps->next_free + 1) * store->snap->chunk_size)) +		return -ENOSPC; + +	e->new_chunk = ps->next_free; + +	/* +	 * Move onto the next free pending, making sure to take +	 * into account the location of the metadata chunks. +	 */ +	stride = (ps->exceptions_per_area + 1); +	if ((++ps->next_free % stride) == 1) +		ps->next_free++; + +	atomic_inc(&ps->pending_count); +	return 0; +} + +static void persistent_commit(struct exception_store *store, +			      struct exception *e, +			      void (*callback) (void *, int success), +			      void *callback_context) +{ +	int r; +	unsigned int i; +	struct pstore *ps = get_info(store); +	struct disk_exception de; +	struct commit_callback *cb; + +	de.old_chunk = e->old_chunk; +	de.new_chunk = e->new_chunk; +	write_exception(ps, ps->current_committed++, &de); + +	/* +	 * Add the callback to the back of the array.  This code +	 * is the only place where the callback array is +	 * manipulated, and we know that it will never be called +	 * multiple times concurrently. +	 */ +	cb = ps->callbacks + ps->callback_count++; +	cb->callback = callback; +	cb->context = callback_context; + +	/* +	 * If there are no more exceptions in flight, or we have +	 * filled this metadata area we commit the exceptions to +	 * disk. +	 */ +	if (atomic_dec_and_test(&ps->pending_count) || +	    (ps->current_committed == ps->exceptions_per_area)) { +		r = area_io(ps, ps->current_area, WRITE); +		if (r) +			ps->valid = 0; + +		for (i = 0; i < ps->callback_count; i++) { +			cb = ps->callbacks + i; +			cb->callback(cb->context, r == 0 ? 1 : 0); +		} + +		ps->callback_count = 0; +	} + +	/* +	 * Have we completely filled the current area ? +	 */ +	if (ps->current_committed == ps->exceptions_per_area) { +		ps->current_committed = 0; +		r = zero_area(ps, ps->current_area + 1); +		if (r) +			ps->valid = 0; +	} +} + +static void persistent_drop(struct exception_store *store) +{ +	struct pstore *ps = get_info(store); + +	ps->valid = 0; +	if (write_header(ps)) +		DMWARN("write header failed"); +} + +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +{ +	int r; +	struct pstore *ps; + +	r = dm_io_get(sectors_to_pages(chunk_size)); +	if (r) +		return r; + +	/* allocate the pstore */ +	ps = kmalloc(sizeof(*ps), GFP_KERNEL); +	if (!ps) { +		r = -ENOMEM; +		goto bad; +	} + +	ps->snap = store->snap; +	ps->valid = 1; +	ps->version = SNAPSHOT_DISK_VERSION; +	ps->chunk_size = chunk_size; +	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / +	    sizeof(struct disk_exception); +	ps->next_free = 2;	/* skipping the header and first area */ +	ps->current_committed = 0; + +	r = alloc_area(ps); +	if (r) +		goto bad; + +	/* +	 * Allocate space for all the callbacks. 
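+	 * One slot per exception in a metadata area is sufficient,
+	 * because the callback array is drained each time an area is
+	 * committed to disk.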
+	 */ +	ps->callback_count = 0; +	atomic_set(&ps->pending_count, 0); +	ps->callbacks = dm_vcalloc(ps->exceptions_per_area, +				   sizeof(*ps->callbacks)); + +	if (!ps->callbacks) { +		r = -ENOMEM; +		goto bad; +	} + +	store->destroy = persistent_destroy; +	store->read_metadata = persistent_read_metadata; +	store->prepare_exception = persistent_prepare; +	store->commit_exception = persistent_commit; +	store->drop_snapshot = persistent_drop; +	store->fraction_full = persistent_fraction_full; +	store->context = ps; + +	return 0; + +      bad: +	dm_io_put(sectors_to_pages(chunk_size)); +	if (ps) { +		if (ps->area) +			free_area(ps); + +		kfree(ps); +	} +	return r; +} + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. + *---------------------------------------------------------------*/ +struct transient_c { +	sector_t next_free; +}; + +static void transient_destroy(struct exception_store *store) +{ +	kfree(store->context); +} + +static int transient_read_metadata(struct exception_store *store) +{ +	return 0; +} + +static int transient_prepare(struct exception_store *store, struct exception *e) +{ +	struct transient_c *tc = (struct transient_c *) store->context; +	sector_t size = get_dev_size(store->snap->cow->bdev); + +	if (size < (tc->next_free + store->snap->chunk_size)) +		return -1; + +	e->new_chunk = sector_to_chunk(store->snap, tc->next_free); +	tc->next_free += store->snap->chunk_size; + +	return 0; +} + +static void transient_commit(struct exception_store *store, +		      struct exception *e, +		      void (*callback) (void *, int success), +		      void *callback_context) +{ +	/* Just succeed */ +	callback(callback_context, 1); +} + +static void transient_fraction_full(struct exception_store *store, +				    sector_t *numerator, sector_t *denominator) +{ +	*numerator = ((struct transient_c *) store->context)->next_free; +	*denominator = get_dev_size(store->snap->cow->bdev); +} + +int dm_create_transient(struct exception_store *store, +			struct dm_snapshot *s, int blocksize) +{ +	struct transient_c *tc; + +	memset(store, 0, sizeof(*store)); +	store->destroy = transient_destroy; +	store->read_metadata = transient_read_metadata; +	store->prepare_exception = transient_prepare; +	store->commit_exception = transient_commit; +	store->fraction_full = transient_fraction_full; +	store->snap = s; + +	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); +	if (!tc) +		return -ENOMEM; + +	tc->next_free = 0; +	store->context = tc; + +	return 0; +} diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c new file mode 100644 index 00000000000..ae63772e44c --- /dev/null +++ b/drivers/md/dm-hw-handler.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath hardware handler registration. 
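+ *
+ * Handler types are looked up by name (loading a dm-<name> module on
+ * demand) and reference counted while in use.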
+ */ + +#include "dm.h" +#include "dm-hw-handler.h" + +#include <linux/slab.h> + +struct hwh_internal { +	struct hw_handler_type hwht; + +	struct list_head list; +	long use; +}; + +#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) + +static LIST_HEAD(_hw_handlers); +static DECLARE_RWSEM(_hwh_lock); + +struct hwh_internal *__find_hw_handler_type(const char *name) +{ +	struct hwh_internal *hwhi; + +	list_for_each_entry(hwhi, &_hw_handlers, list) { +		if (!strcmp(name, hwhi->hwht.name)) +			return hwhi; +	} + +	return NULL; +} + +static struct hwh_internal *get_hw_handler(const char *name) +{ +	struct hwh_internal *hwhi; + +	down_read(&_hwh_lock); +	hwhi = __find_hw_handler_type(name); +	if (hwhi) { +		if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) +			hwhi = NULL; +		else +			hwhi->use++; +	} +	up_read(&_hwh_lock); + +	return hwhi; +} + +struct hw_handler_type *dm_get_hw_handler(const char *name) +{ +	struct hwh_internal *hwhi; + +	if (!name) +		return NULL; + +	hwhi = get_hw_handler(name); +	if (!hwhi) { +		request_module("dm-%s", name); +		hwhi = get_hw_handler(name); +	} + +	return hwhi ? &hwhi->hwht : NULL; +} + +void dm_put_hw_handler(struct hw_handler_type *hwht) +{ +	struct hwh_internal *hwhi; + +	if (!hwht) +		return; + +	down_read(&_hwh_lock); +	hwhi = __find_hw_handler_type(hwht->name); +	if (!hwhi) +		goto out; + +	if (--hwhi->use == 0) +		module_put(hwhi->hwht.module); + +	if (hwhi->use < 0) +		BUG(); + +      out: +	up_read(&_hwh_lock); +} + +static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) +{ +	struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); + +	if (hwhi) { +		memset(hwhi, 0, sizeof(*hwhi)); +		hwhi->hwht = *hwht; +	} + +	return hwhi; +} + +int dm_register_hw_handler(struct hw_handler_type *hwht) +{ +	int r = 0; +	struct hwh_internal *hwhi = _alloc_hw_handler(hwht); + +	if (!hwhi) +		return -ENOMEM; + +	down_write(&_hwh_lock); + +	if (__find_hw_handler_type(hwht->name)) { +		kfree(hwhi); +		r = -EEXIST; +	} else +		list_add(&hwhi->list, &_hw_handlers); + +	up_write(&_hwh_lock); + +	return r; +} + +int dm_unregister_hw_handler(struct hw_handler_type *hwht) +{ +	struct hwh_internal *hwhi; + +	down_write(&_hwh_lock); + +	hwhi = __find_hw_handler_type(hwht->name); +	if (!hwhi) { +		up_write(&_hwh_lock); +		return -EINVAL; +	} + +	if (hwhi->use) { +		up_write(&_hwh_lock); +		return -ETXTBSY; +	} + +	list_del(&hwhi->list); + +	up_write(&_hwh_lock); + +	kfree(hwhi); + +	return 0; +} + +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) +{ +#if 0 +	int sense_key, asc, ascq; + +	if (bio->bi_error & BIO_SENSE) { +		/* FIXME: This is just an initial guess. */ +		/* key / asc / ascq */ +		sense_key = (bio->bi_error >> 16) & 0xff; +		asc = (bio->bi_error >> 8) & 0xff; +		ascq = bio->bi_error & 0xff; + +		switch (sense_key) { +			/* This block as a whole comes from the device. +			 * So no point retrying on another path. */ +		case 0x03:	/* Medium error */ +		case 0x05:	/* Illegal request */ +		case 0x07:	/* Data protect */ +		case 0x08:	/* Blank check */ +		case 0x0a:	/* copy aborted */ +		case 0x0c:	/* obsolete - no clue ;-) */ +		case 0x0d:	/* volume overflow */ +		case 0x0e:	/* data miscompare */ +		case 0x0f:	/* reserved - no idea either. */ +			return MP_ERROR_IO; + +			/* For these errors it's unclear whether they +			 * come from the device or the controller. 
+			 * So just lets try a different path, and if +			 * it eventually succeeds, user-space will clear +			 * the paths again... */ +		case 0x02:	/* Not ready */ +		case 0x04:	/* Hardware error */ +		case 0x09:	/* vendor specific */ +		case 0x0b:	/* Aborted command */ +			return MP_FAIL_PATH; + +		case 0x06:	/* Unit attention - might want to decode */ +			if (asc == 0x04 && ascq == 0x01) +				/* "Unit in the process of +				 * becoming ready" */ +				return 0; +			return MP_FAIL_PATH; + +			/* FIXME: For Unit Not Ready we may want +			 * to have a generic pg activation +			 * feature (START_UNIT). */ + +			/* Should these two ever end up in the +			 * error path? I don't think so. */ +		case 0x00:	/* No sense */ +		case 0x01:	/* Recovered error */ +			return 0; +		} +	} +#endif + +	/* We got no idea how to decode the other kinds of errors -> +	 * assume generic error condition. */ +	return MP_FAIL_PATH; +} + +EXPORT_SYMBOL_GPL(dm_register_hw_handler); +EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); +EXPORT_SYMBOL_GPL(dm_scsi_err_handler); diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h new file mode 100644 index 00000000000..15f5629e231 --- /dev/null +++ b/drivers/md/dm-hw-handler.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath hardware handler registration. + */ + +#ifndef	DM_HW_HANDLER_H +#define	DM_HW_HANDLER_H + +#include <linux/device-mapper.h> + +#include "dm-mpath.h" + +struct hw_handler_type; +struct hw_handler { +	struct hw_handler_type *type; +	void *context; +}; + +/* + * Constructs a hardware handler object, takes custom arguments + */ +/* Information about a hardware handler type */ +struct hw_handler_type { +	char *name; +	struct module *module; + +	int (*create) (struct hw_handler *handler, unsigned int argc, +		       char **argv); +	void (*destroy) (struct hw_handler *hwh); + +	void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, +			 struct path *path); +	unsigned (*error) (struct hw_handler *hwh, struct bio *bio); +	int (*status) (struct hw_handler *hwh, status_type_t type, +		       char *result, unsigned int maxlen); +}; + +/* Register a hardware handler */ +int dm_register_hw_handler(struct hw_handler_type *type); + +/* Unregister a hardware handler */ +int dm_unregister_hw_handler(struct hw_handler_type *type); + +/* Returns a registered hardware handler type */ +struct hw_handler_type *dm_get_hw_handler(const char *name); + +/* Releases a hardware handler  */ +void dm_put_hw_handler(struct hw_handler_type *hwht); + +/* Default err function */ +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); + +/* Error flags for err and dm_pg_init_complete */ +#define MP_FAIL_PATH 1 +#define MP_BYPASS_PG 2 +#define MP_ERROR_IO  4	/* Don't retry this I/O */ + +#endif diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c new file mode 100644 index 00000000000..45754bb6a79 --- /dev/null +++ b/drivers/md/dm-io.c @@ -0,0 +1,426 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#include "dm-io.h" + +#include <linux/bio.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> + +static struct bio_set *_bios; + +/* FIXME: can we shrink this ? */ +struct io { +	unsigned long error; +	atomic_t count; +	struct task_struct *sleeper; +	io_notify_fn callback; +	void *context; +}; + +/* + * io contexts are only dynamically allocated for asynchronous + * io. 
 Since async io is likely to be the majority of io we'll + * have the same number of io contexts as buffer heads ! (FIXME: + * must reduce this). + */ +static unsigned _num_ios; +static mempool_t *_io_pool; + +static void *alloc_io(unsigned int __nocast gfp_mask, void *pool_data) +{ +	return kmalloc(sizeof(struct io), gfp_mask); +} + +static void free_io(void *element, void *pool_data) +{ +	kfree(element); +} + +static unsigned int pages_to_ios(unsigned int pages) +{ +	return 4 * pages;	/* too many ? */ +} + +static int resize_pool(unsigned int new_ios) +{ +	int r = 0; + +	if (_io_pool) { +		if (new_ios == 0) { +			/* free off the pool */ +			mempool_destroy(_io_pool); +			_io_pool = NULL; +			bioset_free(_bios); + +		} else { +			/* resize the pool */ +			r = mempool_resize(_io_pool, new_ios, GFP_KERNEL); +		} + +	} else { +		/* create new pool */ +		_io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); +		if (!_io_pool) +			return -ENOMEM; + +		_bios = bioset_create(16, 16, 4); +		if (!_bios) { +			mempool_destroy(_io_pool); +			_io_pool = NULL; +			return -ENOMEM; +		} +	} + +	if (!r) +		_num_ios = new_ios; + +	return r; +} + +int dm_io_get(unsigned int num_pages) +{ +	return resize_pool(_num_ios + pages_to_ios(num_pages)); +} + +void dm_io_put(unsigned int num_pages) +{ +	resize_pool(_num_ios - pages_to_ios(num_pages)); +} + +/*----------------------------------------------------------------- + * We need to keep track of which region a bio is doing io for. + * In order to save a memory allocation we store this the last + * bvec which we know is unused (blech). + * XXX This is ugly and can OOPS with some configs... find another way. + *---------------------------------------------------------------*/ +static inline void bio_set_region(struct bio *bio, unsigned region) +{ +	bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; +} + +static inline unsigned bio_get_region(struct bio *bio) +{ +	return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bios that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void dec_count(struct io *io, unsigned int region, int error) +{ +	if (error) +		set_bit(region, &io->error); + +	if (atomic_dec_and_test(&io->count)) { +		if (io->sleeper) +			wake_up_process(io->sleeper); + +		else { +			int r = io->error; +			io_notify_fn fn = io->callback; +			void *context = io->context; + +			mempool_free(io, _io_pool); +			fn(r, context); +		} +	} +} + +static int endio(struct bio *bio, unsigned int done, int error) +{ +	struct io *io = (struct io *) bio->bi_private; + +	/* keep going until we've finished */ +	if (bio->bi_size) +		return 1; + +	if (error && bio_data_dir(bio) == READ) +		zero_fill_bio(bio); + +	dec_count(io, bio_get_region(bio), error); +	bio_put(bio); + +	return 0; +} + +/*----------------------------------------------------------------- + * These little objects provide an abstraction for getting a new + * destination page for io. + *---------------------------------------------------------------*/ +struct dpages { +	void (*get_page)(struct dpages *dp, +			 struct page **p, unsigned long *len, unsigned *offset); +	void (*next_page)(struct dpages *dp); + +	unsigned context_u; +	void *context_ptr; +}; + +/* + * Functions for getting the pages from a list. 
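+ * context_ptr walks the page_list chain and context_u is the offset
+ * into the current page; only the first page of a list may start at a
+ * non-zero offset.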
+ */ +static void list_get_page(struct dpages *dp, +		  struct page **p, unsigned long *len, unsigned *offset) +{ +	unsigned o = dp->context_u; +	struct page_list *pl = (struct page_list *) dp->context_ptr; + +	*p = pl->page; +	*len = PAGE_SIZE - o; +	*offset = o; +} + +static void list_next_page(struct dpages *dp) +{ +	struct page_list *pl = (struct page_list *) dp->context_ptr; +	dp->context_ptr = pl->next; +	dp->context_u = 0; +} + +static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset) +{ +	dp->get_page = list_get_page; +	dp->next_page = list_next_page; +	dp->context_u = offset; +	dp->context_ptr = pl; +} + +/* + * Functions for getting the pages from a bvec. + */ +static void bvec_get_page(struct dpages *dp, +		  struct page **p, unsigned long *len, unsigned *offset) +{ +	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; +	*p = bvec->bv_page; +	*len = bvec->bv_len; +	*offset = bvec->bv_offset; +} + +static void bvec_next_page(struct dpages *dp) +{ +	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; +	dp->context_ptr = bvec + 1; +} + +static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) +{ +	dp->get_page = bvec_get_page; +	dp->next_page = bvec_next_page; +	dp->context_ptr = bvec; +} + +static void vm_get_page(struct dpages *dp, +		 struct page **p, unsigned long *len, unsigned *offset) +{ +	*p = vmalloc_to_page(dp->context_ptr); +	*offset = dp->context_u; +	*len = PAGE_SIZE - dp->context_u; +} + +static void vm_next_page(struct dpages *dp) +{ +	dp->context_ptr += PAGE_SIZE - dp->context_u; +	dp->context_u = 0; +} + +static void vm_dp_init(struct dpages *dp, void *data) +{ +	dp->get_page = vm_get_page; +	dp->next_page = vm_next_page; +	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); +	dp->context_ptr = data; +} + +/*----------------------------------------------------------------- + * IO routines that accept a list of pages. + *---------------------------------------------------------------*/ +static void do_region(int rw, unsigned int region, struct io_region *where, +		      struct dpages *dp, struct io *io) +{ +	struct bio *bio; +	struct page *page; +	unsigned long len; +	unsigned offset; +	unsigned num_bvecs; +	sector_t remaining = where->count; + +	while (remaining) { +		/* +		 * Allocate a suitably sized bio, we add an extra +		 * bvec for bio_get/set_region(). +		 */ +		num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; +		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); +		bio->bi_sector = where->sector + (where->count - remaining); +		bio->bi_bdev = where->bdev; +		bio->bi_end_io = endio; +		bio->bi_private = io; +		bio_set_region(bio, region); + +		/* +		 * Try and add as many pages as possible. +		 */ +		while (remaining) { +			dp->get_page(dp, &page, &len, &offset); +			len = min(len, to_bytes(remaining)); +			if (!bio_add_page(bio, page, len, offset)) +				break; + +			offset = 0; +			remaining -= to_sector(len); +			dp->next_page(dp); +		} + +		atomic_inc(&io->count); +		submit_bio(rw, bio); +	} +} + +static void dispatch_io(int rw, unsigned int num_regions, +			struct io_region *where, struct dpages *dp, +			struct io *io, int sync) +{ +	int i; +	struct dpages old_pages = *dp; + +	if (sync) +		rw |= (1 << BIO_RW_SYNC); + +	/* +	 * For multiple regions we need to be careful to rewind +	 * the dp object for each call to do_region. 
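+	 * do_region() advances the dpages cursor as it builds bios, so
+	 * each region must start again from the copy saved above.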
+	 */ +	for (i = 0; i < num_regions; i++) { +		*dp = old_pages; +		if (where[i].count) +			do_region(rw, i, where + i, dp, io); +	} + +	/* +	 * Drop the extra refence that we were holding to avoid +	 * the io being completed too early. +	 */ +	dec_count(io, 0, 0); +} + +static int sync_io(unsigned int num_regions, struct io_region *where, +	    int rw, struct dpages *dp, unsigned long *error_bits) +{ +	struct io io; + +	if (num_regions > 1 && rw != WRITE) { +		WARN_ON(1); +		return -EIO; +	} + +	io.error = 0; +	atomic_set(&io.count, 1); /* see dispatch_io() */ +	io.sleeper = current; + +	dispatch_io(rw, num_regions, where, dp, &io, 1); + +	while (1) { +		set_current_state(TASK_UNINTERRUPTIBLE); + +		if (!atomic_read(&io.count) || signal_pending(current)) +			break; + +		io_schedule(); +	} +	set_current_state(TASK_RUNNING); + +	if (atomic_read(&io.count)) +		return -EINTR; + +	*error_bits = io.error; +	return io.error ? -EIO : 0; +} + +static int async_io(unsigned int num_regions, struct io_region *where, int rw, +	     struct dpages *dp, io_notify_fn fn, void *context) +{ +	struct io *io; + +	if (num_regions > 1 && rw != WRITE) { +		WARN_ON(1); +		fn(1, context); +		return -EIO; +	} + +	io = mempool_alloc(_io_pool, GFP_NOIO); +	io->error = 0; +	atomic_set(&io->count, 1); /* see dispatch_io() */ +	io->sleeper = NULL; +	io->callback = fn; +	io->context = context; + +	dispatch_io(rw, num_regions, where, dp, io, 0); +	return 0; +} + +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, +	       struct page_list *pl, unsigned int offset, +	       unsigned long *error_bits) +{ +	struct dpages dp; +	list_dp_init(&dp, pl, offset); +	return sync_io(num_regions, where, rw, &dp, error_bits); +} + +int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, +		    struct bio_vec *bvec, unsigned long *error_bits) +{ +	struct dpages dp; +	bvec_dp_init(&dp, bvec); +	return sync_io(num_regions, where, rw, &dp, error_bits); +} + +int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, +		  void *data, unsigned long *error_bits) +{ +	struct dpages dp; +	vm_dp_init(&dp, data); +	return sync_io(num_regions, where, rw, &dp, error_bits); +} + +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, +		struct page_list *pl, unsigned int offset, +		io_notify_fn fn, void *context) +{ +	struct dpages dp; +	list_dp_init(&dp, pl, offset); +	return async_io(num_regions, where, rw, &dp, fn, context); +} + +int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, +		     struct bio_vec *bvec, io_notify_fn fn, void *context) +{ +	struct dpages dp; +	bvec_dp_init(&dp, bvec); +	return async_io(num_regions, where, rw, &dp, fn, context); +} + +int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, +		   void *data, io_notify_fn fn, void *context) +{ +	struct dpages dp; +	vm_dp_init(&dp, data); +	return async_io(num_regions, where, rw, &dp, fn, context); +} + +EXPORT_SYMBOL(dm_io_get); +EXPORT_SYMBOL(dm_io_put); +EXPORT_SYMBOL(dm_io_sync); +EXPORT_SYMBOL(dm_io_async); +EXPORT_SYMBOL(dm_io_sync_bvec); +EXPORT_SYMBOL(dm_io_async_bvec); +EXPORT_SYMBOL(dm_io_sync_vm); +EXPORT_SYMBOL(dm_io_async_vm); diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h new file mode 100644 index 00000000000..1a77f326570 --- /dev/null +++ b/drivers/md/dm-io.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. 
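+ *
+ * Synchronous and asynchronous io to one or more regions of block
+ * devices, fed from a page_list, a bio_vec array or a vmalloc'd
+ * buffer.  A synchronous, vmalloc-backed read might look roughly
+ * like this (illustrative sketch only; bdev is the target block
+ * device and buf a vmalloc'd buffer supplied by the caller):
+ *
+ *	struct io_region where = { .bdev = bdev, .sector = 0, .count = 8 };
+ *	unsigned long error_bits;
+ *
+ *	dm_io_get(1);
+ *	dm_io_sync_vm(1, &where, READ, buf, &error_bits);
+ *	dm_io_put(1);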
+ */ + +#ifndef _DM_IO_H +#define _DM_IO_H + +#include "dm.h" + +/* FIXME make this configurable */ +#define DM_MAX_IO_REGIONS 8 + +struct io_region { +	struct block_device *bdev; +	sector_t sector; +	sector_t count; +}; + +struct page_list { +	struct page_list *next; +	struct page *page; +}; + + +/* + * 'error' is a bitset, with each bit indicating whether an error + * occurred doing io to the corresponding region. + */ +typedef void (*io_notify_fn)(unsigned long error, void *context); + + +/* + * Before anyone uses the IO interface they should call + * dm_io_get(), specifying roughly how many pages they are + * expecting to perform io on concurrently. + * + * This function may block. + */ +int dm_io_get(unsigned int num_pages); +void dm_io_put(unsigned int num_pages); + +/* + * Synchronous IO. + * + * Please ensure that the rw flag in the next two functions is + * either READ or WRITE, ie. we don't take READA.  Any + * regions with a zero count field will be ignored. + */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, +	       struct page_list *pl, unsigned int offset, +	       unsigned long *error_bits); + +int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, +		    struct bio_vec *bvec, unsigned long *error_bits); + +int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, +		  void *data, unsigned long *error_bits); + +/* + * Aynchronous IO. + * + * The 'where' array may be safely allocated on the stack since + * the function takes a copy. + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, +		struct page_list *pl, unsigned int offset, +		io_notify_fn fn, void *context); + +int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, +		     struct bio_vec *bvec, io_notify_fn fn, void *context); + +int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, +		   void *data, io_notify_fn fn, void *context); + +#endif diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c new file mode 100644 index 00000000000..ee3c869d970 --- /dev/null +++ b/drivers/md/dm-ioctl.c @@ -0,0 +1,1416 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/dm-ioctl.h> + +#include <asm/uaccess.h> + +#define DM_DRIVER_EMAIL "dm-devel@redhat.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. + *---------------------------------------------------------------*/ +struct hash_cell { +	struct list_head name_list; +	struct list_head uuid_list; + +	char *name; +	char *uuid; +	struct mapped_device *md; +	struct dm_table *new_map; +}; + +struct vers_iter { +    size_t param_size; +    struct dm_target_versions *vers, *old_vers; +    char *end; +    uint32_t flags; +}; + + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static void dm_hash_remove_all(void); + +/* + * Guards access to both hash tables. 
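+ * Lookups take it for reading; insertion, removal and renaming of
+ * devices take it for writing.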
+ */ +static DECLARE_RWSEM(_hash_lock); + +static void init_buckets(struct list_head *buckets) +{ +	unsigned int i; + +	for (i = 0; i < NUM_BUCKETS; i++) +		INIT_LIST_HEAD(buckets + i); +} + +static int dm_hash_init(void) +{ +	init_buckets(_name_buckets); +	init_buckets(_uuid_buckets); +	devfs_mk_dir(DM_DIR); +	return 0; +} + +static void dm_hash_exit(void) +{ +	dm_hash_remove_all(); +	devfs_remove(DM_DIR); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ +	const unsigned int hash_mult = 2654435387U; +	unsigned int h = 0; + +	while (*str) +		h = (h + (unsigned int) *str++) * hash_mult; + +	return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ +	struct hash_cell *hc; +	unsigned int h = hash_str(str); + +	list_for_each_entry (hc, _name_buckets + h, name_list) +		if (!strcmp(hc->name, str)) +			return hc; + +	return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ +	struct hash_cell *hc; +	unsigned int h = hash_str(str); + +	list_for_each_entry (hc, _uuid_buckets + h, uuid_list) +		if (!strcmp(hc->uuid, str)) +			return hc; + +	return NULL; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. + *---------------------------------------------------------------*/ +static inline char *kstrdup(const char *str) +{ +	char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); +	if (r) +		strcpy(r, str); +	return r; +} + +static struct hash_cell *alloc_cell(const char *name, const char *uuid, +				    struct mapped_device *md) +{ +	struct hash_cell *hc; + +	hc = kmalloc(sizeof(*hc), GFP_KERNEL); +	if (!hc) +		return NULL; + +	hc->name = kstrdup(name); +	if (!hc->name) { +		kfree(hc); +		return NULL; +	} + +	if (!uuid) +		hc->uuid = NULL; + +	else { +		hc->uuid = kstrdup(uuid); +		if (!hc->uuid) { +			kfree(hc->name); +			kfree(hc); +			return NULL; +		} +	} + +	INIT_LIST_HEAD(&hc->name_list); +	INIT_LIST_HEAD(&hc->uuid_list); +	hc->md = md; +	hc->new_map = NULL; +	return hc; +} + +static void free_cell(struct hash_cell *hc) +{ +	if (hc) { +		kfree(hc->name); +		kfree(hc->uuid); +		kfree(hc); +	} +} + +/* + * devfs stuff. + */ +static int register_with_devfs(struct hash_cell *hc) +{ +	struct gendisk *disk = dm_disk(hc->md); + +	devfs_mk_bdev(MKDEV(disk->major, disk->first_minor), +		      S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, +		      DM_DIR "/%s", hc->name); +	return 0; +} + +static int unregister_with_devfs(struct hash_cell *hc) +{ +	devfs_remove(DM_DIR"/%s", hc->name); +	return 0; +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ +	struct hash_cell *cell; + +	/* +	 * Allocate the new cells. +	 */ +	cell = alloc_cell(name, uuid, md); +	if (!cell) +		return -ENOMEM; + +	/* +	 * Insert the cell into both hash tables. 
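+	 * The name must be unique; a uuid is optional but, when given,
+	 * must be unique too.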
+	 */ +	down_write(&_hash_lock); +	if (__get_name_cell(name)) +		goto bad; + +	list_add(&cell->name_list, _name_buckets + hash_str(name)); + +	if (uuid) { +		if (__get_uuid_cell(uuid)) { +			list_del(&cell->name_list); +			goto bad; +		} +		list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); +	} +	register_with_devfs(cell); +	dm_get(md); +	dm_set_mdptr(md, cell); +	up_write(&_hash_lock); + +	return 0; + + bad: +	up_write(&_hash_lock); +	free_cell(cell); +	return -EBUSY; +} + +static void __hash_remove(struct hash_cell *hc) +{ +	/* remove from the dev hash */ +	list_del(&hc->uuid_list); +	list_del(&hc->name_list); +	unregister_with_devfs(hc); +	dm_set_mdptr(hc->md, NULL); +	dm_put(hc->md); +	if (hc->new_map) +		dm_table_put(hc->new_map); +	free_cell(hc); +} + +static void dm_hash_remove_all(void) +{ +	int i; +	struct hash_cell *hc; +	struct list_head *tmp, *n; + +	down_write(&_hash_lock); +	for (i = 0; i < NUM_BUCKETS; i++) { +		list_for_each_safe (tmp, n, _name_buckets + i) { +			hc = list_entry(tmp, struct hash_cell, name_list); +			__hash_remove(hc); +		} +	} +	up_write(&_hash_lock); +} + +static int dm_hash_rename(const char *old, const char *new) +{ +	char *new_name, *old_name; +	struct hash_cell *hc; + +	/* +	 * duplicate new. +	 */ +	new_name = kstrdup(new); +	if (!new_name) +		return -ENOMEM; + +	down_write(&_hash_lock); + +	/* +	 * Is new free ? +	 */ +	hc = __get_name_cell(new); +	if (hc) { +		DMWARN("asked to rename to an already existing name %s -> %s", +		       old, new); +		up_write(&_hash_lock); +		kfree(new_name); +		return -EBUSY; +	} + +	/* +	 * Is there such a device as 'old' ? +	 */ +	hc = __get_name_cell(old); +	if (!hc) { +		DMWARN("asked to rename a non existent device %s -> %s", +		       old, new); +		up_write(&_hash_lock); +		kfree(new_name); +		return -ENXIO; +	} + +	/* +	 * rename and move the name cell. +	 */ +	unregister_with_devfs(hc); + +	list_del(&hc->name_list); +	old_name = hc->name; +	hc->name = new_name; +	list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + +	/* rename the device node in devfs */ +	register_with_devfs(hc); + +	up_write(&_hash_lock); +	kfree(old_name); +	return 0; +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct dm_ioctl *param, size_t param_size) +{ +	dm_hash_remove_all(); +	param->data_size = 0; +	return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. + */ +#define ALIGN_MASK 7 +static inline void *align_ptr(void *ptr) +{ +	return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. 
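+ * The payload begins at the first 8-byte boundary after the fixed
+ * header; *len is whatever remains of the user-supplied buffer beyond
+ * that point.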
+ */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, +			       size_t *len) +{ +	param->data_start = align_ptr(param + 1) - (void *) param; + +	if (param->data_start < param_size) +		*len = param_size - param->data_start; +	else +		*len = 0; + +	return ((void *) param) + param->data_start; +} + +static int list_devices(struct dm_ioctl *param, size_t param_size) +{ +	unsigned int i; +	struct hash_cell *hc; +	size_t len, needed = 0; +	struct gendisk *disk; +	struct dm_name_list *nl, *old_nl = NULL; + +	down_write(&_hash_lock); + +	/* +	 * Loop through all the devices working out how much +	 * space we need. +	 */ +	for (i = 0; i < NUM_BUCKETS; i++) { +		list_for_each_entry (hc, _name_buckets + i, name_list) { +			needed += sizeof(struct dm_name_list); +			needed += strlen(hc->name) + 1; +			needed += ALIGN_MASK; +		} +	} + +	/* +	 * Grab our output buffer. +	 */ +	nl = get_result_buffer(param, param_size, &len); +	if (len < needed) { +		param->flags |= DM_BUFFER_FULL_FLAG; +		goto out; +	} +	param->data_size = param->data_start + needed; + +	nl->dev = 0;	/* Flags no data */ + +	/* +	 * Now loop through filling out the names. +	 */ +	for (i = 0; i < NUM_BUCKETS; i++) { +		list_for_each_entry (hc, _name_buckets + i, name_list) { +			if (old_nl) +				old_nl->next = (uint32_t) ((void *) nl - +							   (void *) old_nl); +			disk = dm_disk(hc->md); +			nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); +			nl->next = 0; +			strcpy(nl->name, hc->name); + +			old_nl = nl; +			nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); +		} +	} + + out: +	up_write(&_hash_lock); +	return 0; +} + +static void list_version_get_needed(struct target_type *tt, void *needed_param) +{ +    size_t *needed = needed_param; + +    *needed += strlen(tt->name); +    *needed += sizeof(tt->version); +    *needed += ALIGN_MASK; +} + +static void list_version_get_info(struct target_type *tt, void *param) +{ +    struct vers_iter *info = param; + +    /* Check space - it might have changed since the first iteration */ +    if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > +	info->end) { + +	info->flags = DM_BUFFER_FULL_FLAG; +	return; +    } + +    if (info->old_vers) +	info->old_vers->next = (uint32_t) ((void *)info->vers - +					   (void *)info->old_vers); +    info->vers->version[0] = tt->version[0]; +    info->vers->version[1] = tt->version[1]; +    info->vers->version[2] = tt->version[2]; +    info->vers->next = 0; +    strcpy(info->vers->name, tt->name); + +    info->old_vers = info->vers; +    info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); +} + +static int list_versions(struct dm_ioctl *param, size_t param_size) +{ +	size_t len, needed = 0; +	struct dm_target_versions *vers; +	struct vers_iter iter_info; + +	/* +	 * Loop through all the devices working out how much +	 * space we need. +	 */ +	dm_target_iterate(list_version_get_needed, &needed); + +	/* +	 * Grab our output buffer. +	 */ +	vers = get_result_buffer(param, param_size, &len); +	if (len < needed) { +		param->flags |= DM_BUFFER_FULL_FLAG; +		goto out; +	} +	param->data_size = param->data_start + needed; + +	iter_info.param_size = param_size; +	iter_info.old_vers = NULL; +	iter_info.vers = vers; +	iter_info.flags = 0; +	iter_info.end = (char *)vers+len; + +	/* +	 * Now loop through filling out the names & versions. 
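+	 * A target registered after the sizing pass above can still
+	 * overflow the buffer, in which case DM_BUFFER_FULL_FLAG is set
+	 * by the iterator.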
+	 */ +	dm_target_iterate(list_version_get_info, &iter_info); +	param->flags |= iter_info.flags; + + out: +	return 0; +} + + + +static int check_name(const char *name) +{ +	if (strchr(name, '/')) { +		DMWARN("invalid device name"); +		return -EINVAL; +	} + +	return 0; +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ +	struct gendisk *disk = dm_disk(md); +	struct dm_table *table; +	struct block_device *bdev; + +	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | +			  DM_ACTIVE_PRESENT_FLAG); + +	if (dm_suspended(md)) +		param->flags |= DM_SUSPEND_FLAG; + +	param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); + +	if (!(param->flags & DM_SKIP_BDGET_FLAG)) { +		bdev = bdget_disk(disk, 0); +		if (!bdev) +			return -ENXIO; + +		/* +		 * Yes, this will be out of date by the time it gets back +		 * to userland, but it is still very useful for +		 * debugging. +		 */ +		param->open_count = bdev->bd_openers; +		bdput(bdev); +	} else +		param->open_count = -1; + +	if (disk->policy) +		param->flags |= DM_READONLY_FLAG; + +	param->event_nr = dm_get_event_nr(md); + +	table = dm_get_table(md); +	if (table) { +		param->flags |= DM_ACTIVE_PRESENT_FLAG; +		param->target_count = dm_table_get_num_targets(table); +		dm_table_put(table); +	} else +		param->target_count = 0; + +	return 0; +} + +static int dev_create(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct mapped_device *md; + +	r = check_name(param->name); +	if (r) +		return r; + +	if (param->flags & DM_PERSISTENT_DEV_FLAG) +		r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); +	else +		r = dm_create(&md); + +	if (r) +		return r; + +	r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); +	if (r) { +		dm_put(md); +		return r; +	} + +	param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + +	r = __dev_status(md, param); +	dm_put(md); + +	return r; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ +	if (*param->uuid) +		return __get_uuid_cell(param->uuid); +	else if (*param->name) +		return __get_name_cell(param->name); +	else +		return dm_get_mdptr(huge_decode_dev(param->dev)); +} + +static inline struct mapped_device *find_device(struct dm_ioctl *param) +{ +	struct hash_cell *hc; +	struct mapped_device *md = NULL; + +	down_read(&_hash_lock); +	hc = __find_device_hash_cell(param); +	if (hc) { +		md = hc->md; +		dm_get(md); + +		/* +		 * Sneakily write in both the name and the uuid +		 * while we have the cell. +		 */ +		strncpy(param->name, hc->name, sizeof(param->name)); +		if (hc->uuid) +			strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); +		else +			param->uuid[0] = '\0'; + +		if (hc->new_map) +			param->flags |= DM_INACTIVE_PRESENT_FLAG; +		else +			param->flags &= ~DM_INACTIVE_PRESENT_FLAG; +	} +	up_read(&_hash_lock); + +	return md; +} + +static int dev_remove(struct dm_ioctl *param, size_t param_size) +{ +	struct hash_cell *hc; + +	down_write(&_hash_lock); +	hc = __find_device_hash_cell(param); + +	if (!hc) { +		DMWARN("device doesn't appear to be in the dev hash table."); +		up_write(&_hash_lock); +		return -ENXIO; +	} + +	__hash_remove(hc); +	up_write(&_hash_lock); +	param->data_size = 0; +	return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. 
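+ * Returns 0 if a terminating NUL is found before 'end', -EINVAL
+ * otherwise.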
+ */ +static int invalid_str(char *str, void *end) +{ +	while ((void *) str < end) +		if (!*str++) +			return 0; + +	return -EINVAL; +} + +static int dev_rename(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	char *new_name = (char *) param + param->data_start; + +	if (new_name < (char *) (param + 1) || +	    invalid_str(new_name, (void *) param + param_size)) { +		DMWARN("Invalid new logical volume name supplied."); +		return -EINVAL; +	} + +	r = check_name(new_name); +	if (r) +		return r; + +	param->data_size = 0; +	return dm_hash_rename(param->name, new_name); +} + +static int do_suspend(struct dm_ioctl *param) +{ +	int r = 0; +	struct mapped_device *md; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	if (!dm_suspended(md)) +		r = dm_suspend(md); + +	if (!r) +		r = __dev_status(md, param); + +	dm_put(md); +	return r; +} + +static int do_resume(struct dm_ioctl *param) +{ +	int r = 0; +	struct hash_cell *hc; +	struct mapped_device *md; +	struct dm_table *new_map; + +	down_write(&_hash_lock); + +	hc = __find_device_hash_cell(param); +	if (!hc) { +		DMWARN("device doesn't appear to be in the dev hash table."); +		up_write(&_hash_lock); +		return -ENXIO; +	} + +	md = hc->md; +	dm_get(md); + +	new_map = hc->new_map; +	hc->new_map = NULL; +	param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + +	up_write(&_hash_lock); + +	/* Do we need to load a new map ? */ +	if (new_map) { +		/* Suspend if it isn't already suspended */ +		if (!dm_suspended(md)) +			dm_suspend(md); + +		r = dm_swap_table(md, new_map); +		if (r) { +			dm_put(md); +			dm_table_put(new_map); +			return r; +		} + +		if (dm_table_get_mode(new_map) & FMODE_WRITE) +			set_disk_ro(dm_disk(md), 0); +		else +			set_disk_ro(dm_disk(md), 1); + +		dm_table_put(new_map); +	} + +	if (dm_suspended(md)) +		r = dm_resume(md); + +	if (!r) +		r = __dev_status(md, param); + +	dm_put(md); +	return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. + */ +static int dev_suspend(struct dm_ioctl *param, size_t param_size) +{ +	if (param->flags & DM_SUSPEND_FLAG) +		return do_suspend(param); + +	return do_resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. 
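+ * Everything interesting happens in __dev_status(); this wrapper only
+ * looks the device up and drops the reference afterwards.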
+ */ +static int dev_status(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct mapped_device *md; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	r = __dev_status(md, param); +	dm_put(md); +	return r; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, +			    struct dm_ioctl *param, size_t param_size) +{ +	unsigned int i, num_targets; +	struct dm_target_spec *spec; +	char *outbuf, *outptr; +	status_type_t type; +	size_t remaining, len, used = 0; + +	outptr = outbuf = get_result_buffer(param, param_size, &len); + +	if (param->flags & DM_STATUS_TABLE_FLAG) +		type = STATUSTYPE_TABLE; +	else +		type = STATUSTYPE_INFO; + +	/* Get all the target info */ +	num_targets = dm_table_get_num_targets(table); +	for (i = 0; i < num_targets; i++) { +		struct dm_target *ti = dm_table_get_target(table, i); + +		remaining = len - (outptr - outbuf); +		if (remaining <= sizeof(struct dm_target_spec)) { +			param->flags |= DM_BUFFER_FULL_FLAG; +			break; +		} + +		spec = (struct dm_target_spec *) outptr; + +		spec->status = 0; +		spec->sector_start = ti->begin; +		spec->length = ti->len; +		strncpy(spec->target_type, ti->type->name, +			sizeof(spec->target_type)); + +		outptr += sizeof(struct dm_target_spec); +		remaining = len - (outptr - outbuf); +		if (remaining <= 0) { +			param->flags |= DM_BUFFER_FULL_FLAG; +			break; +		} + +		/* Get the status/table string from the target driver */ +		if (ti->type->status) { +			if (ti->type->status(ti, type, outptr, remaining)) { +				param->flags |= DM_BUFFER_FULL_FLAG; +				break; +			} +		} else +			outptr[0] = '\0'; + +		outptr += strlen(outptr) + 1; +		used = param->data_start + (outptr - outbuf); + +		outptr = align_ptr(outptr); +		spec->next = outptr - outbuf; +	} + +	if (used) +		param->data_size = used; + +	param->target_count = num_targets; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct mapped_device *md; +	struct dm_table *table; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	/* +	 * Wait for a notification event +	 */ +	if (dm_wait_event(md, param->event_nr)) { +		r = -ERESTARTSYS; +		goto out; +	} + +	/* +	 * The userland program is going to want to know what +	 * changed to trigger the event, so we may as well tell +	 * him and save an ioctl. 
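+	 * So return the device status and, if a table is present, the
+	 * per-target status strings as well.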
+	 */ +	r = __dev_status(md, param); +	if (r) +		goto out; + +	table = dm_get_table(md); +	if (table) { +		retrieve_status(table, param, param_size); +		dm_table_put(table); +	} + + out: +	dm_put(md); +	return r; +} + +static inline int get_mode(struct dm_ioctl *param) +{ +	int mode = FMODE_READ | FMODE_WRITE; + +	if (param->flags & DM_READONLY_FLAG) +		mode = FMODE_READ; + +	return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, +		       struct dm_target_spec **spec, char **target_params) +{ +	*spec = (struct dm_target_spec *) ((unsigned char *) last + next); +	*target_params = (char *) (*spec + 1); + +	if (*spec < (last + 1)) +		return -EINVAL; + +	return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, +			  struct dm_ioctl *param, size_t param_size) +{ +	int r; +	unsigned int i = 0; +	struct dm_target_spec *spec = (struct dm_target_spec *) param; +	uint32_t next = param->data_start; +	void *end = (void *) param + param_size; +	char *target_params; + +	if (!param->target_count) { +		DMWARN("populate_table: no targets specified"); +		return -EINVAL; +	} + +	for (i = 0; i < param->target_count; i++) { + +		r = next_target(spec, next, end, &spec, &target_params); +		if (r) { +			DMWARN("unable to find target"); +			return r; +		} + +		r = dm_table_add_target(table, spec->target_type, +					(sector_t) spec->sector_start, +					(sector_t) spec->length, +					target_params); +		if (r) { +			DMWARN("error adding target to table"); +			return r; +		} + +		next = spec->next; +	} + +	return dm_table_complete(table); +} + +static int table_load(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct hash_cell *hc; +	struct dm_table *t; + +	r = dm_table_create(&t, get_mode(param), param->target_count); +	if (r) +		return r; + +	r = populate_table(t, param, param_size); +	if (r) { +		dm_table_put(t); +		return r; +	} + +	down_write(&_hash_lock); +	hc = __find_device_hash_cell(param); +	if (!hc) { +		DMWARN("device doesn't appear to be in the dev hash table."); +		up_write(&_hash_lock); +		return -ENXIO; +	} + +	if (hc->new_map) +		dm_table_put(hc->new_map); +	hc->new_map = t; +	param->flags |= DM_INACTIVE_PRESENT_FLAG; + +	r = __dev_status(hc->md, param); +	up_write(&_hash_lock); +	return r; +} + +static int table_clear(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct hash_cell *hc; + +	down_write(&_hash_lock); + +	hc = __find_device_hash_cell(param); +	if (!hc) { +		DMWARN("device doesn't appear to be in the dev hash table."); +		up_write(&_hash_lock); +		return -ENXIO; +	} + +	if (hc->new_map) { +		dm_table_put(hc->new_map); +		hc->new_map = NULL; +	} + +	param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + +	r = __dev_status(hc->md, param); +	up_write(&_hash_lock); +	return r; +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static void retrieve_deps(struct dm_table *table, +			  struct dm_ioctl *param, size_t param_size) +{ +	unsigned int count = 0; +	struct list_head *tmp; +	size_t len, needed; +	struct dm_dev *dd; +	struct dm_target_deps *deps; + +	deps = get_result_buffer(param, param_size, &len); + +	/* +	 * Count the devices. +	 */ +	list_for_each (tmp, dm_table_get_devices(table)) +		count++; + +	/* +	 * Check we have enough space. +	 */ +	needed = sizeof(*deps) + (sizeof(*deps->dev) * count); +	if (len < needed) { +		param->flags |= DM_BUFFER_FULL_FLAG; +		return; +	} + +	/* +	 * Fill in the devices. 
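+	 * Each dependency is returned as an encoded block device number.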
+	 */ +	deps->count = count; +	count = 0; +	list_for_each_entry (dd, dm_table_get_devices(table), list) +		deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev); + +	param->data_size = param->data_start + needed; +} + +static int table_deps(struct dm_ioctl *param, size_t param_size) +{ +	int r = 0; +	struct mapped_device *md; +	struct dm_table *table; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	r = __dev_status(md, param); +	if (r) +		goto out; + +	table = dm_get_table(md); +	if (table) { +		retrieve_deps(table, param, param_size); +		dm_table_put(table); +	} + + out: +	dm_put(md); +	return r; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct dm_ioctl *param, size_t param_size) +{ +	int r; +	struct mapped_device *md; +	struct dm_table *table; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	r = __dev_status(md, param); +	if (r) +		goto out; + +	table = dm_get_table(md); +	if (table) { +		retrieve_status(table, param, param_size); +		dm_table_put(table); +	} + + out: +	dm_put(md); +	return r; +} + +/* + * Pass a message to the target that's at the supplied device offset. + */ +static int target_message(struct dm_ioctl *param, size_t param_size) +{ +	int r, argc; +	char **argv; +	struct mapped_device *md; +	struct dm_table *table; +	struct dm_target *ti; +	struct dm_target_msg *tmsg = (void *) param + param->data_start; + +	md = find_device(param); +	if (!md) +		return -ENXIO; + +	r = __dev_status(md, param); +	if (r) +		goto out; + +	if (tmsg < (struct dm_target_msg *) (param + 1) || +	    invalid_str(tmsg->message, (void *) param + param_size)) { +		DMWARN("Invalid target message parameters."); +		r = -EINVAL; +		goto out; +	} + +	r = dm_split_args(&argc, &argv, tmsg->message); +	if (r) { +		DMWARN("Failed to split target message parameters"); +		goto out; +	} + +	table = dm_get_table(md); +	if (!table) +		goto out_argv; + +	if (tmsg->sector >= dm_table_get_size(table)) { +		DMWARN("Target message sector outside device."); +		r = -EINVAL; +		goto out_table; +	} + +	ti = dm_table_find_target(table, tmsg->sector); +	if (ti->type->message) +		r = ti->type->message(ti, argc, argv); +	else { +		DMWARN("Target type does not support messages"); +		r = -EINVAL; +	} + + out_table: +	dm_table_put(table); + out_argv: +	kfree(argv); + out: +	param->data_size = 0; +	dm_put(md); +	return r; +} + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. + *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ +	static struct { +		int cmd; +		ioctl_fn fn; +	} _ioctls[] = { +		{DM_VERSION_CMD, NULL},	/* version is dealt with elsewhere */ +		{DM_REMOVE_ALL_CMD, remove_all}, +		{DM_LIST_DEVICES_CMD, list_devices}, + +		{DM_DEV_CREATE_CMD, dev_create}, +		{DM_DEV_REMOVE_CMD, dev_remove}, +		{DM_DEV_RENAME_CMD, dev_rename}, +		{DM_DEV_SUSPEND_CMD, dev_suspend}, +		{DM_DEV_STATUS_CMD, dev_status}, +		{DM_DEV_WAIT_CMD, dev_wait}, + +		{DM_TABLE_LOAD_CMD, table_load}, +		{DM_TABLE_CLEAR_CMD, table_clear}, +		{DM_TABLE_DEPS_CMD, table_deps}, +		{DM_TABLE_STATUS_CMD, table_status}, + +		{DM_LIST_VERSIONS_CMD, list_versions}, + +		{DM_TARGET_MSG_CMD, target_message} +	}; + +	return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. 
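+ * That way userspace can see which interface version the kernel
+ * speaks even when the check fails.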
+ */
+static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+{
+	uint32_t version[3];
+	int r = 0;
+
+	if (copy_from_user(version, user->version, sizeof(version)))
+		return -EFAULT;
+
+	if ((DM_VERSION_MAJOR != version[0]) ||
+	    (DM_VERSION_MINOR < version[1])) {
+		DMWARN("ioctl interface mismatch: "
+		       "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
+		       DM_VERSION_MAJOR, DM_VERSION_MINOR,
+		       DM_VERSION_PATCHLEVEL,
+		       version[0], version[1], version[2], cmd);
+		r = -EINVAL;
+	}
+
+	/*
+	 * Fill in the kernel version.
+	 */
+	version[0] = DM_VERSION_MAJOR;
+	version[1] = DM_VERSION_MINOR;
+	version[2] = DM_VERSION_PATCHLEVEL;
+	if (copy_to_user(user->version, version, sizeof(version)))
+		return -EFAULT;
+
+	return r;
+}
+
+static void free_params(struct dm_ioctl *param)
+{
+	vfree(param);
+}
+
+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
+{
+	struct dm_ioctl tmp, *dmi;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	if (tmp.data_size < sizeof(tmp))
+		return -EINVAL;
+
+	dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
+	if (!dmi)
+		return -ENOMEM;
+
+	if (copy_from_user(dmi, user, tmp.data_size)) {
+		vfree(dmi);
+		return -EFAULT;
+	}
+
+	*param = dmi;
+	return 0;
+}
+
+static int validate_params(uint cmd, struct dm_ioctl *param)
+{
+	/* Always clear this flag */
+	param->flags &= ~DM_BUFFER_FULL_FLAG;
+
+	/* Ignores parameters */
+	if (cmd == DM_REMOVE_ALL_CMD ||
+	    cmd == DM_LIST_DEVICES_CMD ||
+	    cmd == DM_LIST_VERSIONS_CMD)
+		return 0;
+
+	if ((cmd == DM_DEV_CREATE_CMD)) {
+		if (!*param->name) {
+			DMWARN("name not supplied when creating device");
+			return -EINVAL;
+		}
+	} else if ((*param->uuid && *param->name)) {
+		DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
+		return -EINVAL;
+	}
+
+	/* Ensure strings are terminated */
+	param->name[DM_NAME_LEN - 1] = '\0';
+	param->uuid[DM_UUID_LEN - 1] = '\0';
+
+	return 0;
+}
+
+static int ctl_ioctl(struct inode *inode, struct file *file,
+		     uint command, ulong u)
+{
+	int r = 0;
+	unsigned int cmd;
+	struct dm_ioctl *param;
+	struct dm_ioctl __user *user = (struct dm_ioctl __user *) u;
+	ioctl_fn fn = NULL;
+	size_t param_size;
+
+	/* only root can play with this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (_IOC_TYPE(command) != DM_IOCTL)
+		return -ENOTTY;
+
+	cmd = _IOC_NR(command);
+
+	/*
+	 * Check the interface version passed in.  This also
+	 * writes out the kernel's interface version.
+	 */
+	r = check_version(cmd, user);
+	if (r)
+		return r;
+
+	/*
+	 * Nothing more to do for the version command.
+	 */
+	if (cmd == DM_VERSION_CMD)
+		return 0;
+
+	fn = lookup_ioctl(cmd);
+	if (!fn) {
+		DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
+		return -ENOTTY;
+	}
+
+	/*
+	 * Trying to avoid low memory issues when a device is
+	 * suspended.
+	 */
+	current->flags |= PF_MEMALLOC;
+
+	/*
+	 * Copy the parameters into kernel space.
+	 */
+	r = copy_params(user, &param);
+	if (r) {
+		current->flags &= ~PF_MEMALLOC;
+		return r;
+	}
+
+	/*
+	 * FIXME: eventually we will remove the PF_MEMALLOC flag
+	 * here.  However the tools still do nasty things like
+	 * 'load' while a device is suspended.
+	 */
+
+	r = validate_params(cmd, param);
+	if (r)
+		goto out;
+
+	param_size = param->data_size;
+	param->data_size = sizeof(*param);
+	r = fn(param, param_size);
+
+	/*
+	 * Copy the results back to userland.
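+	 * Only param->data_size bytes, as set by the ioctl function, are
+	 * copied back rather than the whole input buffer.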
+	 */ +	if (!r && copy_to_user(user, param, param->data_size)) +		r = -EFAULT; + + out: +	free_params(param); +	current->flags &= ~PF_MEMALLOC; +	return r; +} + +static struct file_operations _ctl_fops = { +	.ioctl	 = ctl_ioctl, +	.owner	 = THIS_MODULE, +}; + +static struct miscdevice _dm_misc = { +	.minor 		= MISC_DYNAMIC_MINOR, +	.name  		= DM_NAME, +	.devfs_name 	= "mapper/control", +	.fops  		= &_ctl_fops +}; + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ +	int r; + +	r = dm_hash_init(); +	if (r) +		return r; + +	r = misc_register(&_dm_misc); +	if (r) { +		DMERR("misc_register failed for control device"); +		dm_hash_exit(); +		return r; +	} + +	DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, +	       DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, +	       DM_DRIVER_EMAIL); +	return 0; +} + +void dm_interface_exit(void) +{ +	if (misc_deregister(&_dm_misc) < 0) +		DMERR("misc_deregister failed for control device"); + +	dm_hash_exit(); +} diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c new file mode 100644 index 00000000000..6a2cd5dc8a6 --- /dev/null +++ b/drivers/md/dm-linear.c @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +/* + * Linear: maps a linear range of a device. + */ +struct linear_c { +	struct dm_dev *dev; +	sector_t start; +}; + +/* + * Construct a linear mapping: <dev_path> <offset> + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	struct linear_c *lc; + +	if (argc != 2) { +		ti->error = "dm-linear: Invalid argument count"; +		return -EINVAL; +	} + +	lc = kmalloc(sizeof(*lc), GFP_KERNEL); +	if (lc == NULL) { +		ti->error = "dm-linear: Cannot allocate linear context"; +		return -ENOMEM; +	} + +	if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { +		ti->error = "dm-linear: Invalid device sector"; +		goto bad; +	} + +	if (dm_get_device(ti, argv[0], lc->start, ti->len, +			  dm_table_get_mode(ti->table), &lc->dev)) { +		ti->error = "dm-linear: Device lookup failed"; +		goto bad; +	} + +	ti->private = lc; +	return 0; + +      bad: +	kfree(lc); +	return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ +	struct linear_c *lc = (struct linear_c *) ti->private; + +	dm_put_device(ti, lc->dev); +	kfree(lc); +} + +static int linear_map(struct dm_target *ti, struct bio *bio, +		      union map_info *map_context) +{ +	struct linear_c *lc = (struct linear_c *) ti->private; + +	bio->bi_bdev = lc->dev->bdev; +	bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); + +	return 1; +} + +static int linear_status(struct dm_target *ti, status_type_t type, +			 char *result, unsigned int maxlen) +{ +	struct linear_c *lc = (struct linear_c *) ti->private; + +	switch (type) { +	case STATUSTYPE_INFO: +		result[0] = '\0'; +		break; + +	case STATUSTYPE_TABLE: +		snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, +			 lc->start); +		break; +	} +	return 0; +} + +static struct target_type linear_target = { +	.name   = "linear", +	.version= {1, 0, 1}, +	.module = THIS_MODULE, +	.ctr    = linear_ctr, +	.dtr    = linear_dtr, +	.map    = linear_map, +	.status = linear_status, +}; + +int __init dm_linear_init(void) +{ +	int r = dm_register_target(&linear_target); + +	if (r < 0) +		DMERR("linear: register failed %d", r); 
+ +	return r; +} + +void dm_linear_exit(void) +{ +	int r = dm_unregister_target(&linear_target); + +	if (r < 0) +		DMERR("linear: unregister failed %d", r); +} diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c new file mode 100644 index 00000000000..e110655eabd --- /dev/null +++ b/drivers/md/dm-log.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#include "dm-log.h" +#include "dm-io.h" + +static LIST_HEAD(_log_types); +static DEFINE_SPINLOCK(_lock); + +int dm_register_dirty_log_type(struct dirty_log_type *type) +{ +	spin_lock(&_lock); +	type->use_count = 0; +	list_add(&type->list, &_log_types); +	spin_unlock(&_lock); + +	return 0; +} + +int dm_unregister_dirty_log_type(struct dirty_log_type *type) +{ +	spin_lock(&_lock); + +	if (type->use_count) +		DMWARN("Attempt to unregister a log type that is still in use"); +	else +		list_del(&type->list); + +	spin_unlock(&_lock); + +	return 0; +} + +static struct dirty_log_type *get_type(const char *type_name) +{ +	struct dirty_log_type *type; + +	spin_lock(&_lock); +	list_for_each_entry (type, &_log_types, list) +		if (!strcmp(type_name, type->name)) { +			if (!type->use_count && !try_module_get(type->module)){ +				spin_unlock(&_lock); +				return NULL; +			} +			type->use_count++; +			spin_unlock(&_lock); +			return type; +		} + +	spin_unlock(&_lock); +	return NULL; +} + +static void put_type(struct dirty_log_type *type) +{ +	spin_lock(&_lock); +	if (!--type->use_count) +		module_put(type->module); +	spin_unlock(&_lock); +} + +struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, +				      unsigned int argc, char **argv) +{ +	struct dirty_log_type *type; +	struct dirty_log *log; + +	log = kmalloc(sizeof(*log), GFP_KERNEL); +	if (!log) +		return NULL; + +	type = get_type(type_name); +	if (!type) { +		kfree(log); +		return NULL; +	} + +	log->type = type; +	if (type->ctr(log, ti, argc, argv)) { +		kfree(log); +		put_type(type); +		return NULL; +	} + +	return log; +} + +void dm_destroy_dirty_log(struct dirty_log *log) +{ +	log->type->dtr(log); +	put_type(log->type); +	kfree(log); +} + +/*----------------------------------------------------------------- + * Persistent and core logs share a lot of their implementation. + * FIXME: need a reload method to be called from a resume + *---------------------------------------------------------------*/ +/* + * Magic for persistent mirrors: "MiRr" + */ +#define MIRROR_MAGIC 0x4D695272 + +/* + * The on-disk version of the metadata. + */ +#define MIRROR_DISK_VERSION 1 +#define LOG_OFFSET 2 + +struct log_header { +	uint32_t magic; + +	/* +	 * Simple, incrementing version. no backward +	 * compatibility. 
+	 */ +	uint32_t version; +	sector_t nr_regions; +}; + +struct log_c { +	struct dm_target *ti; +	int touched; +	uint32_t region_size; +	unsigned int region_count; +	region_t sync_count; + +	unsigned bitset_uint32_count; +	uint32_t *clean_bits; +	uint32_t *sync_bits; +	uint32_t *recovering_bits;	/* FIXME: this seems excessive */ + +	int sync_search; + +	/* Resync flag */ +	enum sync { +		DEFAULTSYNC,	/* Synchronize if necessary */ +		NOSYNC,		/* Devices known to be already in sync */ +		FORCESYNC,	/* Force a sync to happen */ +	} sync; + +	/* +	 * Disk log fields +	 */ +	struct dm_dev *log_dev; +	struct log_header header; + +	struct io_region header_location; +	struct log_header *disk_header; + +	struct io_region bits_location; +	uint32_t *disk_bits; +}; + +/* + * The touched member needs to be updated every time we access + * one of the bitsets. + */ +static  inline int log_test_bit(uint32_t *bs, unsigned bit) +{ +	return test_bit(bit, (unsigned long *) bs) ? 1 : 0; +} + +static inline void log_set_bit(struct log_c *l, +			       uint32_t *bs, unsigned bit) +{ +	set_bit(bit, (unsigned long *) bs); +	l->touched = 1; +} + +static inline void log_clear_bit(struct log_c *l, +				 uint32_t *bs, unsigned bit) +{ +	clear_bit(bit, (unsigned long *) bs); +	l->touched = 1; +} + +/*---------------------------------------------------------------- + * Header IO + *--------------------------------------------------------------*/ +static void header_to_disk(struct log_header *core, struct log_header *disk) +{ +	disk->magic = cpu_to_le32(core->magic); +	disk->version = cpu_to_le32(core->version); +	disk->nr_regions = cpu_to_le64(core->nr_regions); +} + +static void header_from_disk(struct log_header *core, struct log_header *disk) +{ +	core->magic = le32_to_cpu(disk->magic); +	core->version = le32_to_cpu(disk->version); +	core->nr_regions = le64_to_cpu(disk->nr_regions); +} + +static int read_header(struct log_c *log) +{ +	int r; +	unsigned long ebits; + +	r = dm_io_sync_vm(1, &log->header_location, READ, +			  log->disk_header, &ebits); +	if (r) +		return r; + +	header_from_disk(&log->header, log->disk_header); + +	/* New log required? 
*/ +	if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { +		log->header.magic = MIRROR_MAGIC; +		log->header.version = MIRROR_DISK_VERSION; +		log->header.nr_regions = 0; +	} + +	if (log->header.version != MIRROR_DISK_VERSION) { +		DMWARN("incompatible disk log version"); +		return -EINVAL; +	} + +	return 0; +} + +static inline int write_header(struct log_c *log) +{ +	unsigned long ebits; + +	header_to_disk(&log->header, log->disk_header); +	return dm_io_sync_vm(1, &log->header_location, WRITE, +			     log->disk_header, &ebits); +} + +/*---------------------------------------------------------------- + * Bits IO + *--------------------------------------------------------------*/ +static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) +{ +	unsigned i; + +	for (i = 0; i < count; i++) +		core[i] = le32_to_cpu(disk[i]); +} + +static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) +{ +	unsigned i; + +	/* copy across the clean/dirty bitset */ +	for (i = 0; i < count; i++) +		disk[i] = cpu_to_le32(core[i]); +} + +static int read_bits(struct log_c *log) +{ +	int r; +	unsigned long ebits; + +	r = dm_io_sync_vm(1, &log->bits_location, READ, +			  log->disk_bits, &ebits); +	if (r) +		return r; + +	bits_to_core(log->clean_bits, log->disk_bits, +		     log->bitset_uint32_count); +	return 0; +} + +static int write_bits(struct log_c *log) +{ +	unsigned long ebits; +	bits_to_disk(log->clean_bits, log->disk_bits, +		     log->bitset_uint32_count); +	return dm_io_sync_vm(1, &log->bits_location, WRITE, +			     log->disk_bits, &ebits); +} + +/*---------------------------------------------------------------- + * core log constructor/destructor + * + * argv contains region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +#define BYTE_SHIFT 3 +static int core_ctr(struct dirty_log *log, struct dm_target *ti, +		    unsigned int argc, char **argv) +{ +	enum sync sync = DEFAULTSYNC; + +	struct log_c *lc; +	uint32_t region_size; +	unsigned int region_count; +	size_t bitset_size; + +	if (argc < 1 || argc > 2) { +		DMWARN("wrong number of arguments to mirror log"); +		return -EINVAL; +	} + +	if (argc > 1) { +		if (!strcmp(argv[1], "sync")) +			sync = FORCESYNC; +		else if (!strcmp(argv[1], "nosync")) +			sync = NOSYNC; +		else { +			DMWARN("unrecognised sync argument to mirror log: %s", +			       argv[1]); +			return -EINVAL; +		} +	} + +	if (sscanf(argv[0], "%u", ®ion_size) != 1) { +		DMWARN("invalid region size string"); +		return -EINVAL; +	} + +	region_count = dm_sector_div_up(ti->len, region_size); + +	lc = kmalloc(sizeof(*lc), GFP_KERNEL); +	if (!lc) { +		DMWARN("couldn't allocate core log"); +		return -ENOMEM; +	} + +	lc->ti = ti; +	lc->touched = 0; +	lc->region_size = region_size; +	lc->region_count = region_count; +	lc->sync = sync; + +	/* +	 * Work out how many words we need to hold the bitset. +	 */ +	bitset_size = dm_round_up(region_count, +				  sizeof(*lc->clean_bits) << BYTE_SHIFT); +	bitset_size >>= BYTE_SHIFT; + +	lc->bitset_uint32_count = bitset_size / 4; +	lc->clean_bits = vmalloc(bitset_size); +	if (!lc->clean_bits) { +		DMWARN("couldn't allocate clean bitset"); +		kfree(lc); +		return -ENOMEM; +	} +	memset(lc->clean_bits, -1, bitset_size); + +	lc->sync_bits = vmalloc(bitset_size); +	if (!lc->sync_bits) { +		DMWARN("couldn't allocate sync bitset"); +		vfree(lc->clean_bits); +		kfree(lc); +		return -ENOMEM; +	} +	memset(lc->sync_bits, (sync == NOSYNC) ? 
-1 : 0, bitset_size); +	lc->sync_count = (sync == NOSYNC) ? region_count : 0; + +	lc->recovering_bits = vmalloc(bitset_size); +	if (!lc->recovering_bits) { +		DMWARN("couldn't allocate sync bitset"); +		vfree(lc->sync_bits); +		vfree(lc->clean_bits); +		kfree(lc); +		return -ENOMEM; +	} +	memset(lc->recovering_bits, 0, bitset_size); +	lc->sync_search = 0; +	log->context = lc; +	return 0; +} + +static void core_dtr(struct dirty_log *log) +{ +	struct log_c *lc = (struct log_c *) log->context; +	vfree(lc->clean_bits); +	vfree(lc->sync_bits); +	vfree(lc->recovering_bits); +	kfree(lc); +} + +/*---------------------------------------------------------------- + * disk log constructor/destructor + * + * argv contains log_device region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +static int disk_ctr(struct dirty_log *log, struct dm_target *ti, +		    unsigned int argc, char **argv) +{ +	int r; +	size_t size; +	struct log_c *lc; +	struct dm_dev *dev; + +	if (argc < 2 || argc > 3) { +		DMWARN("wrong number of arguments to disk mirror log"); +		return -EINVAL; +	} + +	r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, +			  FMODE_READ | FMODE_WRITE, &dev); +	if (r) +		return r; + +	r = core_ctr(log, ti, argc - 1, argv + 1); +	if (r) { +		dm_put_device(ti, dev); +		return r; +	} + +	lc = (struct log_c *) log->context; +	lc->log_dev = dev; + +	/* setup the disk header fields */ +	lc->header_location.bdev = lc->log_dev->bdev; +	lc->header_location.sector = 0; +	lc->header_location.count = 1; + +	/* +	 * We can't read less than this amount, even though we'll +	 * not be using most of this space. +	 */ +	lc->disk_header = vmalloc(1 << SECTOR_SHIFT); +	if (!lc->disk_header) +		goto bad; + +	/* setup the disk bitset fields */ +	lc->bits_location.bdev = lc->log_dev->bdev; +	lc->bits_location.sector = LOG_OFFSET; + +	size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), +			   1 << SECTOR_SHIFT); +	lc->bits_location.count = size >> SECTOR_SHIFT; +	lc->disk_bits = vmalloc(size); +	if (!lc->disk_bits) { +		vfree(lc->disk_header); +		goto bad; +	} +	return 0; + + bad: +	dm_put_device(ti, lc->log_dev); +	core_dtr(log); +	return -ENOMEM; +} + +static void disk_dtr(struct dirty_log *log) +{ +	struct log_c *lc = (struct log_c *) log->context; +	dm_put_device(lc->ti, lc->log_dev); +	vfree(lc->disk_header); +	vfree(lc->disk_bits); +	core_dtr(log); +} + +static int count_bits32(uint32_t *addr, unsigned size) +{ +	int count = 0, i; + +	for (i = 0; i < size; i++) { +		count += hweight32(*(addr+i)); +	} +	return count; +} + +static int disk_resume(struct dirty_log *log) +{ +	int r; +	unsigned i; +	struct log_c *lc = (struct log_c *) log->context; +	size_t size = lc->bitset_uint32_count * sizeof(uint32_t); + +	/* read the disk header */ +	r = read_header(lc); +	if (r) +		return r; + +	/* read the bits */ +	r = read_bits(lc); +	if (r) +		return r; + +	/* set or clear any new bits */ +	if (lc->sync == NOSYNC) +		for (i = lc->header.nr_regions; i < lc->region_count; i++) +			/* FIXME: amazingly inefficient */ +			log_set_bit(lc, lc->clean_bits, i); +	else +		for (i = lc->header.nr_regions; i < lc->region_count; i++) +			/* FIXME: amazingly inefficient */ +			log_clear_bit(lc, lc->clean_bits, i); + +	/* copy clean across to sync */ +	memcpy(lc->sync_bits, lc->clean_bits, size); +	lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); + +	/* write the bits */ +	r = write_bits(lc); +	if (r) +		return r; + +	/* set the correct number of regions in 
the header */ +	lc->header.nr_regions = lc->region_count; + +	/* write the new header */ +	return write_header(lc); +} + +static uint32_t core_get_region_size(struct dirty_log *log) +{ +	struct log_c *lc = (struct log_c *) log->context; +	return lc->region_size; +} + +static int core_is_clean(struct dirty_log *log, region_t region) +{ +	struct log_c *lc = (struct log_c *) log->context; +	return log_test_bit(lc->clean_bits, region); +} + +static int core_in_sync(struct dirty_log *log, region_t region, int block) +{ +	struct log_c *lc = (struct log_c *) log->context; +	return log_test_bit(lc->sync_bits, region); +} + +static int core_flush(struct dirty_log *log) +{ +	/* no op */ +	return 0; +} + +static int disk_flush(struct dirty_log *log) +{ +	int r; +	struct log_c *lc = (struct log_c *) log->context; + +	/* only write if the log has changed */ +	if (!lc->touched) +		return 0; + +	r = write_bits(lc); +	if (!r) +		lc->touched = 0; + +	return r; +} + +static void core_mark_region(struct dirty_log *log, region_t region) +{ +	struct log_c *lc = (struct log_c *) log->context; +	log_clear_bit(lc, lc->clean_bits, region); +} + +static void core_clear_region(struct dirty_log *log, region_t region) +{ +	struct log_c *lc = (struct log_c *) log->context; +	log_set_bit(lc, lc->clean_bits, region); +} + +static int core_get_resync_work(struct dirty_log *log, region_t *region) +{ +	struct log_c *lc = (struct log_c *) log->context; + +	if (lc->sync_search >= lc->region_count) +		return 0; + +	do { +		*region = find_next_zero_bit((unsigned long *) lc->sync_bits, +					     lc->region_count, +					     lc->sync_search); +		lc->sync_search = *region + 1; + +		if (*region == lc->region_count) +			return 0; + +	} while (log_test_bit(lc->recovering_bits, *region)); + +	log_set_bit(lc, lc->recovering_bits, *region); +	return 1; +} + +static void core_complete_resync_work(struct dirty_log *log, region_t region, +				      int success) +{ +	struct log_c *lc = (struct log_c *) log->context; + +	log_clear_bit(lc, lc->recovering_bits, region); +	if (success) { +		log_set_bit(lc, lc->sync_bits, region); +                lc->sync_count++; +        } +} + +static region_t core_get_sync_count(struct dirty_log *log) +{ +        struct log_c *lc = (struct log_c *) log->context; + +        return lc->sync_count; +} + +#define	DMEMIT_SYNC \ +	if (lc->sync != DEFAULTSYNC) \ +		DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") + +static int core_status(struct dirty_log *log, status_type_t status, +		       char *result, unsigned int maxlen) +{ +	int sz = 0; +	struct log_c *lc = log->context; + +	switch(status) { +	case STATUSTYPE_INFO: +		break; + +	case STATUSTYPE_TABLE: +		DMEMIT("%s %u %u ", log->type->name, +		       lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); +		DMEMIT_SYNC; +	} + +	return sz; +} + +static int disk_status(struct dirty_log *log, status_type_t status, +		       char *result, unsigned int maxlen) +{ +	int sz = 0; +	char buffer[16]; +	struct log_c *lc = log->context; + +	switch(status) { +	case STATUSTYPE_INFO: +		break; + +	case STATUSTYPE_TABLE: +		format_dev_t(buffer, lc->log_dev->bdev->bd_dev); +		DMEMIT("%s %u %s %u ", log->type->name, +		       lc->sync == DEFAULTSYNC ? 
2 : 3, buffer, +		       lc->region_size); +		DMEMIT_SYNC; +	} + +	return sz; +} + +static struct dirty_log_type _core_type = { +	.name = "core", +	.module = THIS_MODULE, +	.ctr = core_ctr, +	.dtr = core_dtr, +	.get_region_size = core_get_region_size, +	.is_clean = core_is_clean, +	.in_sync = core_in_sync, +	.flush = core_flush, +	.mark_region = core_mark_region, +	.clear_region = core_clear_region, +	.get_resync_work = core_get_resync_work, +	.complete_resync_work = core_complete_resync_work, +	.get_sync_count = core_get_sync_count, +	.status = core_status, +}; + +static struct dirty_log_type _disk_type = { +	.name = "disk", +	.module = THIS_MODULE, +	.ctr = disk_ctr, +	.dtr = disk_dtr, +	.suspend = disk_flush, +	.resume = disk_resume, +	.get_region_size = core_get_region_size, +	.is_clean = core_is_clean, +	.in_sync = core_in_sync, +	.flush = disk_flush, +	.mark_region = core_mark_region, +	.clear_region = core_clear_region, +	.get_resync_work = core_get_resync_work, +	.complete_resync_work = core_complete_resync_work, +	.get_sync_count = core_get_sync_count, +	.status = disk_status, +}; + +int __init dm_dirty_log_init(void) +{ +	int r; + +	r = dm_register_dirty_log_type(&_core_type); +	if (r) +		DMWARN("couldn't register core log"); + +	r = dm_register_dirty_log_type(&_disk_type); +	if (r) { +		DMWARN("couldn't register disk type"); +		dm_unregister_dirty_log_type(&_core_type); +	} + +	return r; +} + +void dm_dirty_log_exit(void) +{ +	dm_unregister_dirty_log_type(&_disk_type); +	dm_unregister_dirty_log_type(&_core_type); +} + +EXPORT_SYMBOL(dm_register_dirty_log_type); +EXPORT_SYMBOL(dm_unregister_dirty_log_type); +EXPORT_SYMBOL(dm_create_dirty_log); +EXPORT_SYMBOL(dm_destroy_dirty_log); diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h new file mode 100644 index 00000000000..5ae5309ebf2 --- /dev/null +++ b/drivers/md/dm-log.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DIRTY_LOG +#define DM_DIRTY_LOG + +#include "dm.h" + +typedef sector_t region_t; + +struct dirty_log_type; + +struct dirty_log { +	struct dirty_log_type *type; +	void *context; +}; + +struct dirty_log_type { +	struct list_head list; +	const char *name; +	struct module *module; +	unsigned int use_count; + +	int (*ctr)(struct dirty_log *log, struct dm_target *ti, +		   unsigned int argc, char **argv); +	void (*dtr)(struct dirty_log *log); + +	/* +	 * There are times when we don't want the log to touch +	 * the disk. +	 */ +	int (*suspend)(struct dirty_log *log); +	int (*resume)(struct dirty_log *log); + +	/* +	 * Retrieves the smallest size of region that the log can +	 * deal with. +	 */ +	uint32_t (*get_region_size)(struct dirty_log *log); + +        /* +	 * A predicate to say whether a region is clean or not. +	 * May block. +	 */ +	int (*is_clean)(struct dirty_log *log, region_t region); + +	/* +	 *  Returns: 0, 1, -EWOULDBLOCK, < 0 +	 * +	 * A predicate function to check the area given by +	 * [sector, sector + len) is in sync. +	 * +	 * If -EWOULDBLOCK is returned the state of the region is +	 * unknown, typically this will result in a read being +	 * passed to a daemon to deal with, since a daemon is +	 * allowed to block. +	 */ +	int (*in_sync)(struct dirty_log *log, region_t region, int can_block); + +	/* +	 * Flush the current log state (eg, to disk).  This +	 * function may block. +	 */ +	int (*flush)(struct dirty_log *log); + +	/* +	 * Mark an area as clean or dirty.  
These functions may +	 * block, though for performance reasons blocking should +	 * be extremely rare (eg, allocating another chunk of +	 * memory for some reason). +	 */ +	void (*mark_region)(struct dirty_log *log, region_t region); +	void (*clear_region)(struct dirty_log *log, region_t region); + +	/* +	 * Returns: <0 (error), 0 (no region), 1 (region) +	 * +	 * The mirrord will need perform recovery on regions of +	 * the mirror that are in the NOSYNC state.  This +	 * function asks the log to tell the caller about the +	 * next region that this machine should recover. +	 * +	 * Do not confuse this function with 'in_sync()', one +	 * tells you if an area is synchronised, the other +	 * assigns recovery work. +	*/ +	int (*get_resync_work)(struct dirty_log *log, region_t *region); + +	/* +	 * This notifies the log that the resync of an area has +	 * been completed.  The log should then mark this region +	 * as CLEAN. +	 */ +	void (*complete_resync_work)(struct dirty_log *log, +				     region_t region, int success); + +        /* +	 * Returns the number of regions that are in sync. +         */ +        region_t (*get_sync_count)(struct dirty_log *log); + +	/* +	 * Support function for mirror status requests. +	 */ +	int (*status)(struct dirty_log *log, status_type_t status_type, +		      char *result, unsigned int maxlen); +}; + +int dm_register_dirty_log_type(struct dirty_log_type *type); +int dm_unregister_dirty_log_type(struct dirty_log_type *type); + + +/* + * Make sure you use these two functions, rather than calling + * type->constructor/destructor() directly. + */ +struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, +				      unsigned int argc, char **argv); +void dm_destroy_dirty_log(struct dirty_log *log); + +/* + * init/exit functions. + */ +int dm_dirty_log_init(void); +void dm_dirty_log_exit(void); + +#endif diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c new file mode 100644 index 00000000000..43763a0bd09 --- /dev/null +++ b/drivers/md/dm-mpath.c @@ -0,0 +1,1302 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-path-selector.h" +#include "dm-hw-handler.h" +#include "dm-bio-list.h" +#include "dm-bio-record.h" + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/workqueue.h> +#include <asm/atomic.h> + +#define MESG_STR(x) x, sizeof(x) + +/* Path properties */ +struct pgpath { +	struct list_head list; + +	struct priority_group *pg;	/* Owning PG */ +	unsigned fail_count;		/* Cumulative failure count */ + +	struct path path; +}; + +#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) + +/* + * Paths are grouped into Priority Groups and numbered from 1 upwards. + * Each has a path selector which controls which path gets used. + */ +struct priority_group { +	struct list_head list; + +	struct multipath *m;		/* Owning multipath instance */ +	struct path_selector ps; + +	unsigned pg_num;		/* Reference number */ +	unsigned bypassed;		/* Temporarily bypass this PG? 
*/ + +	unsigned nr_pgpaths;		/* Number of paths in PG */ +	struct list_head pgpaths; +}; + +/* Multipath context */ +struct multipath { +	struct list_head list; +	struct dm_target *ti; + +	spinlock_t lock; + +	struct hw_handler hw_handler; +	unsigned nr_priority_groups; +	struct list_head priority_groups; +	unsigned pg_init_required;	/* pg_init needs calling? */ + +	unsigned nr_valid_paths;	/* Total number of usable paths */ +	struct pgpath *current_pgpath; +	struct priority_group *current_pg; +	struct priority_group *next_pg;	/* Switch to this PG if set */ +	unsigned repeat_count;		/* I/Os left before calling PS again */ + +	unsigned queue_io;		/* Must we queue all I/O? */ +	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */ +	unsigned suspended;		/* Has dm core suspended our I/O? */ + +	struct work_struct process_queued_ios; +	struct bio_list queued_ios; +	unsigned queue_size; + +	struct work_struct trigger_event; + +	/* +	 * We must use a mempool of mpath_io structs so that we +	 * can resubmit bios on error. +	 */ +	mempool_t *mpio_pool; +}; + +/* + * Context information attached to each bio we process. + */ +struct mpath_io { +	struct pgpath *pgpath; +	struct dm_bio_details details; +}; + +typedef int (*action_fn) (struct pgpath *pgpath); + +#define MIN_IOS 256	/* Mempool size */ + +static kmem_cache_t *_mpio_cache; + +static void process_queued_ios(void *data); +static void trigger_event(void *data); + + +/*----------------------------------------------- + * Allocation routines + *-----------------------------------------------*/ + +static struct pgpath *alloc_pgpath(void) +{ +	struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + +	if (pgpath) { +		memset(pgpath, 0, sizeof(*pgpath)); +		pgpath->path.is_active = 1; +	} + +	return pgpath; +} + +static inline void free_pgpath(struct pgpath *pgpath) +{ +	kfree(pgpath); +} + +static struct priority_group *alloc_priority_group(void) +{ +	struct priority_group *pg; + +	pg = kmalloc(sizeof(*pg), GFP_KERNEL); +	if (!pg) +		return NULL; + +	memset(pg, 0, sizeof(*pg)); +	INIT_LIST_HEAD(&pg->pgpaths); + +	return pg; +} + +static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) +{ +	struct pgpath *pgpath, *tmp; + +	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { +		list_del(&pgpath->list); +		dm_put_device(ti, pgpath->path.dev); +		free_pgpath(pgpath); +	} +} + +static void free_priority_group(struct priority_group *pg, +				struct dm_target *ti) +{ +	struct path_selector *ps = &pg->ps; + +	if (ps->type) { +		ps->type->destroy(ps); +		dm_put_path_selector(ps->type); +	} + +	free_pgpaths(&pg->pgpaths, ti); +	kfree(pg); +} + +static struct multipath *alloc_multipath(void) +{ +	struct multipath *m; + +	m = kmalloc(sizeof(*m), GFP_KERNEL); +	if (m) { +		memset(m, 0, sizeof(*m)); +		INIT_LIST_HEAD(&m->priority_groups); +		spin_lock_init(&m->lock); +		m->queue_io = 1; +		INIT_WORK(&m->process_queued_ios, process_queued_ios, m); +		INIT_WORK(&m->trigger_event, trigger_event, m); +		m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, +					      mempool_free_slab, _mpio_cache); +		if (!m->mpio_pool) { +			kfree(m); +			return NULL; +		} +	} + +	return m; +} + +static void free_multipath(struct multipath *m) +{ +	struct priority_group *pg, *tmp; +	struct hw_handler *hwh = &m->hw_handler; + +	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { +		list_del(&pg->list); +		free_priority_group(pg, m->ti); +	} + +	if (hwh->type) { +		hwh->type->destroy(hwh); +		dm_put_hw_handler(hwh->type); +	} + 
+	mempool_destroy(m->mpio_pool); +	kfree(m); +} + + +/*----------------------------------------------- + * Path selection + *-----------------------------------------------*/ + +static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +{ +	struct hw_handler *hwh = &m->hw_handler; + +	m->current_pg = pgpath->pg; + +	/* Must we initialise the PG first, and queue I/O till it's ready? */ +	if (hwh->type && hwh->type->pg_init) { +		m->pg_init_required = 1; +		m->queue_io = 1; +	} else { +		m->pg_init_required = 0; +		m->queue_io = 0; +	} +} + +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +{ +	struct path *path; + +	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); +	if (!path) +		return -ENXIO; + +	m->current_pgpath = path_to_pgpath(path); + +	if (m->current_pg != pg) +		__switch_pg(m, m->current_pgpath); + +	return 0; +} + +static void __choose_pgpath(struct multipath *m) +{ +	struct priority_group *pg; +	unsigned bypassed = 1; + +	if (!m->nr_valid_paths) +		goto failed; + +	/* Were we instructed to switch PG? */ +	if (m->next_pg) { +		pg = m->next_pg; +		m->next_pg = NULL; +		if (!__choose_path_in_pg(m, pg)) +			return; +	} + +	/* Don't change PG until it has no remaining paths */ +	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) +		return; + +	/* +	 * Loop through priority groups until we find a valid path. +	 * First time we skip PGs marked 'bypassed'. +	 * Second time we only try the ones we skipped. +	 */ +	do { +		list_for_each_entry(pg, &m->priority_groups, list) { +			if (pg->bypassed == bypassed) +				continue; +			if (!__choose_path_in_pg(m, pg)) +				return; +		} +	} while (bypassed--); + +failed: +	m->current_pgpath = NULL; +	m->current_pg = NULL; +} + +static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, +		  unsigned was_queued) +{ +	int r = 1; +	unsigned long flags; +	struct pgpath *pgpath; + +	spin_lock_irqsave(&m->lock, flags); + +	/* Do we need to select a new pgpath? */ +	if (!m->current_pgpath || +	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) +		__choose_pgpath(m); + +	pgpath = m->current_pgpath; + +	if (was_queued) +		m->queue_size--; + +	if ((pgpath && m->queue_io) || +	    (!pgpath && m->queue_if_no_path && !m->suspended)) { +		/* Queue for the daemon to resubmit */ +		bio_list_add(&m->queued_ios, bio); +		m->queue_size++; +		if (m->pg_init_required || !m->queue_io) +			schedule_work(&m->process_queued_ios); +		pgpath = NULL; +		r = 0; +	} else if (!pgpath) +		r = -EIO;		/* Failed */ +	else +		bio->bi_bdev = pgpath->path.dev->bdev; + +	mpio->pgpath = pgpath; + +	spin_unlock_irqrestore(&m->lock, flags); + +	return r; +} + +/* + * If we run out of usable paths, should we queue I/O or error it? + */ +static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path) +{ +	unsigned long flags; + +	spin_lock_irqsave(&m->lock, flags); + +	m->queue_if_no_path = queue_if_no_path; +	if (!m->queue_if_no_path) +		schedule_work(&m->process_queued_ios); + +	spin_unlock_irqrestore(&m->lock, flags); + +	return 0; +} + +/*----------------------------------------------------------------- + * The multipath daemon is responsible for resubmitting queued ios. 
+ *---------------------------------------------------------------*/ + +static void dispatch_queued_ios(struct multipath *m) +{ +	int r; +	unsigned long flags; +	struct bio *bio = NULL, *next; +	struct mpath_io *mpio; +	union map_info *info; + +	spin_lock_irqsave(&m->lock, flags); +	bio = bio_list_get(&m->queued_ios); +	spin_unlock_irqrestore(&m->lock, flags); + +	while (bio) { +		next = bio->bi_next; +		bio->bi_next = NULL; + +		info = dm_get_mapinfo(bio); +		mpio = info->ptr; + +		r = map_io(m, bio, mpio, 1); +		if (r < 0) +			bio_endio(bio, bio->bi_size, r); +		else if (r == 1) +			generic_make_request(bio); + +		bio = next; +	} +} + +static void process_queued_ios(void *data) +{ +	struct multipath *m = (struct multipath *) data; +	struct hw_handler *hwh = &m->hw_handler; +	struct pgpath *pgpath; +	unsigned init_required, must_queue = 0; +	unsigned long flags; + +	spin_lock_irqsave(&m->lock, flags); + +	if (!m->current_pgpath) +		__choose_pgpath(m); + +	pgpath = m->current_pgpath; + +	if ((pgpath && m->queue_io) || +	    (!pgpath && m->queue_if_no_path && !m->suspended)) +		must_queue = 1; + +	init_required = m->pg_init_required; +	if (init_required) +		m->pg_init_required = 0; + +	spin_unlock_irqrestore(&m->lock, flags); + +	if (init_required) +		hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); + +	if (!must_queue) +		dispatch_queued_ios(m); +} + +/* + * An event is triggered whenever a path is taken out of use. + * Includes path failure and PG bypass. + */ +static void trigger_event(void *data) +{ +	struct multipath *m = (struct multipath *) data; + +	dm_table_event(m->ti->table); +} + +/*----------------------------------------------------------------- + * Constructor/argument parsing: + * <#multipath feature args> [<arg>]* + * <#hw_handler args> [hw_handler [<arg>]*] + * <#priority groups> + * <initial priority group> + *     [<selector> <#selector args> [<arg>]* + *      <#paths> <#per-path selector args> + *         [<path> [<arg>]* ]+ ]+ + *---------------------------------------------------------------*/ +struct param { +	unsigned min; +	unsigned max; +	char *error; +}; + +#define ESTR(s) ("dm-multipath: " s) + +static int read_param(struct param *param, char *str, unsigned *v, char **error) +{ +	if (!str || +	    (sscanf(str, "%u", v) != 1) || +	    (*v < param->min) || +	    (*v > param->max)) { +		*error = param->error; +		return -EINVAL; +	} + +	return 0; +} + +struct arg_set { +	unsigned argc; +	char **argv; +}; + +static char *shift(struct arg_set *as) +{ +	char *r; + +	if (as->argc) { +		as->argc--; +		r = *as->argv; +		as->argv++; +		return r; +	} + +	return NULL; +} + +static void consume(struct arg_set *as, unsigned n) +{ +	BUG_ON (as->argc < n); +	as->argc -= n; +	as->argv += n; +} + +static int parse_path_selector(struct arg_set *as, struct priority_group *pg, +			       struct dm_target *ti) +{ +	int r; +	struct path_selector_type *pst; +	unsigned ps_argc; + +	static struct param _params[] = { +		{0, 1024, ESTR("invalid number of path selector args")}, +	}; + +	pst = dm_get_path_selector(shift(as)); +	if (!pst) { +		ti->error = ESTR("unknown path selector type"); +		return -EINVAL; +	} + +	r = read_param(_params, shift(as), &ps_argc, &ti->error); +	if (r) +		return -EINVAL; + +	r = pst->create(&pg->ps, ps_argc, as->argv); +	if (r) { +		dm_put_path_selector(pst); +		ti->error = ESTR("path selector constructor failed"); +		return r; +	} + +	pg->ps.type = pst; +	consume(as, ps_argc); + +	return 0; +} + +static struct pgpath *parse_path(struct arg_set *as, 
struct path_selector *ps, +			       struct dm_target *ti) +{ +	int r; +	struct pgpath *p; + +	/* we need at least a path arg */ +	if (as->argc < 1) { +		ti->error = ESTR("no device given"); +		return NULL; +	} + +	p = alloc_pgpath(); +	if (!p) +		return NULL; + +	r = dm_get_device(ti, shift(as), ti->begin, ti->len, +			  dm_table_get_mode(ti->table), &p->path.dev); +	if (r) { +		ti->error = ESTR("error getting device"); +		goto bad; +	} + +	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); +	if (r) { +		dm_put_device(ti, p->path.dev); +		goto bad; +	} + +	return p; + + bad: +	free_pgpath(p); +	return NULL; +} + +static struct priority_group *parse_priority_group(struct arg_set *as, +						   struct multipath *m, +						   struct dm_target *ti) +{ +	static struct param _params[] = { +		{1, 1024, ESTR("invalid number of paths")}, +		{0, 1024, ESTR("invalid number of selector args")} +	}; + +	int r; +	unsigned i, nr_selector_args, nr_params; +	struct priority_group *pg; + +	if (as->argc < 2) { +		as->argc = 0; +		ti->error = ESTR("not enough priority group aruments"); +		return NULL; +	} + +	pg = alloc_priority_group(); +	if (!pg) { +		ti->error = ESTR("couldn't allocate priority group"); +		return NULL; +	} +	pg->m = m; + +	r = parse_path_selector(as, pg, ti); +	if (r) +		goto bad; + +	/* +	 * read the paths +	 */ +	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); +	if (r) +		goto bad; + +	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); +	if (r) +		goto bad; + +	nr_params = 1 + nr_selector_args; +	for (i = 0; i < pg->nr_pgpaths; i++) { +		struct pgpath *pgpath; +		struct arg_set path_args; + +		if (as->argc < nr_params) +			goto bad; + +		path_args.argc = nr_params; +		path_args.argv = as->argv; + +		pgpath = parse_path(&path_args, &pg->ps, ti); +		if (!pgpath) +			goto bad; + +		pgpath->pg = pg; +		list_add_tail(&pgpath->list, &pg->pgpaths); +		consume(as, nr_params); +	} + +	return pg; + + bad: +	free_priority_group(pg, ti); +	return NULL; +} + +static int parse_hw_handler(struct arg_set *as, struct multipath *m, +			    struct dm_target *ti) +{ +	int r; +	struct hw_handler_type *hwht; +	unsigned hw_argc; + +	static struct param _params[] = { +		{0, 1024, ESTR("invalid number of hardware handler args")}, +	}; + +	r = read_param(_params, shift(as), &hw_argc, &ti->error); +	if (r) +		return -EINVAL; + +	if (!hw_argc) +		return 0; + +	hwht = dm_get_hw_handler(shift(as)); +	if (!hwht) { +		ti->error = ESTR("unknown hardware handler type"); +		return -EINVAL; +	} + +	r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); +	if (r) { +		dm_put_hw_handler(hwht); +		ti->error = ESTR("hardware handler constructor failed"); +		return r; +	} + +	m->hw_handler.type = hwht; +	consume(as, hw_argc - 1); + +	return 0; +} + +static int parse_features(struct arg_set *as, struct multipath *m, +			  struct dm_target *ti) +{ +	int r; +	unsigned argc; + +	static struct param _params[] = { +		{0, 1, ESTR("invalid number of feature args")}, +	}; + +	r = read_param(_params, shift(as), &argc, &ti->error); +	if (r) +		return -EINVAL; + +	if (!argc) +		return 0; + +	if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) +		return queue_if_no_path(m, 1); +	else { +		ti->error = "Unrecognised multipath feature request"; +		return -EINVAL; +	} +} + +static int multipath_ctr(struct dm_target *ti, unsigned int argc, +			 char **argv) +{ +	/* target parameters */ +	static struct param _params[] = { +		{1, 1024, ESTR("invalid number of priority groups")}, +		{1, 
1024, ESTR("invalid initial priority group number")}, +	}; + +	int r; +	struct multipath *m; +	struct arg_set as; +	unsigned pg_count = 0; +	unsigned next_pg_num; + +	as.argc = argc; +	as.argv = argv; + +	m = alloc_multipath(); +	if (!m) { +		ti->error = ESTR("can't allocate multipath"); +		return -EINVAL; +	} + +	r = parse_features(&as, m, ti); +	if (r) +		goto bad; + +	r = parse_hw_handler(&as, m, ti); +	if (r) +		goto bad; + +	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); +	if (r) +		goto bad; + +	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); +	if (r) +		goto bad; + +	/* parse the priority groups */ +	while (as.argc) { +		struct priority_group *pg; + +		pg = parse_priority_group(&as, m, ti); +		if (!pg) { +			r = -EINVAL; +			goto bad; +		} + +		m->nr_valid_paths += pg->nr_pgpaths; +		list_add_tail(&pg->list, &m->priority_groups); +		pg_count++; +		pg->pg_num = pg_count; +		if (!--next_pg_num) +			m->next_pg = pg; +	} + +	if (pg_count != m->nr_priority_groups) { +		ti->error = ESTR("priority group count mismatch"); +		r = -EINVAL; +		goto bad; +	} + +	ti->private = m; +	m->ti = ti; + +	return 0; + + bad: +	free_multipath(m); +	return r; +} + +static void multipath_dtr(struct dm_target *ti) +{ +	struct multipath *m = (struct multipath *) ti->private; +	free_multipath(m); +} + +/* + * Map bios, recording original fields for later in case we have to resubmit + */ +static int multipath_map(struct dm_target *ti, struct bio *bio, +			 union map_info *map_context) +{ +	int r; +	struct mpath_io *mpio; +	struct multipath *m = (struct multipath *) ti->private; + +	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); +	dm_bio_record(&mpio->details, bio); + +	map_context->ptr = mpio; +	bio->bi_rw |= (1 << BIO_RW_FAILFAST); +	r = map_io(m, bio, mpio, 0); +	if (r < 0) +		mempool_free(mpio, m->mpio_pool); + +	return r; +} + +/* + * Take a path out of use. + */ +static int fail_path(struct pgpath *pgpath) +{ +	unsigned long flags; +	struct multipath *m = pgpath->pg->m; + +	spin_lock_irqsave(&m->lock, flags); + +	if (!pgpath->path.is_active) +		goto out; + +	DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); + +	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); +	pgpath->path.is_active = 0; +	pgpath->fail_count++; + +	m->nr_valid_paths--; + +	if (pgpath == m->current_pgpath) +		m->current_pgpath = NULL; + +	schedule_work(&m->trigger_event); + +out: +	spin_unlock_irqrestore(&m->lock, flags); + +	return 0; +} + +/* + * Reinstate a previously-failed path + */ +static int reinstate_path(struct pgpath *pgpath) +{ +	int r = 0; +	unsigned long flags; +	struct multipath *m = pgpath->pg->m; + +	spin_lock_irqsave(&m->lock, flags); + +	if (pgpath->path.is_active) +		goto out; + +	if (!pgpath->pg->ps.type) { +		DMWARN("Reinstate path not supported by path selector %s", +		       pgpath->pg->ps.type->name); +		r = -EINVAL; +		goto out; +	} + +	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); +	if (r) +		goto out; + +	pgpath->path.is_active = 1; + +	m->current_pgpath = NULL; +	if (!m->nr_valid_paths++) +		schedule_work(&m->process_queued_ios); + +	schedule_work(&m->trigger_event); + +out: +	spin_unlock_irqrestore(&m->lock, flags); + +	return r; +} + +/* + * Fail or reinstate all paths that match the provided struct dm_dev. 
+ */ +static int action_dev(struct multipath *m, struct dm_dev *dev, +		      action_fn action) +{ +	int r = 0; +	struct pgpath *pgpath; +	struct priority_group *pg; + +	list_for_each_entry(pg, &m->priority_groups, list) { +		list_for_each_entry(pgpath, &pg->pgpaths, list) { +			if (pgpath->path.dev == dev) +				r = action(pgpath); +		} +	} + +	return r; +} + +/* + * Temporarily try to avoid having to use the specified PG + */ +static void bypass_pg(struct multipath *m, struct priority_group *pg, +		      int bypassed) +{ +	unsigned long flags; + +	spin_lock_irqsave(&m->lock, flags); + +	pg->bypassed = bypassed; +	m->current_pgpath = NULL; +	m->current_pg = NULL; + +	spin_unlock_irqrestore(&m->lock, flags); + +	schedule_work(&m->trigger_event); +} + +/* + * Switch to using the specified PG from the next I/O that gets mapped + */ +static int switch_pg_num(struct multipath *m, const char *pgstr) +{ +	struct priority_group *pg; +	unsigned pgnum; +	unsigned long flags; + +	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || +	    (pgnum > m->nr_priority_groups)) { +		DMWARN("invalid PG number supplied to switch_pg_num"); +		return -EINVAL; +	} + +	spin_lock_irqsave(&m->lock, flags); +	list_for_each_entry(pg, &m->priority_groups, list) { +		pg->bypassed = 0; +		if (--pgnum) +			continue; + +		m->current_pgpath = NULL; +		m->current_pg = NULL; +		m->next_pg = pg; +	} +	spin_unlock_irqrestore(&m->lock, flags); + +	schedule_work(&m->trigger_event); +	return 0; +} + +/* + * Set/clear bypassed status of a PG. + * PGs are numbered upwards from 1 in the order they were declared. + */ +static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +{ +	struct priority_group *pg; +	unsigned pgnum; + +	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || +	    (pgnum > m->nr_priority_groups)) { +		DMWARN("invalid PG number supplied to bypass_pg"); +		return -EINVAL; +	} + +	list_for_each_entry(pg, &m->priority_groups, list) { +		if (!--pgnum) +			break; +	} + +	bypass_pg(m, pg, bypassed); +	return 0; +} + +/* + * pg_init must call this when it has completed its initialisation + */ +void dm_pg_init_complete(struct path *path, unsigned err_flags) +{ +	struct pgpath *pgpath = path_to_pgpath(path); +	struct priority_group *pg = pgpath->pg; +	struct multipath *m = pg->m; +	unsigned long flags; + +	/* We insist on failing the path if the PG is already bypassed. 
*/ +	if (err_flags && pg->bypassed) +		err_flags |= MP_FAIL_PATH; + +	if (err_flags & MP_FAIL_PATH) +		fail_path(pgpath); + +	if (err_flags & MP_BYPASS_PG) +		bypass_pg(m, pg, 1); + +	spin_lock_irqsave(&m->lock, flags); +	if (!err_flags) +		m->queue_io = 0; +	else { +		m->current_pgpath = NULL; +		m->current_pg = NULL; +	} +	schedule_work(&m->process_queued_ios); +	spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * end_io handling + */ +static int do_end_io(struct multipath *m, struct bio *bio, +		     int error, struct mpath_io *mpio) +{ +	struct hw_handler *hwh = &m->hw_handler; +	unsigned err_flags = MP_FAIL_PATH;	/* Default behavior */ + +	if (!error) +		return 0;	/* I/O complete */ + +	spin_lock(&m->lock); +	if (!m->nr_valid_paths) { +		if (!m->queue_if_no_path || m->suspended) { +			spin_unlock(&m->lock); +			return -EIO; +		} else { +			spin_unlock(&m->lock); +			goto requeue; +		} +	} +	spin_unlock(&m->lock); + +	if (hwh->type && hwh->type->error) +		err_flags = hwh->type->error(hwh, bio); + +	if (mpio->pgpath) { +		if (err_flags & MP_FAIL_PATH) +			fail_path(mpio->pgpath); + +		if (err_flags & MP_BYPASS_PG) +			bypass_pg(m, mpio->pgpath->pg, 1); +	} + +	if (err_flags & MP_ERROR_IO) +		return -EIO; + +      requeue: +	dm_bio_restore(&mpio->details, bio); + +	/* queue for the daemon to resubmit or fail */ +	spin_lock(&m->lock); +	bio_list_add(&m->queued_ios, bio); +	m->queue_size++; +	if (!m->queue_io) +		schedule_work(&m->process_queued_ios); +	spin_unlock(&m->lock); + +	return 1;	/* io not complete */ +} + +static int multipath_end_io(struct dm_target *ti, struct bio *bio, +			    int error, union map_info *map_context) +{ +	struct multipath *m = (struct multipath *) ti->private; +	struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; +	struct pgpath *pgpath = mpio->pgpath; +	struct path_selector *ps; +	int r; + +	r  = do_end_io(m, bio, error, mpio); +	if (pgpath) { +		ps = &pgpath->pg->ps; +		if (ps->type->end_io) +			ps->type->end_io(ps, &pgpath->path); +	} +	if (r <= 0) +		mempool_free(mpio, m->mpio_pool); + +	return r; +} + +/* + * Suspend can't complete until all the I/O is processed so if + * the last path failed we will now error any queued I/O. 
+ */ +static void multipath_presuspend(struct dm_target *ti) +{ +	struct multipath *m = (struct multipath *) ti->private; +	unsigned long flags; + +	spin_lock_irqsave(&m->lock, flags); +	m->suspended = 1; +	if (m->queue_if_no_path) +		schedule_work(&m->process_queued_ios); +	spin_unlock_irqrestore(&m->lock, flags); +} + +static void multipath_resume(struct dm_target *ti) +{ +	struct multipath *m = (struct multipath *) ti->private; +	unsigned long flags; + +	spin_lock_irqsave(&m->lock, flags); +	m->suspended = 0; +	spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * Info output has the following format: + * num_multipath_feature_args [multipath_feature_args]* + * num_handler_status_args [handler_status_args]* + * num_groups init_group_number + *            [A|D|E num_ps_status_args [ps_status_args]* + *             num_paths num_selector_args + *             [path_dev A|F fail_count [selector_args]* ]+ ]+ + * + * Table output has the following format (identical to the constructor string): + * num_feature_args [features_args]* + * num_handler_args hw_handler [hw_handler_args]* + * num_groups init_group_number + *     [priority selector-name num_ps_args [ps_args]* + *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ + */ +static int multipath_status(struct dm_target *ti, status_type_t type, +			    char *result, unsigned int maxlen) +{ +	int sz = 0; +	unsigned long flags; +	struct multipath *m = (struct multipath *) ti->private; +	struct hw_handler *hwh = &m->hw_handler; +	struct priority_group *pg; +	struct pgpath *p; +	unsigned pg_num; +	char state; + +	spin_lock_irqsave(&m->lock, flags); + +	/* Features */ +	if (type == STATUSTYPE_INFO) +		DMEMIT("1 %u ", m->queue_size); +	else if (m->queue_if_no_path) +		DMEMIT("1 queue_if_no_path "); +	else +		DMEMIT("0 "); + +	if (hwh->type && hwh->type->status) +		sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); +	else if (!hwh->type || type == STATUSTYPE_INFO) +		DMEMIT("0 "); +	else +		DMEMIT("1 %s ", hwh->type->name); + +	DMEMIT("%u ", m->nr_priority_groups); + +	if (m->next_pg) +		pg_num = m->next_pg->pg_num; +	else if (m->current_pg) +		pg_num = m->current_pg->pg_num; +	else +			pg_num = 1; + +	DMEMIT("%u ", pg_num); + +	switch (type) { +	case STATUSTYPE_INFO: +		list_for_each_entry(pg, &m->priority_groups, list) { +			if (pg->bypassed) +				state = 'D';	/* Disabled */ +			else if (pg == m->current_pg) +				state = 'A';	/* Currently Active */ +			else +				state = 'E';	/* Enabled */ + +			DMEMIT("%c ", state); + +			if (pg->ps.type->status) +				sz += pg->ps.type->status(&pg->ps, NULL, type, +							  result + sz, +							  maxlen - sz); +			else +				DMEMIT("0 "); + +			DMEMIT("%u %u ", pg->nr_pgpaths, +			       pg->ps.type->info_args); + +			list_for_each_entry(p, &pg->pgpaths, list) { +				DMEMIT("%s %s %u ", p->path.dev->name, +				       p->path.is_active ? 
"A" : "F", +				       p->fail_count); +				if (pg->ps.type->status) +					sz += pg->ps.type->status(&pg->ps, +					      &p->path, type, result + sz, +					      maxlen - sz); +			} +		} +		break; + +	case STATUSTYPE_TABLE: +		list_for_each_entry(pg, &m->priority_groups, list) { +			DMEMIT("%s ", pg->ps.type->name); + +			if (pg->ps.type->status) +				sz += pg->ps.type->status(&pg->ps, NULL, type, +							  result + sz, +							  maxlen - sz); +			else +				DMEMIT("0 "); + +			DMEMIT("%u %u ", pg->nr_pgpaths, +			       pg->ps.type->table_args); + +			list_for_each_entry(p, &pg->pgpaths, list) { +				DMEMIT("%s ", p->path.dev->name); +				if (pg->ps.type->status) +					sz += pg->ps.type->status(&pg->ps, +					      &p->path, type, result + sz, +					      maxlen - sz); +			} +		} +		break; +	} + +	spin_unlock_irqrestore(&m->lock, flags); + +	return 0; +} + +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +{ +	int r; +	struct dm_dev *dev; +	struct multipath *m = (struct multipath *) ti->private; +	action_fn action; + +	if (argc == 1) { +		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) +			return queue_if_no_path(m, 1); +		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) +			return queue_if_no_path(m, 0); +	} + +	if (argc != 2) +		goto error; + +	if (!strnicmp(argv[0], MESG_STR("disable_group"))) +		return bypass_pg_num(m, argv[1], 1); +	else if (!strnicmp(argv[0], MESG_STR("enable_group"))) +		return bypass_pg_num(m, argv[1], 0); +	else if (!strnicmp(argv[0], MESG_STR("switch_group"))) +		return switch_pg_num(m, argv[1]); +	else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) +		action = reinstate_path; +	else if (!strnicmp(argv[0], MESG_STR("fail_path"))) +		action = fail_path; +	else +		goto error; + +	r = dm_get_device(ti, argv[1], ti->begin, ti->len, +			  dm_table_get_mode(ti->table), &dev); +	if (r) { +		DMWARN("dm-multipath message: error getting device %s", +		       argv[1]); +		return -EINVAL; +	} + +	r = action_dev(m, dev, action); + +	dm_put_device(ti, dev); + +	return r; + +error: +	DMWARN("Unrecognised multipath message received."); +	return -EINVAL; +} + +/*----------------------------------------------------------------- + * Module setup + *---------------------------------------------------------------*/ +static struct target_type multipath_target = { +	.name = "multipath", +	.version = {1, 0, 4}, +	.module = THIS_MODULE, +	.ctr = multipath_ctr, +	.dtr = multipath_dtr, +	.map = multipath_map, +	.end_io = multipath_end_io, +	.presuspend = multipath_presuspend, +	.resume = multipath_resume, +	.status = multipath_status, +	.message = multipath_message, +}; + +static int __init dm_multipath_init(void) +{ +	int r; + +	/* allocate a slab for the dm_ios */ +	_mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), +					0, 0, NULL, NULL); +	if (!_mpio_cache) +		return -ENOMEM; + +	r = dm_register_target(&multipath_target); +	if (r < 0) { +		DMERR("%s: register failed %d", multipath_target.name, r); +		kmem_cache_destroy(_mpio_cache); +		return -EINVAL; +	} + +	DMINFO("dm-multipath version %u.%u.%u loaded", +	       multipath_target.version[0], multipath_target.version[1], +	       multipath_target.version[2]); + +	return r; +} + +static void __exit dm_multipath_exit(void) +{ +	int r; + +	r = dm_unregister_target(&multipath_target); +	if (r < 0) +		DMERR("%s: target unregister failed %d", +		      multipath_target.name, r); +	kmem_cache_destroy(_mpio_cache); +} + +EXPORT_SYMBOL_GPL(dm_pg_init_complete); + 
+module_init(dm_multipath_init); +module_exit(dm_multipath_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath target"); +MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h new file mode 100644 index 00000000000..8a4bf2b6d52 --- /dev/null +++ b/drivers/md/dm-mpath.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath. + */ + +#ifndef	DM_MPATH_H +#define	DM_MPATH_H + +struct dm_dev; + +struct path { +	struct dm_dev *dev;	/* Read-only */ +	unsigned is_active;	/* Read-only */ + +	void *pscontext;	/* For path-selector use */ +	void *hwhcontext;	/* For hw-handler use */ +}; + +/* Callback for hwh_pg_init_fn to use when complete */ +void dm_pg_init_complete(struct path *path, unsigned err_flags); + +#endif diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c new file mode 100644 index 00000000000..ac5c4bbec6c --- /dev/null +++ b/drivers/md/dm-path-selector.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path selector registration. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> + +struct ps_internal { +	struct path_selector_type pst; + +	struct list_head list; +	long use; +}; + +#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) + +static LIST_HEAD(_path_selectors); +static DECLARE_RWSEM(_ps_lock); + +struct ps_internal *__find_path_selector_type(const char *name) +{ +	struct ps_internal *psi; + +	list_for_each_entry(psi, &_path_selectors, list) { +		if (!strcmp(name, psi->pst.name)) +			return psi; +	} + +	return NULL; +} + +static struct ps_internal *get_path_selector(const char *name) +{ +	struct ps_internal *psi; + +	down_read(&_ps_lock); +	psi = __find_path_selector_type(name); +	if (psi) { +		if ((psi->use == 0) && !try_module_get(psi->pst.module)) +			psi = NULL; +		else +			psi->use++; +	} +	up_read(&_ps_lock); + +	return psi; +} + +struct path_selector_type *dm_get_path_selector(const char *name) +{ +	struct ps_internal *psi; + +	if (!name) +		return NULL; + +	psi = get_path_selector(name); +	if (!psi) { +		request_module("dm-%s", name); +		psi = get_path_selector(name); +	} + +	return psi ? 
&psi->pst : NULL; +} + +void dm_put_path_selector(struct path_selector_type *pst) +{ +	struct ps_internal *psi; + +	if (!pst) +		return; + +	down_read(&_ps_lock); +	psi = __find_path_selector_type(pst->name); +	if (!psi) +		goto out; + +	if (--psi->use == 0) +		module_put(psi->pst.module); + +	if (psi->use < 0) +		BUG(); + +out: +	up_read(&_ps_lock); +} + +static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) +{ +	struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); + +	if (psi) { +		memset(psi, 0, sizeof(*psi)); +		psi->pst = *pst; +	} + +	return psi; +} + +int dm_register_path_selector(struct path_selector_type *pst) +{ +	int r = 0; +	struct ps_internal *psi = _alloc_path_selector(pst); + +	if (!psi) +		return -ENOMEM; + +	down_write(&_ps_lock); + +	if (__find_path_selector_type(pst->name)) { +		kfree(psi); +		r = -EEXIST; +	} else +		list_add(&psi->list, &_path_selectors); + +	up_write(&_ps_lock); + +	return r; +} + +int dm_unregister_path_selector(struct path_selector_type *pst) +{ +	struct ps_internal *psi; + +	down_write(&_ps_lock); + +	psi = __find_path_selector_type(pst->name); +	if (!psi) { +		up_write(&_ps_lock); +		return -EINVAL; +	} + +	if (psi->use) { +		up_write(&_ps_lock); +		return -ETXTBSY; +	} + +	list_del(&psi->list); + +	up_write(&_ps_lock); + +	kfree(psi); + +	return 0; +} + +EXPORT_SYMBOL_GPL(dm_register_path_selector); +EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h new file mode 100644 index 00000000000..732d06a84f8 --- /dev/null +++ b/drivers/md/dm-path-selector.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path-Selector registration. + */ + +#ifndef	DM_PATH_SELECTOR_H +#define	DM_PATH_SELECTOR_H + +#include <linux/device-mapper.h> + +#include "dm-mpath.h" + +/* + * We provide an abstraction for the code that chooses which path + * to send some io down. + */ +struct path_selector_type; +struct path_selector { +	struct path_selector_type *type; +	void *context; +}; + +/* Information about a path selector type */ +struct path_selector_type { +	char *name; +	struct module *module; + +	unsigned int table_args; +	unsigned int info_args; + +	/* +	 * Constructs a path selector object, takes custom arguments +	 */ +	int (*create) (struct path_selector *ps, unsigned argc, char **argv); +	void (*destroy) (struct path_selector *ps); + +	/* +	 * Add an opaque path object, along with some selector specific +	 * path args (eg, path priority). +	 */ +	int (*add_path) (struct path_selector *ps, struct path *path, +			 int argc, char **argv, char **error); + +	/* +	 * Chooses a path for this io, if no paths are available then +	 * NULL will be returned. +	 * +	 * repeat_count is the number of times to use the path before +	 * calling the function again.  0 means don't call it again unless +	 * the path fails. +	 */ +	struct path *(*select_path) (struct path_selector *ps, +				     unsigned *repeat_count); + +	/* +	 * Notify the selector that a path has failed. +	 */ +	void (*fail_path) (struct path_selector *ps, struct path *p); + +	/* +	 * Ask selector to reinstate a path. 
+	 */ +	int (*reinstate_path) (struct path_selector *ps, struct path *p); + +	/* +	 * Table content based on parameters added in ps_add_path_fn +	 * or path selector status +	 */ +	int (*status) (struct path_selector *ps, struct path *path, +		       status_type_t type, char *result, unsigned int maxlen); + +	int (*end_io) (struct path_selector *ps, struct path *path); +}; + +/* Register a path selector */ +int dm_register_path_selector(struct path_selector_type *type); + +/* Unregister a path selector */ +int dm_unregister_path_selector(struct path_selector_type *type); + +/* Returns a registered path selector type */ +struct path_selector_type *dm_get_path_selector(const char *name); + +/* Releases a path selector  */ +void dm_put_path_selector(struct path_selector_type *pst); + +#endif diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c new file mode 100644 index 00000000000..6e3cf7e1345 --- /dev/null +++ b/drivers/md/dm-raid1.c @@ -0,0 +1,1269 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-list.h" +#include "dm-io.h" +#include "dm-log.h" +#include "kcopyd.h" + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +static struct workqueue_struct *_kmirrord_wq; +static struct work_struct _kmirrord_work; + +static inline void wake(void) +{ +	queue_work(_kmirrord_wq, &_kmirrord_work); +} + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions.  Each + * region can be in one of three states: clean, dirty, + * nosync.  There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. + * + *   clean_regions: Regions on this list have no io pending to + *   them, they are in sync, we are no longer interested in them, + *   they are dull.  rh_update_states() will remove them from the + *   hash table. + * + *   quiesced_regions: These regions have been spun down, ready + *   for recovery.  rh_recovery_start() will remove regions from + *   this list and hand them to kmirrord, which will schedule the + *   recovery io with kcopyd. + * + *   recovered_regions: Regions that kcopyd has successfully + *   recovered.  rh_update_states() will now schedule any delayed + *   io, up the recovery_count, and remove the region from the + *   hash. + * + * There are 2 locks: + *   A rw spin lock 'hash_lock' protects just the hash table, + *   this is never held in write mode from interrupt context, + *   which I believe means that we only have to disable irqs when + *   doing a write lock. + * + *   An ordinary spin lock 'region_lock' that protects the three + *   lists in the region_hash, with the 'state', 'list' and + *   'bhs_delayed' fields of the regions.  This is used from irq + *   context, so all other uses will have to suspend local irqs. 
+ *---------------------------------------------------------------*/ +struct mirror_set; +struct region_hash { +	struct mirror_set *ms; +	uint32_t region_size; +	unsigned region_shift; + +	/* holds persistent region state */ +	struct dirty_log *log; + +	/* hash table */ +	rwlock_t hash_lock; +	mempool_t *region_pool; +	unsigned int mask; +	unsigned int nr_buckets; +	struct list_head *buckets; + +	spinlock_t region_lock; +	struct semaphore recovery_count; +	struct list_head clean_regions; +	struct list_head quiesced_regions; +	struct list_head recovered_regions; +}; + +enum { +	RH_CLEAN, +	RH_DIRTY, +	RH_NOSYNC, +	RH_RECOVERING +}; + +struct region { +	struct region_hash *rh;	/* FIXME: can we get rid of this ? */ +	region_t key; +	int state; + +	struct list_head hash_list; +	struct list_head list; + +	atomic_t pending; +	struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) +{ +	return bio->bi_sector >> rh->region_shift; +} + +static inline sector_t region_to_sector(struct region_hash *rh, region_t region) +{ +	return region << rh->region_shift; +} + +/* FIXME move this */ +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); + +static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data) +{ +	return kmalloc(sizeof(struct region), gfp_mask); +} + +static void region_free(void *element, void *pool_data) +{ +	kfree(element); +} + +#define MIN_REGIONS 64 +#define MAX_RECOVERY 1 +static int rh_init(struct region_hash *rh, struct mirror_set *ms, +		   struct dirty_log *log, uint32_t region_size, +		   region_t nr_regions) +{ +	unsigned int nr_buckets, max_buckets; +	size_t i; + +	/* +	 * Calculate a suitable number of buckets for our hash +	 * table. 
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+		;
+	nr_buckets >>= 1;
+
+	rh->ms = ms;
+	rh->log = log;
+	rh->region_size = region_size;
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash memory");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	sema_init(&rh->recovery_count, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+
+	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
+					 region_free, NULL);
+	if (!rh->region_pool) {
+		vfree(rh->buckets);
+		rh->buckets = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void rh_exit(struct region_hash *rh)
+{
+	unsigned int h;
+	struct region *reg, *nreg;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+	for (h = 0; h < rh->nr_buckets; h++) {
+		list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
+			BUG_ON(atomic_read(&reg->pending));
+			mempool_free(reg, rh->region_pool);
+		}
+	}
+
+	if (rh->log)
+		dm_destroy_dirty_log(rh->log);
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+	vfree(rh->buckets);
+}
+
+#define RH_HASH_MULT 2654435387U
+
+static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
+{
+	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
+}
+
+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
+		if (reg->key == region)
+			return reg;
+
+	return NULL;
+}
+
+static void __rh_insert(struct region_hash *rh, struct region *reg)
+{
+	unsigned int h = rh_hash(rh, reg->key);
+	list_add(&reg->hash_list, rh->buckets + h);
+}
+
+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
+{
+	struct region *reg, *nreg;
+
+	read_unlock(&rh->hash_lock);
+	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		RH_CLEAN : RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+
+	INIT_LIST_HEAD(&nreg->list);
+
+	atomic_set(&nreg->pending, 0);
+	bio_list_init(&nreg->delayed_bios);
+	write_lock_irq(&rh->hash_lock);
+
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* we lost the race */
+		mempool_free(nreg, rh->region_pool);
+
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == RH_CLEAN) {
+			spin_lock(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock(&rh->region_lock);
+		}
+		reg = nreg;
+	}
+	write_unlock_irq(&rh->hash_lock);
+	read_lock(&rh->hash_lock);
+
+	return reg;
+}
+
+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg)
+		reg = __rh_alloc(rh, region);
+
+	return reg;
+}
+
+static int rh_state(struct region_hash *rh, region_t region, int may_block)
+{
+	int r;
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (reg)
+		return reg->state;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the
+	 * dirty log.
+	 */
+	r = rh->log->type->in_sync(rh->log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+	 * taken as a RH_NOSYNC
+	 */
+	return r == 1 ? RH_CLEAN : RH_NOSYNC;
+}
+
+static inline int rh_in_sync(struct region_hash *rh,
+			     region_t region, int may_block)
+{
+	int state = rh_state(rh, region, may_block);
+	return state == RH_CLEAN || state == RH_DIRTY;
+}
+
+static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
+{
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(bio_list))) {
+		queue_bio(ms, bio, WRITE);
+	}
+}
+
+static void rh_update_states(struct region_hash *rh)
+{
+	struct region *reg, *next;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice(&rh->clean_regions, &clean);
+		INIT_LIST_HEAD(&rh->clean_regions);
+
+		list_for_each_entry (reg, &clean, list) {
+			rh->log->type->clear_region(rh->log, reg->key);
+			list_del(&reg->hash_list);
+		}
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice(&rh->recovered_regions, &recovered);
+		INIT_LIST_HEAD(&rh->recovered_regions);
+
+		list_for_each_entry (reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_entry_safe (reg, next, &recovered, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		rh->log->type->complete_resync_work(rh->log, reg->key, 1);
+		dispatch_bios(rh->ms, &reg->delayed_bios);
+		up(&rh->recovery_count);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	if (!list_empty(&recovered))
+		rh->log->type->flush(rh->log);
+
+	list_for_each_entry_safe (reg, next, &clean, list)
+		mempool_free(reg, rh->region_pool);
+}
+
+static void rh_inc(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	if (reg->state == RH_CLEAN) {
+		rh->log->type->mark_region(rh->log, reg->key);
+
+		spin_lock_irq(&rh->region_lock);
+		reg->state = RH_DIRTY;
+		list_del_init(&reg->list);	/* take off the clean list */
+		spin_unlock_irq(&rh->region_lock);
+	}
+
+	atomic_inc(&reg->pending);
+	read_unlock(&rh->hash_lock);
+}
+
+static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
+{
+	struct bio *bio;
+
+	for (bio = bios->head; bio; bio = bio->bi_next)
+		rh_inc(rh, bio_to_region(rh, bio));
+}
+
+static void rh_dec(struct region_hash *rh, region_t region)
+{
+	unsigned long flags;
+	struct region *reg;
+	int should_wake = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (atomic_dec_and_test(&reg->pending)) {
+		spin_lock_irqsave(&rh->region_lock, flags);
+		if (reg->state == RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else {
+			reg->state = RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+		spin_unlock_irqrestore(&rh->region_lock, flags);
+		should_wake = 1;
+	}
+
+	if (should_wake)
+		wake();
+}
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct region_hash *rh)
+{
+	int r;
+	struct region *reg;
+	region_t region;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+	if (r <= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+	reg->state = RH_RECOVERING;
+
+	/* Already quiesced ? */
+	if (atomic_read(&reg->pending))
+		list_del_init(&reg->list);
+
+	else {
+		list_del_init(&reg->list);
+		list_add(&reg->list, &rh->quiesced_regions);
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+static void rh_recovery_prepare(struct region_hash *rh)
+{
+	while (!down_trylock(&rh->recovery_count))
+		if (__rh_recovery_prepare(rh) <= 0) {
+			up(&rh->recovery_count);
+			break;
+		}
+}
+
+/*
+ * Returns any quiesced regions.
+ */
+static struct region *rh_recovery_start(struct region_hash *rh)
+{
+	struct region *reg = NULL;
+
+	spin_lock_irq(&rh->region_lock);
+	if (!list_empty(&rh->quiesced_regions)) {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct region, list);
+		list_del_init(&reg->list);	/* remove from the quiesced list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return reg;
+}
+
+/* FIXME: success ignored for now */
+static void rh_recovery_end(struct region *reg, int success)
+{
+	struct region_hash *rh = reg->rh;
+
+	spin_lock_irq(&rh->region_lock);
+	list_add(&reg->list, &reg->rh->recovered_regions);
+	spin_unlock_irq(&rh->region_lock);
+
+	wake();
+}
+
+static void rh_flush(struct region_hash *rh)
+{
+	rh->log->type->flush(rh->log);
+}
+
+static void rh_delay(struct region_hash *rh, struct bio *bio)
+{
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, bio_to_region(rh, bio));
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+
+static void rh_stop_recovery(struct region_hash *rh)
+{
+	int i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i < MAX_RECOVERY; i++)
+		down(&rh->recovery_count);
+}
+
+static void rh_start_recovery(struct region_hash *rh)
+{
+	int i;
+
+	for (i = 0; i < MAX_RECOVERY; i++)
+		up(&rh->recovery_count);
+
+	wake();
+}
+
+/*-----------------------------------------------------------------
+ * Mirror set structures.
+ *---------------------------------------------------------------*/
+struct mirror {
+	atomic_t error_count;
+	struct dm_dev *dev;
+	sector_t offset;
+};
+
+struct mirror_set {
+	struct dm_target *ti;
+	struct list_head list;
+	struct region_hash rh;
+	struct kcopyd_client *kcopyd_client;
+
+	spinlock_t lock;	/* protects the next two lists */
+	struct bio_list reads;
+	struct bio_list writes;
+
+	/* recovery */
+	region_t nr_regions;
+	int in_sync;
+
+	unsigned int nr_mirrors;
+	struct mirror mirror[0];
+};
+
+/*
+ * Every mirror should look like this one.
+ */
+#define DEFAULT_MIRROR 0
+
+/*
+ * This is yucky.  We squirrel the mirror_set struct away inside
+ * bi_next for write buffers.  This is safe since the bh
+ * doesn't get submitted to the lower levels of block layer.
+ */
+static struct mirror_set *bio_get_ms(struct bio *bio)
+{
+	return (struct mirror_set *) bio->bi_next;
+}
+
+static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+{
+	bio->bi_next = (struct bio *) ms;
+}
+
+/*-----------------------------------------------------------------
+ * Recovery.
+ *
+ * When a mirror is first activated we may find that some regions
+ * are in the no-sync state.  We have to recover these by
+ * recopying from the default mirror to all the others.
+ *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned int write_err, +			      void *context) +{ +	struct region *reg = (struct region *) context; + +	/* FIXME: better error handling */ +	rh_recovery_end(reg, read_err || write_err); +} + +static int recover(struct mirror_set *ms, struct region *reg) +{ +	int r; +	unsigned int i; +	struct io_region from, to[KCOPYD_MAX_REGIONS], *dest; +	struct mirror *m; +	unsigned long flags = 0; + +	/* fill in the source */ +	m = ms->mirror + DEFAULT_MIRROR; +	from.bdev = m->dev->bdev; +	from.sector = m->offset + region_to_sector(reg->rh, reg->key); +	if (reg->key == (ms->nr_regions - 1)) { +		/* +		 * The final region may be smaller than +		 * region_size. +		 */ +		from.count = ms->ti->len & (reg->rh->region_size - 1); +		if (!from.count) +			from.count = reg->rh->region_size; +	} else +		from.count = reg->rh->region_size; + +	/* fill in the destinations */ +	for (i = 0, dest = to; i < ms->nr_mirrors; i++) { +		if (i == DEFAULT_MIRROR) +			continue; + +		m = ms->mirror + i; +		dest->bdev = m->dev->bdev; +		dest->sector = m->offset + region_to_sector(reg->rh, reg->key); +		dest->count = from.count; +		dest++; +	} + +	/* hand to kcopyd */ +	set_bit(KCOPYD_IGNORE_ERROR, &flags); +	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, +			recovery_complete, reg); + +	return r; +} + +static void do_recovery(struct mirror_set *ms) +{ +	int r; +	struct region *reg; +	struct dirty_log *log = ms->rh.log; + +	/* +	 * Start quiescing some regions. +	 */ +	rh_recovery_prepare(&ms->rh); + +	/* +	 * Copy any already quiesced regions. +	 */ +	while ((reg = rh_recovery_start(&ms->rh))) { +		r = recover(ms, reg); +		if (r) +			rh_recovery_end(reg, 0); +	} + +	/* +	 * Update the in sync flag. +	 */ +	if (!ms->in_sync && +	    (log->type->get_sync_count(log) == ms->nr_regions)) { +		/* the sync is complete */ +		dm_table_event(ms->ti->table); +		ms->in_sync = 1; +	} +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ +	/* FIXME: add read balancing */ +	return ms->mirror + DEFAULT_MIRROR; +} + +/* + * remap a buffer to a particular mirror. + */ +static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +{ +	bio->bi_bdev = m->dev->bdev; +	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); +} + +static void do_reads(struct mirror_set *ms, struct bio_list *reads) +{ +	region_t region; +	struct bio *bio; +	struct mirror *m; + +	while ((bio = bio_list_pop(reads))) { +		region = bio_to_region(&ms->rh, bio); + +		/* +		 * We can only read balance if the region is in sync. +		 */ +		if (rh_in_sync(&ms->rh, region, 0)) +			m = choose_mirror(ms, bio->bi_sector); +		else +			m = ms->mirror + DEFAULT_MIRROR; + +		map_bio(ms, m, bio); +		generic_make_request(bio); +	} +} + +/*----------------------------------------------------------------- + * Writes. 
+ * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: 	increment pending, use kcopyd to write to *all* mirrors + * RECOVERING:	delay the io until recovery completes + * NOSYNC:	increment pending, just write to the default mirror + *---------------------------------------------------------------*/ +static void write_callback(unsigned long error, void *context) +{ +	unsigned int i; +	int uptodate = 1; +	struct bio *bio = (struct bio *) context; +	struct mirror_set *ms; + +	ms = bio_get_ms(bio); +	bio_set_ms(bio, NULL); + +	/* +	 * NOTE: We don't decrement the pending count here, +	 * instead it is done by the targets endio function. +	 * This way we handle both writes to SYNC and NOSYNC +	 * regions with the same code. +	 */ + +	if (error) { +		/* +		 * only error the io if all mirrors failed. +		 * FIXME: bogus +		 */ +		uptodate = 0; +		for (i = 0; i < ms->nr_mirrors; i++) +			if (!test_bit(i, &error)) { +				uptodate = 1; +				break; +			} +	} +	bio_endio(bio, bio->bi_size, 0); +} + +static void do_write(struct mirror_set *ms, struct bio *bio) +{ +	unsigned int i; +	struct io_region io[KCOPYD_MAX_REGIONS+1]; +	struct mirror *m; + +	for (i = 0; i < ms->nr_mirrors; i++) { +		m = ms->mirror + i; + +		io[i].bdev = m->dev->bdev; +		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); +		io[i].count = bio->bi_size >> 9; +	} + +	bio_set_ms(bio, ms); +	dm_io_async_bvec(ms->nr_mirrors, io, WRITE, +			 bio->bi_io_vec + bio->bi_idx, +			 write_callback, bio); +} + +static void do_writes(struct mirror_set *ms, struct bio_list *writes) +{ +	int state; +	struct bio *bio; +	struct bio_list sync, nosync, recover, *this_list = NULL; + +	if (!writes->head) +		return; + +	/* +	 * Classify each write. +	 */ +	bio_list_init(&sync); +	bio_list_init(&nosync); +	bio_list_init(&recover); + +	while ((bio = bio_list_pop(writes))) { +		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); +		switch (state) { +		case RH_CLEAN: +		case RH_DIRTY: +			this_list = &sync; +			break; + +		case RH_NOSYNC: +			this_list = &nosync; +			break; + +		case RH_RECOVERING: +			this_list = &recover; +			break; +		} + +		bio_list_add(this_list, bio); +	} + +	/* +	 * Increment the pending counts for any regions that will +	 * be written to (writes to recover regions are going to +	 * be delayed). +	 */ +	rh_inc_pending(&ms->rh, &sync); +	rh_inc_pending(&ms->rh, &nosync); +	rh_flush(&ms->rh); + +	/* +	 * Dispatch io. 
+	 */ +	while ((bio = bio_list_pop(&sync))) +		do_write(ms, bio); + +	while ((bio = bio_list_pop(&recover))) +		rh_delay(&ms->rh, bio); + +	while ((bio = bio_list_pop(&nosync))) { +		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); +		generic_make_request(bio); +	} +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static LIST_HEAD(_mirror_sets); +static DECLARE_RWSEM(_mirror_sets_lock); + +static void do_mirror(struct mirror_set *ms) +{ +	struct bio_list reads, writes; + +	spin_lock(&ms->lock); +	reads = ms->reads; +	writes = ms->writes; +	bio_list_init(&ms->reads); +	bio_list_init(&ms->writes); +	spin_unlock(&ms->lock); + +	rh_update_states(&ms->rh); +	do_recovery(ms); +	do_reads(ms, &reads); +	do_writes(ms, &writes); +} + +static void do_work(void *ignored) +{ +	struct mirror_set *ms; + +	down_read(&_mirror_sets_lock); +	list_for_each_entry (ms, &_mirror_sets, list) +		do_mirror(ms); +	up_read(&_mirror_sets_lock); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, +					uint32_t region_size, +					struct dm_target *ti, +					struct dirty_log *dl) +{ +	size_t len; +	struct mirror_set *ms = NULL; + +	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) +		return NULL; + +	len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + +	ms = kmalloc(len, GFP_KERNEL); +	if (!ms) { +		ti->error = "dm-mirror: Cannot allocate mirror context"; +		return NULL; +	} + +	memset(ms, 0, len); +	spin_lock_init(&ms->lock); + +	ms->ti = ti; +	ms->nr_mirrors = nr_mirrors; +	ms->nr_regions = dm_sector_div_up(ti->len, region_size); +	ms->in_sync = 0; + +	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { +		ti->error = "dm-mirror: Error creating dirty region hash"; +		kfree(ms); +		return NULL; +	} + +	return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, +			 unsigned int m) +{ +	while (m--) +		dm_put_device(ti, ms->mirror[m].dev); + +	rh_exit(&ms->rh); +	kfree(ms); +} + +static inline int _check_region_size(struct dm_target *ti, uint32_t size) +{ +	return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || +		 size > ti->len); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, +		      unsigned int mirror, char **argv) +{ +	sector_t offset; + +	if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { +		ti->error = "dm-mirror: Invalid offset"; +		return -EINVAL; +	} + +	if (dm_get_device(ti, argv[0], offset, ti->len, +			  dm_table_get_mode(ti->table), +			  &ms->mirror[mirror].dev)) { +		ti->error = "dm-mirror: Device lookup failure"; +		return -ENXIO; +	} + +	ms->mirror[mirror].offset = offset; + +	return 0; +} + +static int add_mirror_set(struct mirror_set *ms) +{ +	down_write(&_mirror_sets_lock); +	list_add_tail(&ms->list, &_mirror_sets); +	up_write(&_mirror_sets_lock); +	wake(); + +	return 0; +} + +static void del_mirror_set(struct mirror_set *ms) +{ +	down_write(&_mirror_sets_lock); +	list_del(&ms->list); +	up_write(&_mirror_sets_lock); +} + +/* + * Create dirty log: log_type #log_params <log_params> + */ +static struct dirty_log *create_dirty_log(struct dm_target *ti, +					  unsigned int argc, char **argv, +					  unsigned int *args_used) +{ +	unsigned int param_count; +	struct dirty_log *dl; + +	if (argc < 2) { +		ti->error = "dm-mirror: 
Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	if (sscanf(argv[1], "%u", &param_count) != 1) {
+		ti->error = "dm-mirror: Invalid mirror log argument count";
+		return NULL;
+	}
+
+	*args_used = 2 + param_count;
+
+	if (argc < *args_used) {
+		ti->error = "dm-mirror: Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
+	if (!dl) {
+		ti->error = "dm-mirror: Error creating mirror dirty log";
+		return NULL;
+	}
+
+	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
+		ti->error = "dm-mirror: Invalid region size";
+		dm_destroy_dirty_log(dl);
+		return NULL;
+	}
+
+	return dl;
+}
+
+/*
+ * Construct a mirror mapping:
+ *
+ * log_type #log_params <log_params>
+ * #mirrors [mirror_path offset]{2,}
+ *
+ * log_type is "core" or "disk"
+ * #log_params is between 1 and 3
+ */
+#define DM_IO_PAGES 64
+static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	unsigned int nr_mirrors, m, args_used;
+	struct mirror_set *ms;
+	struct dirty_log *dl;
+
+	dl = create_dirty_log(ti, argc, argv, &args_used);
+	if (!dl)
+		return -EINVAL;
+
+	argv += args_used;
+	argc -= args_used;
+
+	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
+	    nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
+		ti->error = "dm-mirror: Invalid number of mirrors";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	argv++, argc--;
+
+	if (argc != nr_mirrors * 2) {
+		ti->error = "dm-mirror: Wrong number of mirror arguments";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
+	if (!ms) {
+		dm_destroy_dirty_log(dl);
+		return -ENOMEM;
+	}
+
+	/* Get the mirror parameter sets */
+	for (m = 0; m < nr_mirrors; m++) {
+		r = get_mirror(ms, ti, m, argv);
+		if (r) {
+			free_context(ms, ti, m);
+			return r;
+		}
+		argv += 2;
+		argc -= 2;
+	}
+
+	ti->private = ms;
+
+	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
+	if (r) {
+		free_context(ms, ti, ms->nr_mirrors);
+		return r;
+	}
+
+	add_mirror_set(ms);
+	return 0;
+}
+
+static void mirror_dtr(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+
+	del_mirror_set(ms);
+	kcopyd_client_destroy(ms->kcopyd_client);
+	free_context(ms, ti, ms->nr_mirrors);
+}
+
+static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
+{
+	int should_wake = 0;
+	struct bio_list *bl;
+
+	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
+	spin_lock(&ms->lock);
+	should_wake = !(bl->head);
+	bio_list_add(bl, bio);
+	spin_unlock(&ms->lock);
+
+	if (should_wake)
+		wake();
+}
+
+/*
+ * Mirror mapping function
+ */
+static int mirror_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	int r, rw = bio_rw(bio);
+	struct mirror *m;
+	struct mirror_set *ms = ti->private;
+
+	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
+
+	if (rw == WRITE) {
+		queue_bio(ms, bio, rw);
+		return 0;
+	}
+
+	r = ms->rh.log->type->in_sync(ms->rh.log,
+				      bio_to_region(&ms->rh, bio), 0);
+	if (r < 0 && r != -EWOULDBLOCK)
+		return r;
+
+	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
+		r = 0;
+
+	/*
+	 * We don't want to fast track a recovery just for a read
+	 * ahead.  So we just let it silently fail.
+	 * FIXME: get rid of this.
+	 */ +	if (!r && rw == READA) +		return -EIO; + +	if (!r) { +		/* Pass this io over to the daemon */ +		queue_bio(ms, bio, rw); +		return 0; +	} + +	m = choose_mirror(ms, bio->bi_sector); +	if (!m) +		return -EIO; + +	map_bio(ms, m, bio); +	return 1; +} + +static int mirror_end_io(struct dm_target *ti, struct bio *bio, +			 int error, union map_info *map_context) +{ +	int rw = bio_rw(bio); +	struct mirror_set *ms = (struct mirror_set *) ti->private; +	region_t region = map_context->ll; + +	/* +	 * We need to dec pending if this was a write. +	 */ +	if (rw == WRITE) +		rh_dec(&ms->rh, region); + +	return 0; +} + +static void mirror_postsuspend(struct dm_target *ti) +{ +	struct mirror_set *ms = (struct mirror_set *) ti->private; +	struct dirty_log *log = ms->rh.log; + +	rh_stop_recovery(&ms->rh); +	if (log->type->suspend && log->type->suspend(log)) +		/* FIXME: need better error handling */ +		DMWARN("log suspend failed"); +} + +static void mirror_resume(struct dm_target *ti) +{ +	struct mirror_set *ms = (struct mirror_set *) ti->private; +	struct dirty_log *log = ms->rh.log; +	if (log->type->resume && log->type->resume(log)) +		/* FIXME: need better error handling */ +		DMWARN("log resume failed"); +	rh_start_recovery(&ms->rh); +} + +static int mirror_status(struct dm_target *ti, status_type_t type, +			 char *result, unsigned int maxlen) +{ +	unsigned int m, sz; +	struct mirror_set *ms = (struct mirror_set *) ti->private; + +	sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + +	switch (type) { +	case STATUSTYPE_INFO: +		DMEMIT("%d ", ms->nr_mirrors); +		for (m = 0; m < ms->nr_mirrors; m++) +			DMEMIT("%s ", ms->mirror[m].dev->name); + +		DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, +		       ms->rh.log->type->get_sync_count(ms->rh.log), +		       ms->nr_regions); +		break; + +	case STATUSTYPE_TABLE: +		DMEMIT("%d ", ms->nr_mirrors); +		for (m = 0; m < ms->nr_mirrors; m++) +			DMEMIT("%s " SECTOR_FORMAT " ", +			       ms->mirror[m].dev->name, ms->mirror[m].offset); +	} + +	return 0; +} + +static struct target_type mirror_target = { +	.name	 = "mirror", +	.version = {1, 0, 1}, +	.module	 = THIS_MODULE, +	.ctr	 = mirror_ctr, +	.dtr	 = mirror_dtr, +	.map	 = mirror_map, +	.end_io	 = mirror_end_io, +	.postsuspend = mirror_postsuspend, +	.resume	 = mirror_resume, +	.status	 = mirror_status, +}; + +static int __init dm_mirror_init(void) +{ +	int r; + +	r = dm_dirty_log_init(); +	if (r) +		return r; + +	_kmirrord_wq = create_workqueue("kmirrord"); +	if (!_kmirrord_wq) { +		DMERR("couldn't start kmirrord"); +		dm_dirty_log_exit(); +		return r; +	} +	INIT_WORK(&_kmirrord_work, do_work, NULL); + +	r = dm_register_target(&mirror_target); +	if (r < 0) { +		DMERR("%s: Failed to register mirror target", +		      mirror_target.name); +		dm_dirty_log_exit(); +		destroy_workqueue(_kmirrord_wq); +	} + +	return r; +} + +static void __exit dm_mirror_exit(void) +{ +	int r; + +	r = dm_unregister_target(&mirror_target); +	if (r < 0) +		DMERR("%s: unregister failed %d", mirror_target.name, r); + +	destroy_workqueue(_kmirrord_wq); +	dm_dirty_log_exit(); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c new file mode 100644 index 00000000000..d0024865a78 --- /dev/null +++ b/drivers/md/dm-round-robin.c @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2003 Sistina Software. 
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { +	struct list_head list; +	struct path *path; +	unsigned repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ +	struct path_info *pi, *next; + +	list_for_each_entry_safe(pi, next, paths, list) { +		list_del(&pi->list); +		kfree(pi); +	} +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +#define RR_MIN_IO		1000 + +struct selector { +	struct list_head valid_paths; +	struct list_head invalid_paths; +}; + +static struct selector *alloc_selector(void) +{ +	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + +	if (s) { +		INIT_LIST_HEAD(&s->valid_paths); +		INIT_LIST_HEAD(&s->invalid_paths); +	} + +	return s; +} + +static int rr_create(struct path_selector *ps, unsigned argc, char **argv) +{ +	struct selector *s; + +	s = alloc_selector(); +	if (!s) +		return -ENOMEM; + +	ps->context = s; +	return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ +	struct selector *s = (struct selector *) ps->context; + +	free_paths(&s->valid_paths); +	free_paths(&s->invalid_paths); +	kfree(s); +	ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct path *path, +		     status_type_t type, char *result, unsigned int maxlen) +{ +	struct path_info *pi; +	int sz = 0; + +	if (!path) +		DMEMIT("0 "); +	else { +		switch(type) { +		case STATUSTYPE_INFO: +			break; +		case STATUSTYPE_TABLE: +			pi = path->pscontext; +			DMEMIT("%u ", pi->repeat_count); +			break; +		} +	} + +	return sz; +} + +/* + * Called during initialisation to register each path with an + * optional repeat_count. 
+ */ +static int rr_add_path(struct path_selector *ps, struct path *path, +		       int argc, char **argv, char **error) +{ +	struct selector *s = (struct selector *) ps->context; +	struct path_info *pi; +	unsigned repeat_count = RR_MIN_IO; + +	if (argc > 1) { +		*error = "round-robin ps: incorrect number of arguments"; +		return -EINVAL; +	} + +	/* First path argument is number of I/Os before switching path */ +	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { +		*error = "round-robin ps: invalid repeat count"; +		return -EINVAL; +	} + +	/* allocate the path */ +	pi = kmalloc(sizeof(*pi), GFP_KERNEL); +	if (!pi) { +		*error = "round-robin ps: Error allocating path context"; +		return -ENOMEM; +	} + +	pi->path = path; +	pi->repeat_count = repeat_count; + +	path->pscontext = pi; + +	list_add(&pi->list, &s->valid_paths); + +	return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct path *p) +{ +	struct selector *s = (struct selector *) ps->context; +	struct path_info *pi = p->pscontext; + +	list_move(&pi->list, &s->invalid_paths); +} + +static int rr_reinstate_path(struct path_selector *ps, struct path *p) +{ +	struct selector *s = (struct selector *) ps->context; +	struct path_info *pi = p->pscontext; + +	list_move(&pi->list, &s->valid_paths); + +	return 0; +} + +static struct path *rr_select_path(struct path_selector *ps, +				   unsigned *repeat_count) +{ +	struct selector *s = (struct selector *) ps->context; +	struct path_info *pi = NULL; + +	if (!list_empty(&s->valid_paths)) { +		pi = list_entry(s->valid_paths.next, struct path_info, list); +		list_move_tail(&pi->list, &s->valid_paths); +		*repeat_count = pi->repeat_count; +	} + +	return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { +	.name = "round-robin", +	.module = THIS_MODULE, +	.table_args = 1, +	.info_args = 0, +	.create = rr_create, +	.destroy = rr_destroy, +	.status = rr_status, +	.add_path = rr_add_path, +	.fail_path = rr_fail_path, +	.reinstate_path = rr_reinstate_path, +	.select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ +	int r = dm_register_path_selector(&rr_ps); + +	if (r < 0) +		DMERR("round-robin: register failed %d", r); + +	DMINFO("dm-round-robin version 1.0.0 loaded"); + +	return r; +} + +static void __exit dm_rr_exit(void) +{ +	int r = dm_unregister_path_selector(&rr_ps); + +	if (r < 0) +		DMERR("round-robin: unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c new file mode 100644 index 00000000000..7e691ab9a74 --- /dev/null +++ b/drivers/md/dm-snap.c @@ -0,0 +1,1208 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. 
+ */ + +#include <linux/blkdev.h> +#include <linux/config.h> +#include <linux/ctype.h> +#include <linux/device-mapper.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kdev_t.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "dm-snap.h" +#include "dm-bio-list.h" +#include "kcopyd.h" + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * kcopyd priority of snapshot operations + */ +#define SNAPSHOT_COPY_PRIORITY 2 + +/* + * Each snapshot reserves this many pages for io + */ +#define SNAPSHOT_PAGES 256 + +struct pending_exception { +	struct exception e; + +	/* +	 * Origin buffers waiting for this to complete are held +	 * in a bio list +	 */ +	struct bio_list origin_bios; +	struct bio_list snapshot_bios; + +	/* +	 * Other pending_exceptions that are processing this +	 * chunk.  When this list is empty, we know we can +	 * complete the origins. +	 */ +	struct list_head siblings; + +	/* Pointer back to snapshot context */ +	struct dm_snapshot *snap; + +	/* +	 * 1 indicates the exception has already been sent to +	 * kcopyd. +	 */ +	int started; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { +	/* The origin device */ +	struct block_device *bdev; + +	struct list_head hash_list; + +	/* List of snapshots for this origin */ +	struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK      0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static int init_origin_hash(void) +{ +	int i; + +	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), +			   GFP_KERNEL); +	if (!_origins) { +		DMERR("Device mapper: Snapshot: unable to allocate memory"); +		return -ENOMEM; +	} + +	for (i = 0; i < ORIGIN_HASH_SIZE; i++) +		INIT_LIST_HEAD(_origins + i); +	init_rwsem(&_origins_lock); + +	return 0; +} + +static void exit_origin_hash(void) +{ +	kfree(_origins); +} + +static inline unsigned int origin_hash(struct block_device *bdev) +{ +	return bdev->bd_dev & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(struct block_device *origin) +{ +	struct list_head *ol; +	struct origin *o; + +	ol = &_origins[origin_hash(origin)]; +	list_for_each_entry (o, ol, hash_list) +		if (bdev_equal(o->bdev, origin)) +			return o; + +	return NULL; +} + +static void __insert_origin(struct origin *o) +{ +	struct list_head *sl = &_origins[origin_hash(o->bdev)]; +	list_add_tail(&o->hash_list, sl); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. 
+ */ +static int register_snapshot(struct dm_snapshot *snap) +{ +	struct origin *o; +	struct block_device *bdev = snap->origin->bdev; + +	down_write(&_origins_lock); +	o = __lookup_origin(bdev); + +	if (!o) { +		/* New origin */ +		o = kmalloc(sizeof(*o), GFP_KERNEL); +		if (!o) { +			up_write(&_origins_lock); +			return -ENOMEM; +		} + +		/* Initialise the struct */ +		INIT_LIST_HEAD(&o->snapshots); +		o->bdev = bdev; + +		__insert_origin(o); +	} + +	list_add_tail(&snap->list, &o->snapshots); + +	up_write(&_origins_lock); +	return 0; +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ +	struct origin *o; + +	down_write(&_origins_lock); +	o = __lookup_origin(s->origin->bdev); + +	list_del(&s->list); +	if (list_empty(&o->snapshots)) { +		list_del(&o->hash_list); +		kfree(o); +	} + +	up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. + */ +static int init_exception_table(struct exception_table *et, uint32_t size) +{ +	unsigned int i; + +	et->hash_mask = size - 1; +	et->table = dm_vcalloc(size, sizeof(struct list_head)); +	if (!et->table) +		return -ENOMEM; + +	for (i = 0; i < size; i++) +		INIT_LIST_HEAD(et->table + i); + +	return 0; +} + +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ +	struct list_head *slot; +	struct exception *ex, *next; +	int i, size; + +	size = et->hash_mask + 1; +	for (i = 0; i < size; i++) { +		slot = et->table + i; + +		list_for_each_entry_safe (ex, next, slot, hash_list) +			kmem_cache_free(mem, ex); +	} + +	vfree(et->table); +} + +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +{ +	return chunk & et->hash_mask; +} + +static void insert_exception(struct exception_table *eh, struct exception *e) +{ +	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; +	list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ +	list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct exception *lookup_exception(struct exception_table *et, +					  chunk_t chunk) +{ +	struct list_head *slot; +	struct exception *e; + +	slot = &et->table[exception_hash(et, chunk)]; +	list_for_each_entry (e, slot, hash_list) +		if (e->old_chunk == chunk) +			return e; + +	return NULL; +} + +static inline struct exception *alloc_exception(void) +{ +	struct exception *e; + +	e = kmem_cache_alloc(exception_cache, GFP_NOIO); +	if (!e) +		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + +	return e; +} + +static inline void free_exception(struct exception *e) +{ +	kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ +	return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ +	mempool_free(pe, pending_pool); +} + +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +{ +	struct exception *e; + +	e = alloc_exception(); +	if (!e) +		return -ENOMEM; + +	e->old_chunk = old; +	e->new_chunk = new; +	insert_exception(&s->complete, e); +	return 0; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ +	/* use a fixed size of 2MB */ +	unsigned long mem = 2 * 1024 * 1024; +	mem /= sizeof(struct list_head); + +	return mem; +} + +/* + * Rounds a number down to a power of 2. 
+ */ +static inline uint32_t round_down(uint32_t n) +{ +	while (n & (n - 1)) +		n &= (n - 1); +	return n; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ +	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + +	/* +	 * Calculate based on the size of the original volume or +	 * the COW volume... +	 */ +	cow_dev_size = get_dev_size(s->cow->bdev); +	origin_dev_size = get_dev_size(s->origin->bdev); +	max_buckets = calc_max_buckets(); + +	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; +	hash_size = min(hash_size, max_buckets); + +	/* Round it down to a power of 2 */ +	hash_size = round_down(hash_size); +	if (init_exception_table(&s->complete, hash_size)) +		return -ENOMEM; + +	/* +	 * Allocate hash table for in-flight exceptions +	 * Make this smaller than the real hash table +	 */ +	hash_size >>= 3; +	if (hash_size < 64) +		hash_size = 64; + +	if (init_exception_table(&s->pending, hash_size)) { +		exit_exception_table(&s->complete, exception_cache); +		return -ENOMEM; +	} + +	return 0; +} + +/* + * Round a number up to the nearest 'size' boundary.  size must + * be a power of 2. + */ +static inline ulong round_up(ulong n, ulong size) +{ +	size--; +	return (n + size) & ~size; +} + +/* + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> + */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	struct dm_snapshot *s; +	unsigned long chunk_size; +	int r = -EINVAL; +	char persistent; +	char *origin_path; +	char *cow_path; +	char *value; +	int blocksize; + +	if (argc < 4) { +		ti->error = "dm-snapshot: requires exactly 4 arguments"; +		r = -EINVAL; +		goto bad1; +	} + +	origin_path = argv[0]; +	cow_path = argv[1]; +	persistent = toupper(*argv[2]); + +	if (persistent != 'P' && persistent != 'N') { +		ti->error = "Persistent flag is not P or N"; +		r = -EINVAL; +		goto bad1; +	} + +	chunk_size = simple_strtoul(argv[3], &value, 10); +	if (chunk_size == 0 || value == NULL) { +		ti->error = "Invalid chunk size"; +		r = -EINVAL; +		goto bad1; +	} + +	s = kmalloc(sizeof(*s), GFP_KERNEL); +	if (s == NULL) { +		ti->error = "Cannot allocate snapshot context private " +		    "structure"; +		r = -ENOMEM; +		goto bad1; +	} + +	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); +	if (r) { +		ti->error = "Cannot get origin device"; +		goto bad2; +	} + +	r = dm_get_device(ti, cow_path, 0, 0, +			  FMODE_READ | FMODE_WRITE, &s->cow); +	if (r) { +		dm_put_device(ti, s->origin); +		ti->error = "Cannot get COW device"; +		goto bad2; +	} + +	/* +	 * Chunk size must be multiple of page size.  Silently +	 * round up if it's not. 
+	 */ +	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); + +	/* Validate the chunk size against the device block size */ +	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; +	if (chunk_size % (blocksize >> 9)) { +		ti->error = "Chunk size is not a multiple of device blocksize"; +		r = -EINVAL; +		goto bad3; +	} + +	/* Check chunk_size is a power of 2 */ +	if (chunk_size & (chunk_size - 1)) { +		ti->error = "Chunk size is not a power of 2"; +		r = -EINVAL; +		goto bad3; +	} + +	s->chunk_size = chunk_size; +	s->chunk_mask = chunk_size - 1; +	s->type = persistent; +	s->chunk_shift = ffs(chunk_size) - 1; + +	s->valid = 1; +	s->have_metadata = 0; +	s->last_percent = 0; +	init_rwsem(&s->lock); +	s->table = ti->table; + +	/* Allocate hash table for COW data */ +	if (init_hash_tables(s)) { +		ti->error = "Unable to allocate hash table space"; +		r = -ENOMEM; +		goto bad3; +	} + +	/* +	 * Check the persistent flag - done here because we need the iobuf +	 * to check the LV header +	 */ +	s->store.snap = s; + +	if (persistent == 'P') +		r = dm_create_persistent(&s->store, chunk_size); +	else +		r = dm_create_transient(&s->store, s, blocksize); + +	if (r) { +		ti->error = "Couldn't create exception store"; +		r = -EINVAL; +		goto bad4; +	} + +	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); +	if (r) { +		ti->error = "Could not create kcopyd client"; +		goto bad5; +	} + +	/* Add snapshot to the list of snapshots for this origin */ +	if (register_snapshot(s)) { +		r = -EINVAL; +		ti->error = "Cannot register snapshot origin"; +		goto bad6; +	} + +	ti->private = s; +	ti->split_io = chunk_size; + +	return 0; + + bad6: +	kcopyd_client_destroy(s->kcopyd_client); + + bad5: +	s->store.destroy(&s->store); + + bad4: +	exit_exception_table(&s->pending, pending_cache); +	exit_exception_table(&s->complete, exception_cache); + + bad3: +	dm_put_device(ti, s->cow); +	dm_put_device(ti, s->origin); + + bad2: +	kfree(s); + + bad1: +	return r; +} + +static void snapshot_dtr(struct dm_target *ti) +{ +	struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + +	unregister_snapshot(s); + +	exit_exception_table(&s->pending, pending_cache); +	exit_exception_table(&s->complete, exception_cache); + +	/* Deallocate memory used */ +	s->store.destroy(&s->store); + +	dm_put_device(ti, s->origin); +	dm_put_device(ti, s->cow); +	kcopyd_client_destroy(s->kcopyd_client); +	kfree(s); +} + +/* + * Flush a list of buffers. + */ +static void flush_bios(struct bio *bio) +{ +	struct bio *n; + +	while (bio) { +		n = bio->bi_next; +		bio->bi_next = NULL; +		generic_make_request(bio); +		bio = n; +	} +} + +/* + * Error a list of buffers. + */ +static void error_bios(struct bio *bio) +{ +	struct bio *n; + +	while (bio) { +		n = bio->bi_next; +		bio->bi_next = NULL; +		bio_io_error(bio, bio->bi_size); +		bio = n; +	} +} + +static struct bio *__flush_bios(struct pending_exception *pe) +{ +	struct pending_exception *sibling; + +	if (list_empty(&pe->siblings)) +		return bio_list_get(&pe->origin_bios); + +	sibling = list_entry(pe->siblings.next, +			     struct pending_exception, siblings); + +	list_del(&pe->siblings); + +	/* This is fine as long as kcopyd is single-threaded. If kcopyd +	 * becomes multi-threaded, we'll need some locking here. 
+	 */ +	bio_list_merge(&sibling->origin_bios, &pe->origin_bios); + +	return NULL; +} + +static void pending_complete(struct pending_exception *pe, int success) +{ +	struct exception *e; +	struct dm_snapshot *s = pe->snap; +	struct bio *flush = NULL; + +	if (success) { +		e = alloc_exception(); +		if (!e) { +			DMWARN("Unable to allocate exception."); +			down_write(&s->lock); +			s->store.drop_snapshot(&s->store); +			s->valid = 0; +			flush = __flush_bios(pe); +			up_write(&s->lock); + +			error_bios(bio_list_get(&pe->snapshot_bios)); +			goto out; +		} +		*e = pe->e; + +		/* +		 * Add a proper exception, and remove the +		 * in-flight exception from the list. +		 */ +		down_write(&s->lock); +		insert_exception(&s->complete, e); +		remove_exception(&pe->e); +		flush = __flush_bios(pe); + +		/* Submit any pending write bios */ +		up_write(&s->lock); + +		flush_bios(bio_list_get(&pe->snapshot_bios)); +	} else { +		/* Read/write error - snapshot is unusable */ +		down_write(&s->lock); +		if (s->valid) +			DMERR("Error reading/writing snapshot"); +		s->store.drop_snapshot(&s->store); +		s->valid = 0; +		remove_exception(&pe->e); +		flush = __flush_bios(pe); +		up_write(&s->lock); + +		error_bios(bio_list_get(&pe->snapshot_bios)); + +		dm_table_event(s->table); +	} + + out: +	free_pending_exception(pe); + +	if (flush) +		flush_bios(flush); +} + +static void commit_callback(void *context, int success) +{ +	struct pending_exception *pe = (struct pending_exception *) context; +	pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished.  kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned int write_err, void *context) +{ +	struct pending_exception *pe = (struct pending_exception *) context; +	struct dm_snapshot *s = pe->snap; + +	if (read_err || write_err) +		pending_complete(pe, 0); + +	else +		/* Update the metadata if we are persistent */ +		s->store.commit_exception(&s->store, &pe->e, commit_callback, +					  pe); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static inline void start_copy(struct pending_exception *pe) +{ +	struct dm_snapshot *s = pe->snap; +	struct io_region src, dest; +	struct block_device *bdev = s->origin->bdev; +	sector_t dev_size; + +	dev_size = get_dev_size(bdev); + +	src.bdev = bdev; +	src.sector = chunk_to_sector(s, pe->e.old_chunk); +	src.count = min(s->chunk_size, dev_size - src.sector); + +	dest.bdev = s->cow->bdev; +	dest.sector = chunk_to_sector(s, pe->e.new_chunk); +	dest.count = src.count; + +	/* Hand over to kcopyd */ +	kcopyd_copy(s->kcopyd_client, +		    &src, 1, &dest, 0, copy_callback, pe); +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on snap->lock before calling + * this. + */ +static struct pending_exception * +__find_pending_exception(struct dm_snapshot *s, struct bio *bio) +{ +	struct exception *e; +	struct pending_exception *pe; +	chunk_t chunk = sector_to_chunk(s, bio->bi_sector); + +	/* +	 * Is there a pending exception for this already ? +	 */ +	e = lookup_exception(&s->pending, chunk); +	if (e) { +		/* cast the exception to a pending exception */ +		pe = container_of(e, struct pending_exception, e); + +	} else { +		/* +		 * Create a new pending exception, we don't want +		 * to hold the lock while we do this. 
+		 */ +		up_write(&s->lock); +		pe = alloc_pending_exception(); +		down_write(&s->lock); + +		e = lookup_exception(&s->pending, chunk); +		if (e) { +			free_pending_exception(pe); +			pe = container_of(e, struct pending_exception, e); +		} else { +			pe->e.old_chunk = chunk; +			bio_list_init(&pe->origin_bios); +			bio_list_init(&pe->snapshot_bios); +			INIT_LIST_HEAD(&pe->siblings); +			pe->snap = s; +			pe->started = 0; + +			if (s->store.prepare_exception(&s->store, &pe->e)) { +				free_pending_exception(pe); +				s->valid = 0; +				return NULL; +			} + +			insert_exception(&s->pending, &pe->e); +		} +	} + +	return pe; +} + +static inline void remap_exception(struct dm_snapshot *s, struct exception *e, +				   struct bio *bio) +{ +	bio->bi_bdev = s->cow->bdev; +	bio->bi_sector = chunk_to_sector(s, e->new_chunk) + +		(bio->bi_sector & s->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct bio *bio, +			union map_info *map_context) +{ +	struct exception *e; +	struct dm_snapshot *s = (struct dm_snapshot *) ti->private; +	int r = 1; +	chunk_t chunk; +	struct pending_exception *pe; + +	chunk = sector_to_chunk(s, bio->bi_sector); + +	/* Full snapshots are not usable */ +	if (!s->valid) +		return -1; + +	/* +	 * Write to snapshot - higher level takes care of RW/RO +	 * flags so we should only get this if we are +	 * writeable. +	 */ +	if (bio_rw(bio) == WRITE) { + +		/* FIXME: should only take write lock if we need +		 * to copy an exception */ +		down_write(&s->lock); + +		/* If the block is already remapped - use that, else remap it */ +		e = lookup_exception(&s->complete, chunk); +		if (e) { +			remap_exception(s, e, bio); +			up_write(&s->lock); + +		} else { +			pe = __find_pending_exception(s, bio); + +			if (!pe) { +				if (s->store.drop_snapshot) +					s->store.drop_snapshot(&s->store); +				s->valid = 0; +				r = -EIO; +				up_write(&s->lock); +			} else { +				remap_exception(s, &pe->e, bio); +				bio_list_add(&pe->snapshot_bios, bio); + +				if (!pe->started) { +					/* this is protected by snap->lock */ +					pe->started = 1; +					up_write(&s->lock); +					start_copy(pe); +				} else +					up_write(&s->lock); +				r = 0; +			} +		} + +	} else { +		/* +		 * FIXME: this read path scares me because we +		 * always use the origin when we have a pending +		 * exception.  However I can't think of a +		 * situation where this is wrong - ejt. 
+		 */ + +		/* Do reads */ +		down_read(&s->lock); + +		/* See if it it has been remapped */ +		e = lookup_exception(&s->complete, chunk); +		if (e) +			remap_exception(s, e, bio); +		else +			bio->bi_bdev = s->origin->bdev; + +		up_read(&s->lock); +	} + +	return r; +} + +static void snapshot_resume(struct dm_target *ti) +{ +	struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + +	if (s->have_metadata) +		return; + +	if (s->store.read_metadata(&s->store)) { +		down_write(&s->lock); +		s->valid = 0; +		up_write(&s->lock); +	} + +	s->have_metadata = 1; +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, +			   char *result, unsigned int maxlen) +{ +	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; + +	switch (type) { +	case STATUSTYPE_INFO: +		if (!snap->valid) +			snprintf(result, maxlen, "Invalid"); +		else { +			if (snap->store.fraction_full) { +				sector_t numerator, denominator; +				snap->store.fraction_full(&snap->store, +							  &numerator, +							  &denominator); +				snprintf(result, maxlen, +					 SECTOR_FORMAT "/" SECTOR_FORMAT, +					 numerator, denominator); +			} +			else +				snprintf(result, maxlen, "Unknown"); +		} +		break; + +	case STATUSTYPE_TABLE: +		/* +		 * kdevname returns a static pointer so we need +		 * to make private copies if the output is to +		 * make sense. +		 */ +		snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, +			 snap->origin->name, snap->cow->name, +			 snap->type, snap->chunk_size); +		break; +	} + +	return 0; +} + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ +static void list_merge(struct list_head *l1, struct list_head *l2) +{ +	struct list_head *l1_n, *l2_p; + +	l1_n = l1->next; +	l2_p = l2->prev; + +	l1->next = l2; +	l2->prev = l1; + +	l2_p->next = l1_n; +	l1_n->prev = l2_p; +} + +static int __origin_write(struct list_head *snapshots, struct bio *bio) +{ +	int r = 1, first = 1; +	struct dm_snapshot *snap; +	struct exception *e; +	struct pending_exception *pe, *last = NULL; +	chunk_t chunk; + +	/* Do all the snapshots on this origin */ +	list_for_each_entry (snap, snapshots, list) { + +		/* Only deal with valid snapshots */ +		if (!snap->valid) +			continue; + +		down_write(&snap->lock); + +		/* +		 * Remember, different snapshots can have +		 * different chunk sizes. +		 */ +		chunk = sector_to_chunk(snap, bio->bi_sector); + +		/* +		 * Check exception table to see if block +		 * is already remapped in this snapshot +		 * and trigger an exception if not. +		 */ +		e = lookup_exception(&snap->complete, chunk); +		if (!e) { +			pe = __find_pending_exception(snap, bio); +			if (!pe) { +				snap->store.drop_snapshot(&snap->store); +				snap->valid = 0; + +			} else { +				if (last) +					list_merge(&pe->siblings, +						   &last->siblings); + +				last = pe; +				r = 0; +			} +		} + +		up_write(&snap->lock); +	} + +	/* +	 * Now that we have a complete pe list we can start the copying. +	 */ +	if (last) { +		pe = last; +		do { +			down_write(&pe->snap->lock); +			if (first) +				bio_list_add(&pe->origin_bios, bio); +			if (!pe->started) { +				pe->started = 1; +				up_write(&pe->snap->lock); +				start_copy(pe); +			} else +				up_write(&pe->snap->lock); +			first = 0; +			pe = list_entry(pe->siblings.next, +					struct pending_exception, siblings); + +		} while (pe != last); +	} + +	return r; +} + +/* + * Called on a write from the origin driver. 
+ */ +static int do_origin(struct dm_dev *origin, struct bio *bio) +{ +	struct origin *o; +	int r = 1; + +	down_read(&_origins_lock); +	o = __lookup_origin(origin->bdev); +	if (o) +		r = __origin_write(&o->snapshots, bio); +	up_read(&_origins_lock); + +	return r; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: <dev_path> + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. + */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	int r; +	struct dm_dev *dev; + +	if (argc != 1) { +		ti->error = "dm-origin: incorrect number of arguments"; +		return -EINVAL; +	} + +	r = dm_get_device(ti, argv[0], 0, ti->len, +			  dm_table_get_mode(ti->table), &dev); +	if (r) { +		ti->error = "Cannot get target device"; +		return r; +	} + +	ti->private = dev; +	return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ +	struct dm_dev *dev = (struct dm_dev *) ti->private; +	dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct bio *bio, +		      union map_info *map_context) +{ +	struct dm_dev *dev = (struct dm_dev *) ti->private; +	bio->bi_bdev = dev->bdev; + +	/* Only tell snapshots if this is a write */ +	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; +} + +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +/* + * Set the target "split_io" field to the minimum of all the snapshots' + * chunk sizes. + */ +static void origin_resume(struct dm_target *ti) +{ +	struct dm_dev *dev = (struct dm_dev *) ti->private; +	struct dm_snapshot *snap; +	struct origin *o; +	chunk_t chunk_size = 0; + +	down_read(&_origins_lock); +	o = __lookup_origin(dev->bdev); +	if (o) +		list_for_each_entry (snap, &o->snapshots, list) +			chunk_size = min_not_zero(chunk_size, snap->chunk_size); +	up_read(&_origins_lock); + +	ti->split_io = chunk_size; +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, +			 unsigned int maxlen) +{ +	struct dm_dev *dev = (struct dm_dev *) ti->private; + +	switch (type) { +	case STATUSTYPE_INFO: +		result[0] = '\0'; +		break; + +	case STATUSTYPE_TABLE: +		snprintf(result, maxlen, "%s", dev->name); +		break; +	} + +	return 0; +} + +static struct target_type origin_target = { +	.name    = "snapshot-origin", +	.version = {1, 0, 1}, +	.module  = THIS_MODULE, +	.ctr     = origin_ctr, +	.dtr     = origin_dtr, +	.map     = origin_map, +	.resume  = origin_resume, +	.status  = origin_status, +}; + +static struct target_type snapshot_target = { +	.name    = "snapshot", +	.version = {1, 0, 1}, +	.module  = THIS_MODULE, +	.ctr     = snapshot_ctr, +	.dtr     = snapshot_dtr, +	.map     = snapshot_map, +	.resume  = snapshot_resume, +	.status  = snapshot_status, +}; + +static int __init dm_snapshot_init(void) +{ +	int r; + +	r = dm_register_target(&snapshot_target); +	if (r) { +		DMERR("snapshot target register failed %d", r); +		return r; +	} + +	r = dm_register_target(&origin_target); +	if (r < 0) { +		DMERR("Device mapper: Origin: register failed %d\n", r); +		goto bad1; +	} + +	r = init_origin_hash(); +	if (r) { +		DMERR("init_origin_hash failed."); +		goto bad2; +	} + +	exception_cache = kmem_cache_create("dm-snapshot-ex", +					    sizeof(struct exception), +					    __alignof__(struct exception), +					    0, NULL, NULL); +	if (!exception_cache) { +		DMERR("Couldn't create exception cache."); +		r = -ENOMEM; +		goto bad3; +	} + +	pending_cache = +	    
kmem_cache_create("dm-snapshot-in", +			      sizeof(struct pending_exception), +			      __alignof__(struct pending_exception), +			      0, NULL, NULL); +	if (!pending_cache) { +		DMERR("Couldn't create pending cache."); +		r = -ENOMEM; +		goto bad4; +	} + +	pending_pool = mempool_create(128, mempool_alloc_slab, +				      mempool_free_slab, pending_cache); +	if (!pending_pool) { +		DMERR("Couldn't create pending pool."); +		r = -ENOMEM; +		goto bad5; +	} + +	return 0; + +      bad5: +	kmem_cache_destroy(pending_cache); +      bad4: +	kmem_cache_destroy(exception_cache); +      bad3: +	exit_origin_hash(); +      bad2: +	dm_unregister_target(&origin_target); +      bad1: +	dm_unregister_target(&snapshot_target); +	return r; +} + +static void __exit dm_snapshot_exit(void) +{ +	int r; + +	r = dm_unregister_target(&snapshot_target); +	if (r) +		DMERR("snapshot unregister failed %d", r); + +	r = dm_unregister_target(&origin_target); +	if (r) +		DMERR("origin unregister failed %d", r); + +	exit_origin_hash(); +	mempool_destroy(pending_pool); +	kmem_cache_destroy(pending_cache); +	kmem_cache_destroy(exception_cache); +} + +/* Module hooks */ +module_init(dm_snapshot_init); +module_exit(dm_snapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " snapshot target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h new file mode 100644 index 00000000000..375aa24d4d7 --- /dev/null +++ b/drivers/md/dm-snap.h @@ -0,0 +1,161 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_SNAPSHOT_H +#define DM_SNAPSHOT_H + +#include "dm.h" +#include <linux/blkdev.h> + +struct exception_table { +	uint32_t hash_mask; +	struct list_head *table; +}; + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +/* FIXME: can we get away with limiting these to a uint32_t ? */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + */ +struct exception { +	struct list_head hash_list; + +	chunk_t old_chunk; +	chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct exception_store { + +	/* +	 * Destroys this object when you've finished with it. +	 */ +	void (*destroy) (struct exception_store *store); + +	/* +	 * The target shouldn't read the COW device until this is +	 * called. +	 */ +	int (*read_metadata) (struct exception_store *store); + +	/* +	 * Find somewhere to store the next exception. +	 */ +	int (*prepare_exception) (struct exception_store *store, +				  struct exception *e); + +	/* +	 * Update the metadata with this exception. +	 */ +	void (*commit_exception) (struct exception_store *store, +				  struct exception *e, +				  void (*callback) (void *, int success), +				  void *callback_context); + +	/* +	 * The snapshot is invalid, note this in the metadata. +	 */ +	void (*drop_snapshot) (struct exception_store *store); + +	/* +	 * Return how full the snapshot is. 
+	 */ +	void (*fraction_full) (struct exception_store *store, +			       sector_t *numerator, +			       sector_t *denominator); + +	struct dm_snapshot *snap; +	void *context; +}; + +struct dm_snapshot { +	struct rw_semaphore lock; +	struct dm_table *table; + +	struct dm_dev *origin; +	struct dm_dev *cow; + +	/* List of snapshots per Origin */ +	struct list_head list; + +	/* Size of data blocks saved - must be a power of 2 */ +	chunk_t chunk_size; +	chunk_t chunk_mask; +	chunk_t chunk_shift; + +	/* You can't use a snapshot if this is 0 (e.g. if full) */ +	int valid; +	int have_metadata; + +	/* Used for display of table */ +	char type; + +	/* The last percentage we notified */ +	int last_percent; + +	struct exception_table pending; +	struct exception_table complete; + +	/* The on disk metadata handler */ +	struct exception_store store; + +	struct kcopyd_client *kcopyd_client; +}; + +/* + * Used by the exception stores to load exceptions when + * initialising. + */ +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); + +/* + * Constructor and destructor for the default persistent + * store. + */ +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); + +int dm_create_transient(struct exception_store *store, +			struct dm_snapshot *s, int blocksize); + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ +	return bdev->bd_inode->i_size >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) +{ +	return (sector & ~s->chunk_mask) >> s->chunk_shift; +} + +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) +{ +	return chunk << s->chunk_shift; +} + +static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ +	/* +	 * There is only ever one instance of a particular block +	 * device so we can compare pointers safely. +	 */ +	return lhs == rhs; +} + +#endif diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c new file mode 100644 index 00000000000..ab89278a56b --- /dev/null +++ b/drivers/md/dm-stripe.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +struct stripe { +	struct dm_dev *dev; +	sector_t physical_start; +}; + +struct stripe_c { +	uint32_t stripes; + +	/* The size of this target / num. stripes */ +	sector_t stripe_width; + +	/* stripe chunk size */ +	uint32_t chunk_shift; +	sector_t chunk_mask; + +	struct stripe stripe[0]; +}; + +static inline struct stripe_c *alloc_context(unsigned int stripes) +{ +	size_t len; + +	if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), +			  stripes)) +		return NULL; + +	len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + +	return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single <dev> <sector> pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, +		      unsigned int stripe, char **argv) +{ +	sector_t start; + +	if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) +		return -EINVAL; + +	if (dm_get_device(ti, argv[0], start, sc->stripe_width, +			  dm_table_get_mode(ti->table), +			  &sc->stripe[stripe].dev)) +		return -ENXIO; + +	sc->stripe[stripe].physical_start = start; +	return 0; +} + +/* + * Construct a striped mapping.
+ * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	struct stripe_c *sc; +	sector_t width; +	uint32_t stripes; +	uint32_t chunk_size; +	char *end; +	int r; +	unsigned int i; + +	if (argc < 2) { +		ti->error = "dm-stripe: Not enough arguments"; +		return -EINVAL; +	} + +	stripes = simple_strtoul(argv[0], &end, 10); +	if (*end) { +		ti->error = "dm-stripe: Invalid stripe count"; +		return -EINVAL; +	} + +	chunk_size = simple_strtoul(argv[1], &end, 10); +	if (*end) { +		ti->error = "dm-stripe: Invalid chunk_size"; +		return -EINVAL; +	} + +	/* +	 * chunk_size is a power of two +	 */ +	if (!chunk_size || (chunk_size & (chunk_size - 1)) || +	    (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { +		ti->error = "dm-stripe: Invalid chunk size"; +		return -EINVAL; +	} + +	width = ti->len; +	if (sector_div(width, stripes)) { +		ti->error = "dm-stripe: Target length not divisable by " +		    "number of stripes"; +		return -EINVAL; +	} + +	/* +	 * Do we have enough arguments for that many stripes ? +	 */ +	if (argc != (2 + 2 * stripes)) { +		ti->error = "dm-stripe: Not enough destinations " +			"specified"; +		return -EINVAL; +	} + +	sc = alloc_context(stripes); +	if (!sc) { +		ti->error = "dm-stripe: Memory allocation for striped context " +		    "failed"; +		return -ENOMEM; +	} + +	sc->stripes = stripes; +	sc->stripe_width = width; +	ti->split_io = chunk_size; + +	sc->chunk_mask = ((sector_t) chunk_size) - 1; +	for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) +		chunk_size >>= 1; +	sc->chunk_shift--; + +	/* +	 * Get the stripe destinations. +	 */ +	for (i = 0; i < stripes; i++) { +		argv += 2; + +		r = get_stripe(ti, sc, i, argv); +		if (r < 0) { +			ti->error = "dm-stripe: Couldn't parse stripe " +				"destination"; +			while (i--) +				dm_put_device(ti, sc->stripe[i].dev); +			kfree(sc); +			return r; +		} +	} + +	ti->private = sc; +	return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ +	unsigned int i; +	struct stripe_c *sc = (struct stripe_c *) ti->private; + +	for (i = 0; i < sc->stripes; i++) +		dm_put_device(ti, sc->stripe[i].dev); + +	kfree(sc); +} + +static int stripe_map(struct dm_target *ti, struct bio *bio, +		      union map_info *map_context) +{ +	struct stripe_c *sc = (struct stripe_c *) ti->private; + +	sector_t offset = bio->bi_sector - ti->begin; +	sector_t chunk = offset >> sc->chunk_shift; +	uint32_t stripe = sector_div(chunk, sc->stripes); + +	bio->bi_bdev = sc->stripe[stripe].dev->bdev; +	bio->bi_sector = sc->stripe[stripe].physical_start + +	    (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); +	return 1; +} + +static int stripe_status(struct dm_target *ti, +			 status_type_t type, char *result, unsigned int maxlen) +{ +	struct stripe_c *sc = (struct stripe_c *) ti->private; +	unsigned int sz = 0; +	unsigned int i; + +	switch (type) { +	case STATUSTYPE_INFO: +		result[0] = '\0'; +		break; + +	case STATUSTYPE_TABLE: +		DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); +		for (i = 0; i < sc->stripes; i++) +			DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, +			       sc->stripe[i].physical_start); +		break; +	} +	return 0; +} + +static struct target_type stripe_target = { +	.name   = "striped", +	.version= {1, 0, 2}, +	.module = THIS_MODULE, +	.ctr    = stripe_ctr, +	.dtr    = stripe_dtr, +	.map    = stripe_map, +	.status = stripe_status, +}; + +int __init dm_stripe_init(void) +{ +	int r; + +	r = dm_register_target(&stripe_target); +	if (r 
< 0) +		DMWARN("striped target registration failed"); + +	return r; +} + +void dm_stripe_exit(void) +{ +	if (dm_unregister_target(&stripe_target)) +		DMWARN("striped target unregistration failed"); + +	return; +} diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 00000000000..ee175d4906c --- /dev/null +++ b/drivers/md/dm-table.c @@ -0,0 +1,950 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <asm/atomic.h> + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { +	atomic_t holders; + +	/* btree table */ +	unsigned int depth; +	unsigned int counts[MAX_DEPTH];	/* in nodes */ +	sector_t *index[MAX_DEPTH]; + +	unsigned int num_targets; +	unsigned int num_allocated; +	sector_t *highs; +	struct dm_target *targets; + +	/* +	 * Indicates the rw permissions for the new logical +	 * device.  This should be a combination of FMODE_READ +	 * and FMODE_WRITE. +	 */ +	int mode; + +	/* a list of devices used by this table */ +	struct list_head devices; + +	/* +	 * These are optimistic limits taken from all the +	 * targets, some targets will need smaller limits. +	 */ +	struct io_restrictions limits; + +	/* events get handed up using this callback */ +	void (*event_fn)(void *); +	void *event_context; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned int n, unsigned int base) +{ +	int result = 0; + +	while (n > 1) { +		n = dm_div_up(n, base); +		result++; +	} + +	return result; +} + +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +/* + * Combine two io_restrictions, always taking the lower value. + */ +static void combine_restrictions_low(struct io_restrictions *lhs, +				     struct io_restrictions *rhs) +{ +	lhs->max_sectors = +		min_not_zero(lhs->max_sectors, rhs->max_sectors); + +	lhs->max_phys_segments = +		min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); + +	lhs->max_hw_segments = +		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); + +	lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); + +	lhs->max_segment_size = +		min_not_zero(lhs->max_segment_size, rhs->max_segment_size); + +	lhs->seg_boundary_mask = +		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ +	return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, +				 unsigned int l, unsigned int n) +{ +	return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. 
+ */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ +	for (; l < t->depth - 1; l++) +		n = get_child(n, CHILDREN_PER_NODE - 1); + +	if (n >= t->counts[l]) +		return (sector_t) - 1; + +	return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ +	unsigned int n, k; +	sector_t *node; + +	for (n = 0U; n < t->counts[l]; n++) { +		node = get_node(t, l, n); + +		for (k = 0U; k < KEYS_PER_NODE; k++) +			node[k] = high(t, l + 1, get_child(n, k)); +	} + +	return 0; +} + +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) +{ +	unsigned long size; +	void *addr; + +	/* +	 * Check that we're not going to overflow. +	 */ +	if (nmemb > (ULONG_MAX / elem_size)) +		return NULL; + +	size = nmemb * elem_size; +	addr = vmalloc(size); +	if (addr) +		memset(addr, 0, size); + +	return addr; +} + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, unsigned int num) +{ +	sector_t *n_highs; +	struct dm_target *n_targets; +	int n = t->num_targets; + +	/* +	 * Allocate both the target array and offset array at once. +	 */ +	n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + +					  sizeof(sector_t)); +	if (!n_highs) +		return -ENOMEM; + +	n_targets = (struct dm_target *) (n_highs + num); + +	if (n) { +		memcpy(n_highs, t->highs, sizeof(*n_highs) * n); +		memcpy(n_targets, t->targets, sizeof(*n_targets) * n); +	} + +	memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); +	vfree(t->highs); + +	t->num_allocated = num; +	t->highs = n_highs; +	t->targets = n_targets; + +	return 0; +} + +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) +{ +	struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + +	if (!t) +		return -ENOMEM; + +	memset(t, 0, sizeof(*t)); +	INIT_LIST_HEAD(&t->devices); +	atomic_set(&t->holders, 1); + +	if (!num_targets) +		num_targets = KEYS_PER_NODE; + +	num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + +	if (alloc_targets(t, num_targets)) { +		kfree(t); +		t = NULL; +		return -ENOMEM; +	} + +	t->mode = mode; +	*result = t; +	return 0; +} + +static void free_devices(struct list_head *devices) +{ +	struct list_head *tmp, *next; + +	for (tmp = devices->next; tmp != devices; tmp = next) { +		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); +		next = tmp->next; +		kfree(dd); +	} +} + +void table_destroy(struct dm_table *t) +{ +	unsigned int i; + +	/* free the indexes (see dm_table_complete) */ +	if (t->depth >= 2) +		vfree(t->index[t->depth - 2]); + +	/* free the targets */ +	for (i = 0; i < t->num_targets; i++) { +		struct dm_target *tgt = t->targets + i; + +		if (tgt->type->dtr) +			tgt->type->dtr(tgt); + +		dm_put_target_type(tgt->type); +	} + +	vfree(t->highs); + +	/* free the device list */ +	if (t->devices.next != &t->devices) { +		DMWARN("devices still present during destroy: " +		       "dm_table_remove_device calls missing"); + +		free_devices(&t->devices); +	} + +	kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ +	atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ +	if (!t) +		return; + +	if (atomic_dec_and_test(&t->holders)) +		table_destroy(t); +} + +/* + * Checks to see if we need to extend highs or targets. 
+ */ +static inline int check_space(struct dm_table *t) +{ +	if (t->num_targets >= t->num_allocated) +		return alloc_targets(t, t->num_allocated * 2); + +	return 0; +} + +/* + * Convert a device path to a dev_t. + */ +static int lookup_device(const char *path, dev_t *dev) +{ +	int r; +	struct nameidata nd; +	struct inode *inode; + +	if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) +		return r; + +	inode = nd.dentry->d_inode; +	if (!inode) { +		r = -ENOENT; +		goto out; +	} + +	if (!S_ISBLK(inode->i_mode)) { +		r = -ENOTBLK; +		goto out; +	} + +	*dev = inode->i_rdev; + + out: +	path_release(&nd); +	return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, dev_t dev) +{ +	struct dm_dev *dd; + +	list_for_each_entry (dd, l, list) +		if (dd->bdev->bd_dev == dev) +			return dd; + +	return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *d, dev_t dev) +{ +	static char *_claim_ptr = "I belong to device-mapper"; +	struct block_device *bdev; + +	int r; + +	if (d->bdev) +		BUG(); + +	bdev = open_by_devnum(dev, d->mode); +	if (IS_ERR(bdev)) +		return PTR_ERR(bdev); +	r = bd_claim(bdev, _claim_ptr); +	if (r) +		blkdev_put(bdev); +	else +		d->bdev = bdev; +	return r; +} + +/* + * Close a device that we've been using. + */ +static void close_dev(struct dm_dev *d) +{ +	if (!d->bdev) +		return; + +	bd_release(d->bdev); +	blkdev_put(d->bdev); +	d->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. + */ +static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) +{ +	sector_t dev_size; +	dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; +	return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev.  Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ +	int r; +	struct dm_dev dd_copy; +	dev_t dev = dd->bdev->bd_dev; + +	dd_copy = *dd; + +	dd->mode |= new_mode; +	dd->bdev = NULL; +	r = open_dev(dd, dev); +	if (!r) +		close_dev(&dd_copy); +	else +		*dd = dd_copy; + +	return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. 
+ */ +static int __table_get_device(struct dm_table *t, struct dm_target *ti, +			      const char *path, sector_t start, sector_t len, +			      int mode, struct dm_dev **result) +{ +	int r; +	dev_t dev; +	struct dm_dev *dd; +	unsigned int major, minor; + +	if (!t) +		BUG(); + +	if (sscanf(path, "%u:%u", &major, &minor) == 2) { +		/* Extract the major/minor numbers */ +		dev = MKDEV(major, minor); +		if (MAJOR(dev) != major || MINOR(dev) != minor) +			return -EOVERFLOW; +	} else { +		/* convert the path to a device */ +		if ((r = lookup_device(path, &dev))) +			return r; +	} + +	dd = find_device(&t->devices, dev); +	if (!dd) { +		dd = kmalloc(sizeof(*dd), GFP_KERNEL); +		if (!dd) +			return -ENOMEM; + +		dd->mode = mode; +		dd->bdev = NULL; + +		if ((r = open_dev(dd, dev))) { +			kfree(dd); +			return r; +		} + +		format_dev_t(dd->name, dev); + +		atomic_set(&dd->count, 0); +		list_add(&dd->list, &t->devices); + +	} else if (dd->mode != (mode | dd->mode)) { +		r = upgrade_mode(dd, mode); +		if (r) +			return r; +	} +	atomic_inc(&dd->count); + +	if (!check_device_area(dd, start, len)) { +		DMWARN("device %s too small for target", path); +		dm_put_device(ti, dd); +		return -EINVAL; +	} + +	*result = dd; + +	return 0; +} + + +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, +		  sector_t len, int mode, struct dm_dev **result) +{ +	int r = __table_get_device(ti->table, ti, path, +				   start, len, mode, result); +	if (!r) { +		request_queue_t *q = bdev_get_queue((*result)->bdev); +		struct io_restrictions *rs = &ti->limits; + +		/* +		 * Combine the device limits low. +		 * +		 * FIXME: if we move an io_restriction struct +		 *        into q this would just be a call to +		 *        combine_restrictions_low() +		 */ +		rs->max_sectors = +			min_not_zero(rs->max_sectors, q->max_sectors); + +		/* FIXME: Device-Mapper on top of RAID-0 breaks because DM +		 *        currently doesn't honor MD's merge_bvec_fn routine. +		 *        In this case, we'll force DM to use PAGE_SIZE or +		 *        smaller I/O, just to be safe. A better fix is in the +		 *        works, but add this for the time being so it will at +		 *        least operate correctly. +		 */ +		if (q->merge_bvec_fn) +			rs->max_sectors = +				min_not_zero(rs->max_sectors, +					     (unsigned short)(PAGE_SIZE >> 9)); + +		rs->max_phys_segments = +			min_not_zero(rs->max_phys_segments, +				     q->max_phys_segments); + +		rs->max_hw_segments = +			min_not_zero(rs->max_hw_segments, q->max_hw_segments); + +		rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + +		rs->max_segment_size = +			min_not_zero(rs->max_segment_size, q->max_segment_size); + +		rs->seg_boundary_mask = +			min_not_zero(rs->seg_boundary_mask, +				     q->seg_boundary_mask); +	} + +	return r; +} + +/* + * Decrement a devices use count and remove it if necessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ +	if (atomic_dec_and_test(&dd->count)) { +		close_dev(dd); +		list_del(&dd->list); +		kfree(dd); +	} +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ +	struct dm_target *prev; + +	if (!table->num_targets) +		return !ti->begin; + +	prev = &table->targets[table->num_targets - 1]; +	return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. 
+ */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ +	char **argv; +	unsigned new_size; + +	new_size = *array_size ? *array_size * 2 : 64; +	argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); +	if (argv) { +		memcpy(argv, old_argv, *array_size * sizeof(*argv)); +		*array_size = new_size; +	} + +	kfree(old_argv); +	return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +int dm_split_args(int *argc, char ***argvp, char *input) +{ +	char *start, *end = input, *out, **argv = NULL; +	unsigned array_size = 0; + +	*argc = 0; +	argv = realloc_argv(&array_size, argv); +	if (!argv) +		return -ENOMEM; + +	while (1) { +		start = end; + +		/* Skip whitespace */ +		while (*start && isspace(*start)) +			start++; + +		if (!*start) +			break;	/* success, we hit the end */ + +		/* 'out' is used to remove any back-quotes */ +		end = out = start; +		while (*end) { +			/* Everything apart from '\0' can be quoted */ +			if (*end == '\\' && *(end + 1)) { +				*out++ = *(end + 1); +				end += 2; +				continue; +			} + +			if (isspace(*end)) +				break;	/* end of token */ + +			*out++ = *end++; +		} + +		/* have we already filled the array ? */ +		if ((*argc + 1) > array_size) { +			argv = realloc_argv(&array_size, argv); +			if (!argv) +				return -ENOMEM; +		} + +		/* we know this is whitespace */ +		if (*end) +			end++; + +		/* terminate the string and put it in the array */ +		*out = '\0'; +		argv[*argc] = start; +		(*argc)++; +	} + +	*argvp = argv; +	return 0; +} + +static void check_for_valid_limits(struct io_restrictions *rs) +{ +	if (!rs->max_sectors) +		rs->max_sectors = MAX_SECTORS; +	if (!rs->max_phys_segments) +		rs->max_phys_segments = MAX_PHYS_SEGMENTS; +	if (!rs->max_hw_segments) +		rs->max_hw_segments = MAX_HW_SEGMENTS; +	if (!rs->hardsect_size) +		rs->hardsect_size = 1 << SECTOR_SHIFT; +	if (!rs->max_segment_size) +		rs->max_segment_size = MAX_SEGMENT_SIZE; +	if (!rs->seg_boundary_mask) +		rs->seg_boundary_mask = -1; +} + +int dm_table_add_target(struct dm_table *t, const char *type, +			sector_t start, sector_t len, char *params) +{ +	int r = -EINVAL, argc; +	char **argv; +	struct dm_target *tgt; + +	if ((r = check_space(t))) +		return r; + +	tgt = t->targets + t->num_targets; +	memset(tgt, 0, sizeof(*tgt)); + +	if (!len) { +		tgt->error = "zero-length target"; +		DMERR("%s", tgt->error); +		return -EINVAL; +	} + +	tgt->type = dm_get_target_type(type); +	if (!tgt->type) { +		tgt->error = "unknown target type"; +		DMERR("%s", tgt->error); +		return -EINVAL; +	} + +	tgt->table = t; +	tgt->begin = start; +	tgt->len = len; +	tgt->error = "Unknown error"; + +	/* +	 * Does this target adjoin the previous one ? +	 */ +	if (!adjoin(t, tgt)) { +		tgt->error = "Gap in table"; +		r = -EINVAL; +		goto bad; +	} + +	r = dm_split_args(&argc, &argv, params); +	if (r) { +		tgt->error = "couldn't split parameters (insufficient memory)"; +		goto bad; +	} + +	r = tgt->type->ctr(tgt, argc, argv); +	kfree(argv); +	if (r) +		goto bad; + +	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + +	/* FIXME: the plan is to combine high here and then have +	 * the merge fn apply the target level restrictions. 
*/ +	combine_restrictions_low(&t->limits, &tgt->limits); +	return 0; + + bad: +	DMERR("%s", tgt->error); +	dm_put_target_type(tgt->type); +	return r; +} + +static int setup_indexes(struct dm_table *t) +{ +	int i; +	unsigned int total = 0; +	sector_t *indexes; + +	/* allocate the space for *all* the indexes */ +	for (i = t->depth - 2; i >= 0; i--) { +		t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); +		total += t->counts[i]; +	} + +	indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); +	if (!indexes) +		return -ENOMEM; + +	/* set up internal nodes, bottom-up */ +	for (i = t->depth - 2, total = 0; i >= 0; i--) { +		t->index[i] = indexes; +		indexes += (KEYS_PER_NODE * t->counts[i]); +		setup_btree_index(i, t); +	} + +	return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ +	int r = 0; +	unsigned int leaf_nodes; + +	check_for_valid_limits(&t->limits); + +	/* how many indexes will the btree have ? */ +	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); +	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + +	/* leaf layer has already been set up */ +	t->counts[t->depth - 1] = leaf_nodes; +	t->index[t->depth - 1] = t->highs; + +	if (t->depth >= 2) +		r = setup_indexes(t); + +	return r; +} + +static DECLARE_MUTEX(_event_lock); +void dm_table_event_callback(struct dm_table *t, +			     void (*fn)(void *), void *context) +{ +	down(&_event_lock); +	t->event_fn = fn; +	t->event_context = context; +	up(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ +	/* +	 * You can no longer call dm_table_event() from interrupt +	 * context, use a bottom half instead. +	 */ +	BUG_ON(in_interrupt()); + +	down(&_event_lock); +	if (t->event_fn) +		t->event_fn(t->event_context); +	up(&_event_lock); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ +	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ +	if (index > t->num_targets) +		return NULL; + +	return t->targets + index; +} + +/* + * Search the btree for the correct target. + */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ +	unsigned int l, n = 0, k = 0; +	sector_t *node; + +	for (l = 0; l < t->depth; l++) { +		n = get_child(n, k); +		node = get_node(t, l, n); + +		for (k = 0; k < KEYS_PER_NODE; k++) +			if (node[k] >= sector) +				break; +	} + +	return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) +{ +	/* +	 * Make sure we obey the optimistic sub devices +	 * restrictions. 
+	 */ +	blk_queue_max_sectors(q, t->limits.max_sectors); +	q->max_phys_segments = t->limits.max_phys_segments; +	q->max_hw_segments = t->limits.max_hw_segments; +	q->hardsect_size = t->limits.hardsect_size; +	q->max_segment_size = t->limits.max_segment_size; +	q->seg_boundary_mask = t->limits.seg_boundary_mask; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ +	return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ +	return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ +	return t->mode; +} + +static void suspend_targets(struct dm_table *t, unsigned postsuspend) +{ +	int i = t->num_targets; +	struct dm_target *ti = t->targets; + +	while (i--) { +		if (postsuspend) { +			if (ti->type->postsuspend) +				ti->type->postsuspend(ti); +		} else if (ti->type->presuspend) +			ti->type->presuspend(ti); + +		ti++; +	} +} + +void dm_table_presuspend_targets(struct dm_table *t) +{ +	return suspend_targets(t, 0); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ +	return suspend_targets(t, 1); +} + +void dm_table_resume_targets(struct dm_table *t) +{ +	int i; + +	for (i = 0; i < t->num_targets; i++) { +		struct dm_target *ti = t->targets + i; + +		if (ti->type->resume) +			ti->type->resume(ti); +	} +} + +int dm_table_any_congested(struct dm_table *t, int bdi_bits) +{ +	struct list_head *d, *devices; +	int r = 0; + +	devices = dm_table_get_devices(t); +	for (d = devices->next; d != devices; d = d->next) { +		struct dm_dev *dd = list_entry(d, struct dm_dev, list); +		request_queue_t *q = bdev_get_queue(dd->bdev); +		r |= bdi_congested(&q->backing_dev_info, bdi_bits); +	} + +	return r; +} + +void dm_table_unplug_all(struct dm_table *t) +{ +	struct list_head *d, *devices = dm_table_get_devices(t); + +	for (d = devices->next; d != devices; d = d->next) { +		struct dm_dev *dd = list_entry(d, struct dm_dev, list); +		request_queue_t *q = bdev_get_queue(dd->bdev); + +		if (q->unplug_fn) +			q->unplug_fn(q); +	} +} + +int dm_table_flush_all(struct dm_table *t) +{ +	struct list_head *d, *devices = dm_table_get_devices(t); +	int ret = 0; + +	for (d = devices->next; d != devices; d = d->next) { +		struct dm_dev *dd = list_entry(d, struct dm_dev, list); +		request_queue_t *q = bdev_get_queue(dd->bdev); +		int err; + +		if (!q->issue_flush_fn) +			err = -EOPNOTSUPP; +		else +			err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + +		if (!ret) +			ret = err; +	} + +	return ret; +} + +EXPORT_SYMBOL(dm_vcalloc); +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); +EXPORT_SYMBOL(dm_table_put); +EXPORT_SYMBOL(dm_table_get); +EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c new file mode 100644 index 00000000000..aecd9e0c261 --- /dev/null +++ b/drivers/md/dm-target.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. 
+ */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/bio.h> +#include <linux/slab.h> + +struct tt_internal { +	struct target_type tt; + +	struct list_head list; +	long use; +}; + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct tt_internal *__find_target_type(const char *name) +{ +	struct tt_internal *ti; + +	list_for_each_entry (ti, &_targets, list) +		if (!strcmp(name, ti->tt.name)) +			return ti; + +	return NULL; +} + +static struct tt_internal *get_target_type(const char *name) +{ +	struct tt_internal *ti; + +	down_read(&_lock); + +	ti = __find_target_type(name); +	if (ti) { +		if ((ti->use == 0) && !try_module_get(ti->tt.module)) +			ti = NULL; +		else +			ti->use++; +	} + +	up_read(&_lock); +	return ti; +} + +static void load_module(const char *name) +{ +	request_module("dm-%s", name); +} + +struct target_type *dm_get_target_type(const char *name) +{ +	struct tt_internal *ti = get_target_type(name); + +	if (!ti) { +		load_module(name); +		ti = get_target_type(name); +	} + +	return ti ? &ti->tt : NULL; +} + +void dm_put_target_type(struct target_type *t) +{ +	struct tt_internal *ti = (struct tt_internal *) t; + +	down_read(&_lock); +	if (--ti->use == 0) +		module_put(ti->tt.module); + +	if (ti->use < 0) +		BUG(); +	up_read(&_lock); + +	return; +} + +static struct tt_internal *alloc_target(struct target_type *t) +{ +	struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + +	if (ti) { +		memset(ti, 0, sizeof(*ti)); +		ti->tt = *t; +	} + +	return ti; +} + + +int dm_target_iterate(void (*iter_func)(struct target_type *tt, +					void *param), void *param) +{ +	struct tt_internal *ti; + +	down_read(&_lock); +	list_for_each_entry (ti, &_targets, list) +		iter_func(&ti->tt, param); +	up_read(&_lock); + +	return 0; +} + +int dm_register_target(struct target_type *t) +{ +	int rv = 0; +	struct tt_internal *ti = alloc_target(t); + +	if (!ti) +		return -ENOMEM; + +	down_write(&_lock); +	if (__find_target_type(t->name)) +		rv = -EEXIST; +	else +		list_add(&ti->list, &_targets); + +	up_write(&_lock); +	if (rv) +		kfree(ti); +	return rv; +} + +int dm_unregister_target(struct target_type *t) +{ +	struct tt_internal *ti; + +	down_write(&_lock); +	if (!(ti = __find_target_type(t->name))) { +		up_write(&_lock); +		return -EINVAL; +	} + +	if (ti->use) { +		up_write(&_lock); +		return -ETXTBSY; +	} + +	list_del(&ti->list); +	kfree(ti); + +	up_write(&_lock); +	return 0; +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. 
+ */ +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +{ +	return 0; +} + +static void io_err_dtr(struct dm_target *ti) +{ +	/* empty */ +} + +static int io_err_map(struct dm_target *ti, struct bio *bio, +		      union map_info *map_context) +{ +	return -EIO; +} + +static struct target_type error_target = { +	.name = "error", +	.version = {1, 0, 1}, +	.ctr  = io_err_ctr, +	.dtr  = io_err_dtr, +	.map  = io_err_map, +}; + +int __init dm_target_init(void) +{ +	return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ +	if (dm_unregister_target(&error_target)) +		DMWARN("error target unregistration failed"); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c new file mode 100644 index 00000000000..7febc2cac73 --- /dev/null +++ b/drivers/md/dm-zero.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> + +/* + * Construct a dummy mapping that only returns zeros + */ +static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ +	if (argc != 0) { +		ti->error = "dm-zero: No arguments required"; +		return -EINVAL; +	} + +	return 0; +} + +/* + * Return zeros only on reads + */ +static int zero_map(struct dm_target *ti, struct bio *bio, +		      union map_info *map_context) +{ +	switch(bio_rw(bio)) { +	case READ: +		zero_fill_bio(bio); +		break; +	case READA: +		/* readahead of null bytes only wastes buffer cache */ +		return -EIO; +	case WRITE: +		/* writes get silently dropped */ +		break; +	} + +	bio_endio(bio, bio->bi_size, 0); + +	/* accepted bio, don't make new request */ +	return 0; +} + +static struct target_type zero_target = { +	.name   = "zero", +	.version = {1, 0, 0}, +	.module = THIS_MODULE, +	.ctr    = zero_ctr, +	.map    = zero_map, +}; + +int __init dm_zero_init(void) +{ +	int r = dm_register_target(&zero_target); + +	if (r < 0) +		DMERR("zero: register failed %d", r); + +	return r; +} + +void __exit dm_zero_exit(void) +{ +	int r = dm_unregister_target(&zero_target); + +	if (r < 0) +		DMERR("zero: unregister failed %d", r); +} + +module_init(dm_zero_init) +module_exit(dm_zero_exit) + +MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c new file mode 100644 index 00000000000..243ff6884e8 --- /dev/null +++ b/drivers/md/dm.c @@ -0,0 +1,1194 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-list.h" + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/blkpg.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/mempool.h> +#include <linux/slab.h> +#include <linux/idr.h> + +static const char *_name = DM_NAME; + +static unsigned int major = 0; +static unsigned int _major = 0; + +/* + * One of these is allocated per bio. + */ +struct dm_io { +	struct mapped_device *md; +	int error; +	struct bio *bio; +	atomic_t io_count; +}; + +/* + * One of these is allocated per target within a bio.  Hopefully + * this will be simplified out one day. 
+ */ +struct target_io { +	struct dm_io *io; +	struct dm_target *ti; +	union map_info info; +}; + +union map_info *dm_get_mapinfo(struct bio *bio) +{ +        if (bio && bio->bi_private) +                return &((struct target_io *)bio->bi_private)->info; +        return NULL; +} + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO 0 +#define DMF_SUSPENDED 1 +#define DMF_FS_LOCKED 2 + +struct mapped_device { +	struct rw_semaphore lock; +	rwlock_t map_lock; +	atomic_t holders; + +	unsigned long flags; + +	request_queue_t *queue; +	struct gendisk *disk; + +	void *interface_ptr; + +	/* +	 * A list of ios that arrived while we were suspended. +	 */ +	atomic_t pending; +	wait_queue_head_t wait; + 	struct bio_list deferred; + +	/* +	 * The current mapping. +	 */ +	struct dm_table *map; + +	/* +	 * io objects are allocated from here. +	 */ +	mempool_t *io_pool; +	mempool_t *tio_pool; + +	/* +	 * Event handling. +	 */ +	atomic_t event_nr; +	wait_queue_head_t eventq; + +	/* +	 * freeze/thaw support require holding onto a super block +	 */ +	struct super_block *frozen_sb; +}; + +#define MIN_IOS 256 +static kmem_cache_t *_io_cache; +static kmem_cache_t *_tio_cache; + +static struct bio_set *dm_set; + +static int __init local_init(void) +{ +	int r; + +	dm_set = bioset_create(16, 16, 4); +	if (!dm_set) +		return -ENOMEM; + +	/* allocate a slab for the dm_ios */ +	_io_cache = kmem_cache_create("dm_io", +				      sizeof(struct dm_io), 0, 0, NULL, NULL); +	if (!_io_cache) +		return -ENOMEM; + +	/* allocate a slab for the target ios */ +	_tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io), +				       0, 0, NULL, NULL); +	if (!_tio_cache) { +		kmem_cache_destroy(_io_cache); +		return -ENOMEM; +	} + +	_major = major; +	r = register_blkdev(_major, _name); +	if (r < 0) { +		kmem_cache_destroy(_tio_cache); +		kmem_cache_destroy(_io_cache); +		return r; +	} + +	if (!_major) +		_major = r; + +	return 0; +} + +static void local_exit(void) +{ +	kmem_cache_destroy(_tio_cache); +	kmem_cache_destroy(_io_cache); + +	bioset_free(dm_set); + +	if (unregister_blkdev(_major, _name) < 0) +		DMERR("devfs_unregister_blkdev failed"); + +	_major = 0; + +	DMINFO("cleaned up"); +} + +int (*_inits[])(void) __initdata = { +	local_init, +	dm_target_init, +	dm_linear_init, +	dm_stripe_init, +	dm_interface_init, +}; + +void (*_exits[])(void) = { +	local_exit, +	dm_target_exit, +	dm_linear_exit, +	dm_stripe_exit, +	dm_interface_exit, +}; + +static int __init dm_init(void) +{ +	const int count = ARRAY_SIZE(_inits); + +	int r, i; + +	for (i = 0; i < count; i++) { +		r = _inits[i](); +		if (r) +			goto bad; +	} + +	return 0; + +      bad: +	while (i--) +		_exits[i](); + +	return r; +} + +static void __exit dm_exit(void) +{ +	int i = ARRAY_SIZE(_exits); + +	while (i--) +		_exits[i](); +} + +/* + * Block device functions + */ +static int dm_blk_open(struct inode *inode, struct file *file) +{ +	struct mapped_device *md; + +	md = inode->i_bdev->bd_disk->private_data; +	dm_get(md); +	return 0; +} + +static int dm_blk_close(struct inode *inode, struct file *file) +{ +	struct mapped_device *md; + +	md = inode->i_bdev->bd_disk->private_data; +	dm_put(md); +	return 0; +} + +static inline struct dm_io *alloc_io(struct mapped_device *md) +{ +	return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static inline void free_io(struct mapped_device *md, struct dm_io *io) +{ +	mempool_free(io, md->io_pool); +} + +static inline struct target_io *alloc_tio(struct mapped_device *md) +{ +	return mempool_alloc(md->tio_pool, GFP_NOIO); 
+} + +static inline void free_tio(struct mapped_device *md, struct target_io *tio) +{ +	mempool_free(tio, md->tio_pool); +} + +/* + * Add the bio to the list of deferred io. + */ +static int queue_io(struct mapped_device *md, struct bio *bio) +{ +	down_write(&md->lock); + +	if (!test_bit(DMF_BLOCK_IO, &md->flags)) { +		up_write(&md->lock); +		return 1; +	} + +	bio_list_add(&md->deferred, bio); + +	up_write(&md->lock); +	return 0;		/* deferred successfully */ +} + +/* + * Everyone (including functions in this file), should use this + * function to access the md->map field, and make sure they call + * dm_table_put() when finished. + */ +struct dm_table *dm_get_table(struct mapped_device *md) +{ +	struct dm_table *t; + +	read_lock(&md->map_lock); +	t = md->map; +	if (t) +		dm_table_get(t); +	read_unlock(&md->map_lock); + +	return t; +} + +/*----------------------------------------------------------------- + * CRUD START: + *   A more elegant soln is in the works that uses the queue + *   merge fn, unfortunately there are a couple of changes to + *   the block layer that I want to make for this.  So in the + *   interests of getting something for people to use I give + *   you this clearly demarcated crap. + *---------------------------------------------------------------*/ + +/* + * Decrements the number of outstanding ios that a bio has been + * cloned into, completing the original io if necc. + */ +static inline void dec_pending(struct dm_io *io, int error) +{ +	if (error) +		io->error = error; + +	if (atomic_dec_and_test(&io->io_count)) { +		if (atomic_dec_and_test(&io->md->pending)) +			/* nudge anyone waiting on suspend queue */ +			wake_up(&io->md->wait); + +		bio_endio(io->bio, io->bio->bi_size, io->error); +		free_io(io->md, io); +	} +} + +static int clone_endio(struct bio *bio, unsigned int done, int error) +{ +	int r = 0; +	struct target_io *tio = bio->bi_private; +	struct dm_io *io = tio->io; +	dm_endio_fn endio = tio->ti->type->end_io; + +	if (bio->bi_size) +		return 1; + +	if (!bio_flagged(bio, BIO_UPTODATE) && !error) +		error = -EIO; + +	if (endio) { +		r = endio(tio->ti, bio, error, &tio->info); +		if (r < 0) +			error = r; + +		else if (r > 0) +			/* the target wants another shot at the io */ +			return 1; +	} + +	free_tio(io->md, tio); +	dec_pending(io, error); +	bio_put(bio); +	return r; +} + +static sector_t max_io_len(struct mapped_device *md, +			   sector_t sector, struct dm_target *ti) +{ +	sector_t offset = sector - ti->begin; +	sector_t len = ti->len - offset; + +	/* +	 * Does the target need to split even further ? +	 */ +	if (ti->split_io) { +		sector_t boundary; +		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) +			   - offset; +		if (len > boundary) +			len = boundary; +	} + +	return len; +} + +static void __map_bio(struct dm_target *ti, struct bio *clone, +		      struct target_io *tio) +{ +	int r; + +	/* +	 * Sanity checks. +	 */ +	BUG_ON(!clone->bi_size); + +	clone->bi_end_io = clone_endio; +	clone->bi_private = tio; + +	/* +	 * Map the clone.  If r == 0 we don't need to do +	 * anything, the target has assumed ownership of +	 * this io. 
+	 */ +	atomic_inc(&tio->io->io_count); +	r = ti->type->map(ti, clone, &tio->info); +	if (r > 0) +		/* the bio has been remapped so dispatch it */ +		generic_make_request(clone); + +	else if (r < 0) { +		/* error the io and bail out */ +		struct dm_io *io = tio->io; +		free_tio(tio->io->md, tio); +		dec_pending(io, -EIO); +		bio_put(clone); +	} +} + +struct clone_info { +	struct mapped_device *md; +	struct dm_table *map; +	struct bio *bio; +	struct dm_io *io; +	sector_t sector; +	sector_t sector_count; +	unsigned short idx; +}; + +/* + * Creates a little bio that just does part of a bvec. + */ +static struct bio *split_bvec(struct bio *bio, sector_t sector, +			      unsigned short idx, unsigned int offset, +			      unsigned int len) +{ +	struct bio *clone; +	struct bio_vec *bv = bio->bi_io_vec + idx; + +	clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); +	*clone->bi_io_vec = *bv; + +	clone->bi_sector = sector; +	clone->bi_bdev = bio->bi_bdev; +	clone->bi_rw = bio->bi_rw; +	clone->bi_vcnt = 1; +	clone->bi_size = to_bytes(len); +	clone->bi_io_vec->bv_offset = offset; +	clone->bi_io_vec->bv_len = clone->bi_size; + +	return clone; +} + +/* + * Creates a bio that consists of a range of complete bvecs. + */ +static struct bio *clone_bio(struct bio *bio, sector_t sector, +			     unsigned short idx, unsigned short bv_count, +			     unsigned int len) +{ +	struct bio *clone; + +	clone = bio_clone(bio, GFP_NOIO); +	clone->bi_sector = sector; +	clone->bi_idx = idx; +	clone->bi_vcnt = idx + bv_count; +	clone->bi_size = to_bytes(len); +	clone->bi_flags &= ~(1 << BIO_SEG_VALID); + +	return clone; +} + +static void __clone_and_map(struct clone_info *ci) +{ +	struct bio *clone, *bio = ci->bio; +	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); +	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); +	struct target_io *tio; + +	/* +	 * Allocate a target io object. +	 */ +	tio = alloc_tio(ci->md); +	tio->io = ci->io; +	tio->ti = ti; +	memset(&tio->info, 0, sizeof(tio->info)); + +	if (ci->sector_count <= max) { +		/* +		 * Optimise for the simple case where we can do all of +		 * the remaining io with a single clone. +		 */ +		clone = clone_bio(bio, ci->sector, ci->idx, +				  bio->bi_vcnt - ci->idx, ci->sector_count); +		__map_bio(ti, clone, tio); +		ci->sector_count = 0; + +	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { +		/* +		 * There are some bvecs that don't span targets. +		 * Do as many of these as possible. +		 */ +		int i; +		sector_t remaining = max; +		sector_t bv_len; + +		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { +			bv_len = to_sector(bio->bi_io_vec[i].bv_len); + +			if (bv_len > remaining) +				break; + +			remaining -= bv_len; +			len += bv_len; +		} + +		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); +		__map_bio(ti, clone, tio); + +		ci->sector += len; +		ci->sector_count -= len; +		ci->idx = i; + +	} else { +		/* +		 * Create two copy bios to deal with io that has +		 * been split across a target.
+		 */ +		struct bio_vec *bv = bio->bi_io_vec + ci->idx; + +		clone = split_bvec(bio, ci->sector, ci->idx, +				   bv->bv_offset, max); +		__map_bio(ti, clone, tio); + +		ci->sector += max; +		ci->sector_count -= max; +		ti = dm_table_find_target(ci->map, ci->sector); + +		len = to_sector(bv->bv_len) - max; +		clone = split_bvec(bio, ci->sector, ci->idx, +				   bv->bv_offset + to_bytes(max), len); +		tio = alloc_tio(ci->md); +		tio->io = ci->io; +		tio->ti = ti; +		memset(&tio->info, 0, sizeof(tio->info)); +		__map_bio(ti, clone, tio); + +		ci->sector += len; +		ci->sector_count -= len; +		ci->idx++; +	} +} + +/* + * Split the bio into several clones. + */ +static void __split_bio(struct mapped_device *md, struct bio *bio) +{ +	struct clone_info ci; + +	ci.map = dm_get_table(md); +	if (!ci.map) { +		bio_io_error(bio, bio->bi_size); +		return; +	} + +	ci.md = md; +	ci.bio = bio; +	ci.io = alloc_io(md); +	ci.io->error = 0; +	atomic_set(&ci.io->io_count, 1); +	ci.io->bio = bio; +	ci.io->md = md; +	ci.sector = bio->bi_sector; +	ci.sector_count = bio_sectors(bio); +	ci.idx = bio->bi_idx; + +	atomic_inc(&md->pending); +	while (ci.sector_count) +		__clone_and_map(&ci); + +	/* drop the extra reference count */ +	dec_pending(ci.io, 0); +	dm_table_put(ci.map); +} +/*----------------------------------------------------------------- + * CRUD END + *---------------------------------------------------------------*/ + +/* + * The request function that just remaps the bio built up by + * dm_merge_bvec. + */ +static int dm_request(request_queue_t *q, struct bio *bio) +{ +	int r; +	struct mapped_device *md = q->queuedata; + +	down_read(&md->lock); + +	/* +	 * If we're suspended we have to queue +	 * this io for later. +	 */ +	while (test_bit(DMF_BLOCK_IO, &md->flags)) { +		up_read(&md->lock); + +		if (bio_rw(bio) == READA) { +			bio_io_error(bio, bio->bi_size); +			return 0; +		} + +		r = queue_io(md, bio); +		if (r < 0) { +			bio_io_error(bio, bio->bi_size); +			return 0; + +		} else if (r == 0) +			return 0;	/* deferred successfully */ + +		/* +		 * We're in a while loop, because someone could suspend +		 * before we get to the following read lock. +		 */ +		down_read(&md->lock); +	} + +	__split_bio(md, bio); +	up_read(&md->lock); +	return 0; +} + +static int dm_flush_all(request_queue_t *q, struct gendisk *disk, +			sector_t *error_sector) +{ +	struct mapped_device *md = q->queuedata; +	struct dm_table *map = dm_get_table(md); +	int ret = -ENXIO; + +	if (map) { +		ret = dm_table_flush_all(md->map); +		dm_table_put(map); +	} + +	return ret; +} + +static void dm_unplug_all(request_queue_t *q) +{ +	struct mapped_device *md = q->queuedata; +	struct dm_table *map = dm_get_table(md); + +	if (map) { +		dm_table_unplug_all(map); +		dm_table_put(map); +	} +} + +static int dm_any_congested(void *congested_data, int bdi_bits) +{ +	int r; +	struct mapped_device *md = (struct mapped_device *) congested_data; +	struct dm_table *map = dm_get_table(md); + +	if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) +		r = bdi_bits; +	else +		r = dm_table_any_congested(map, bdi_bits); + +	dm_table_put(map); +	return r; +} + +/*----------------------------------------------------------------- + * An IDR is used to keep track of allocated minor numbers. 
+ *---------------------------------------------------------------*/ +static DECLARE_MUTEX(_minor_lock); +static DEFINE_IDR(_minor_idr); + +static void free_minor(unsigned int minor) +{ +	down(&_minor_lock); +	idr_remove(&_minor_idr, minor); +	up(&_minor_lock); +} + +/* + * See if the device with a specific minor # is free. + */ +static int specific_minor(struct mapped_device *md, unsigned int minor) +{ +	int r, m; + +	if (minor >= (1 << MINORBITS)) +		return -EINVAL; + +	down(&_minor_lock); + +	if (idr_find(&_minor_idr, minor)) { +		r = -EBUSY; +		goto out; +	} + +	r = idr_pre_get(&_minor_idr, GFP_KERNEL); +	if (!r) { +		r = -ENOMEM; +		goto out; +	} + +	r = idr_get_new_above(&_minor_idr, md, minor, &m); +	if (r) { +		goto out; +	} + +	if (m != minor) { +		idr_remove(&_minor_idr, m); +		r = -EBUSY; +		goto out; +	} + +out: +	up(&_minor_lock); +	return r; +} + +static int next_free_minor(struct mapped_device *md, unsigned int *minor) +{ +	int r; +	unsigned int m; + +	down(&_minor_lock); + +	r = idr_pre_get(&_minor_idr, GFP_KERNEL); +	if (!r) { +		r = -ENOMEM; +		goto out; +	} + +	r = idr_get_new(&_minor_idr, md, &m); +	if (r) { +		goto out; +	} + +	if (m >= (1 << MINORBITS)) { +		idr_remove(&_minor_idr, m); +		r = -ENOSPC; +		goto out; +	} + +	*minor = m; + +out: +	up(&_minor_lock); +	return r; +} + +static struct block_device_operations dm_blk_dops; + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_dev(unsigned int minor, int persistent) +{ +	int r; +	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + +	if (!md) { +		DMWARN("unable to allocate device, out of memory."); +		return NULL; +	} + +	/* get a minor number for the dev */ +	r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); +	if (r < 0) +		goto bad1; + +	memset(md, 0, sizeof(*md)); +	init_rwsem(&md->lock); +	rwlock_init(&md->map_lock); +	atomic_set(&md->holders, 1); +	atomic_set(&md->event_nr, 0); + +	md->queue = blk_alloc_queue(GFP_KERNEL); +	if (!md->queue) +		goto bad1; + +	md->queue->queuedata = md; +	md->queue->backing_dev_info.congested_fn = dm_any_congested; +	md->queue->backing_dev_info.congested_data = md; +	blk_queue_make_request(md->queue, dm_request); +	md->queue->unplug_fn = dm_unplug_all; +	md->queue->issue_flush_fn = dm_flush_all; + +	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, +				     mempool_free_slab, _io_cache); + 	if (!md->io_pool) + 		goto bad2; + +	md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, +				      mempool_free_slab, _tio_cache); +	if (!md->tio_pool) +		goto bad3; + +	md->disk = alloc_disk(1); +	if (!md->disk) +		goto bad4; + +	md->disk->major = _major; +	md->disk->first_minor = minor; +	md->disk->fops = &dm_blk_dops; +	md->disk->queue = md->queue; +	md->disk->private_data = md; +	sprintf(md->disk->disk_name, "dm-%d", minor); +	add_disk(md->disk); + +	atomic_set(&md->pending, 0); +	init_waitqueue_head(&md->wait); +	init_waitqueue_head(&md->eventq); + +	return md; + + bad4: +	mempool_destroy(md->tio_pool); + bad3: +	mempool_destroy(md->io_pool); + bad2: +	blk_put_queue(md->queue); +	free_minor(minor); + bad1: +	kfree(md); +	return NULL; +} + +static void free_dev(struct mapped_device *md) +{ +	free_minor(md->disk->first_minor); +	mempool_destroy(md->tio_pool); +	mempool_destroy(md->io_pool); +	del_gendisk(md->disk); +	put_disk(md->disk); +	blk_put_queue(md->queue); +	kfree(md); +} + +/* + * Bind a table to the device. 
+ */ +static void event_callback(void *context) +{ +	struct mapped_device *md = (struct mapped_device *) context; + +	atomic_inc(&md->event_nr); +	wake_up(&md->eventq); +} + +static void __set_size(struct gendisk *disk, sector_t size) +{ +	struct block_device *bdev; + +	set_capacity(disk, size); +	bdev = bdget_disk(disk, 0); +	if (bdev) { +		down(&bdev->bd_inode->i_sem); +		i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); +		up(&bdev->bd_inode->i_sem); +		bdput(bdev); +	} +} + +static int __bind(struct mapped_device *md, struct dm_table *t) +{ +	request_queue_t *q = md->queue; +	sector_t size; + +	size = dm_table_get_size(t); +	__set_size(md->disk, size); +	if (size == 0) +		return 0; + +	write_lock(&md->map_lock); +	md->map = t; +	write_unlock(&md->map_lock); + +	dm_table_get(t); +	dm_table_event_callback(md->map, event_callback, md); +	dm_table_set_restrictions(t, q); +	return 0; +} + +static void __unbind(struct mapped_device *md) +{ +	struct dm_table *map = md->map; + +	if (!map) +		return; + +	dm_table_event_callback(map, NULL, NULL); +	write_lock(&md->map_lock); +	md->map = NULL; +	write_unlock(&md->map_lock); +	dm_table_put(map); +} + +/* + * Constructor for a new device. + */ +static int create_aux(unsigned int minor, int persistent, +		      struct mapped_device **result) +{ +	struct mapped_device *md; + +	md = alloc_dev(minor, persistent); +	if (!md) +		return -ENXIO; + +	*result = md; +	return 0; +} + +int dm_create(struct mapped_device **result) +{ +	return create_aux(0, 0, result); +} + +int dm_create_with_minor(unsigned int minor, struct mapped_device **result) +{ +	return create_aux(minor, 1, result); +} + +void *dm_get_mdptr(dev_t dev) +{ +	struct mapped_device *md; +	void *mdptr = NULL; +	unsigned minor = MINOR(dev); + +	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) +		return NULL; + +	down(&_minor_lock); + +	md = idr_find(&_minor_idr, minor); + +	if (md && (dm_disk(md)->first_minor == minor)) +		mdptr = md->interface_ptr; + +	up(&_minor_lock); + +	return mdptr; +} + +void dm_set_mdptr(struct mapped_device *md, void *ptr) +{ +	md->interface_ptr = ptr; +} + +void dm_get(struct mapped_device *md) +{ +	atomic_inc(&md->holders); +} + +void dm_put(struct mapped_device *md) +{ +	struct dm_table *map = dm_get_table(md); + +	if (atomic_dec_and_test(&md->holders)) { +		if (!test_bit(DMF_SUSPENDED, &md->flags) && map) { +			dm_table_presuspend_targets(map); +			dm_table_postsuspend_targets(map); +		} +		__unbind(md); +		free_dev(md); +	} + +	dm_table_put(map); +} + +/* + * Process the deferred bios + */ +static void __flush_deferred_io(struct mapped_device *md, struct bio *c) +{ +	struct bio *n; + +	while (c) { +		n = c->bi_next; +		c->bi_next = NULL; +		__split_bio(md, c); +		c = n; +	} +} + +/* + * Swap in a new table (destroying old one). + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ +	int r; + +	down_write(&md->lock); + +	/* device must be suspended */ +	if (!test_bit(DMF_SUSPENDED, &md->flags)) { +		up_write(&md->lock); +		return -EPERM; +	} + +	__unbind(md); +	r = __bind(md, table); +	if (r) +		return r; + +	up_write(&md->lock); +	return 0; +} + +/* + * Functions to lock and unlock any filesystem running on the + * device. 
+ */ +static int __lock_fs(struct mapped_device *md) +{ +	struct block_device *bdev; + +	if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) +		return 0; + +	bdev = bdget_disk(md->disk, 0); +	if (!bdev) { +		DMWARN("bdget failed in __lock_fs"); +		return -ENOMEM; +	} + +	WARN_ON(md->frozen_sb); +	md->frozen_sb = freeze_bdev(bdev); +	/* don't bdput right now, we don't want the bdev +	 * to go away while it is locked.  We'll bdput +	 * in __unlock_fs +	 */ +	return 0; +} + +static int __unlock_fs(struct mapped_device *md) +{ +	struct block_device *bdev; + +	if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) +		return 0; + +	bdev = bdget_disk(md->disk, 0); +	if (!bdev) { +		DMWARN("bdget failed in __unlock_fs"); +		return -ENOMEM; +	} + +	thaw_bdev(bdev, md->frozen_sb); +	md->frozen_sb = NULL; +	bdput(bdev); +	bdput(bdev); +	return 0; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem.  For example we might want to move some data in + * the background.  Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +int dm_suspend(struct mapped_device *md) +{ +	struct dm_table *map; +	DECLARE_WAITQUEUE(wait, current); + +	/* Flush I/O to the device. */ +	down_read(&md->lock); +	if (test_bit(DMF_BLOCK_IO, &md->flags)) { +		up_read(&md->lock); +		return -EINVAL; +	} + +	map = dm_get_table(md); +	if (map) +		dm_table_presuspend_targets(map); +	__lock_fs(md); + +	up_read(&md->lock); + +	/* +	 * First we set the BLOCK_IO flag so no more ios will be +	 * mapped. +	 */ +	down_write(&md->lock); +	if (test_bit(DMF_BLOCK_IO, &md->flags)) { +		/* +		 * If we get here we know another thread is +		 * trying to suspend as well, so we leave the fs +		 * locked for this thread. +		 */ +		up_write(&md->lock); +		return -EINVAL; +	} + +	set_bit(DMF_BLOCK_IO, &md->flags); +	add_wait_queue(&md->wait, &wait); +	up_write(&md->lock); + +	/* unplug */ +	if (map) { +		dm_table_unplug_all(map); +		dm_table_put(map); +	} + +	/* +	 * Then we wait for the already mapped ios to +	 * complete. +	 */ +	while (1) { +		set_current_state(TASK_INTERRUPTIBLE); + +		if (!atomic_read(&md->pending) || signal_pending(current)) +			break; + +		io_schedule(); +	} +	set_current_state(TASK_RUNNING); + +	down_write(&md->lock); +	remove_wait_queue(&md->wait, &wait); + +	/* were we interrupted ? */ +	if (atomic_read(&md->pending)) { +		__unlock_fs(md); +		clear_bit(DMF_BLOCK_IO, &md->flags); +		up_write(&md->lock); +		return -EINTR; +	} + +	set_bit(DMF_SUSPENDED, &md->flags); + +	map = dm_get_table(md); +	if (map) +		dm_table_postsuspend_targets(map); +	dm_table_put(map); +	up_write(&md->lock); + +	return 0; +} + +int dm_resume(struct mapped_device *md) +{ +	struct bio *def; +	struct dm_table *map = dm_get_table(md); + +	down_write(&md->lock); +	if (!map || +	    !test_bit(DMF_SUSPENDED, &md->flags) || +	    !dm_table_get_size(map)) { +		up_write(&md->lock); +		dm_table_put(map); +		return -EINVAL; +	} + +	dm_table_resume_targets(map); +	clear_bit(DMF_SUSPENDED, &md->flags); +	clear_bit(DMF_BLOCK_IO, &md->flags); + +	def = bio_list_get(&md->deferred); +	__flush_deferred_io(md, def); +	up_write(&md->lock); +	__unlock_fs(md); +	dm_table_unplug_all(map); +	dm_table_put(map); + +	return 0; +} + +/*----------------------------------------------------------------- + * Event notification. 
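+ *
+ * Typical (illustrative) use by a caller: remember the current count
+ * from dm_get_event_nr(md), then sleep in dm_wait_event(md, saved_nr);
+ * the wait finishes once a table event runs event_callback(), which
+ * bumps md->event_nr and wakes md->eventq.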
+ *---------------------------------------------------------------*/ +uint32_t dm_get_event_nr(struct mapped_device *md) +{ +	return atomic_read(&md->event_nr); +} + +int dm_wait_event(struct mapped_device *md, int event_nr) +{ +	return wait_event_interruptible(md->eventq, +			(event_nr != atomic_read(&md->event_nr))); +} + +/* + * The gendisk is only valid as long as you have a reference + * count on 'md'. + */ +struct gendisk *dm_disk(struct mapped_device *md) +{ +	return md->disk; +} + +int dm_suspended(struct mapped_device *md) +{ +	return test_bit(DMF_SUSPENDED, &md->flags); +} + +static struct block_device_operations dm_blk_dops = { +	.open = dm_blk_open, +	.release = dm_blk_close, +	.owner = THIS_MODULE +}; + +EXPORT_SYMBOL(dm_get_mapinfo); + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +module_param(major, uint, 0); +MODULE_PARM_DESC(major, "The major number of the device mapper"); +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h new file mode 100644 index 00000000000..e38c3fc1a1d --- /dev/null +++ b/drivers/md/dm.h @@ -0,0 +1,195 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include <linux/fs.h> +#include <linux/device-mapper.h> +#include <linux/list.h> +#include <linux/blkdev.h> + +#define DM_NAME "device-mapper" +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ +			  0 : scnprintf(result + sz, maxlen - sz, x)) + +/* + * FIXME: I think this should be with the definition of sector_t + * in types.h. + */ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lu" +#else +#define SECTOR_FORMAT "%lu" +#endif + +#define SECTOR_SHIFT 9 + +/* + * List of devices that a metadevice uses and should open/close. + */ +struct dm_dev { +	struct list_head list; + +	atomic_t count; +	int mode; +	struct block_device *bdev; +	char name[16]; +}; + +struct dm_table; +struct mapped_device; + +/*----------------------------------------------------------------- + * Functions for manipulating a struct mapped_device. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ +int dm_create(struct mapped_device **md); +int dm_create_with_minor(unsigned int minor, struct mapped_device **md); +void dm_set_mdptr(struct mapped_device *md, void *ptr); +void *dm_get_mdptr(dev_t dev); + +/* + * Reference counting for md. + */ +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md); +int dm_resume(struct mapped_device *md); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Drop a reference on the table when you've finished with the + * result. + */ +struct dm_table *dm_get_table(struct mapped_device *md); + +/* + * Event functions. 
+ */ +uint32_t dm_get_event_nr(struct mapped_device *md); +int dm_wait_event(struct mapped_device *md, int event_nr); + +/* + * Info functions. + */ +struct gendisk *dm_disk(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/*----------------------------------------------------------------- + * Functions for manipulating a table.  Tables are also reference + * counted. + *---------------------------------------------------------------*/ +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); + +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +int dm_table_add_target(struct dm_table *t, const char *type, +			sector_t start,	sector_t len, char *params); +int dm_table_complete(struct dm_table *t); +void dm_table_event_callback(struct dm_table *t, +			     void (*fn)(void *), void *context); +void dm_table_event(struct dm_table *t); +sector_t dm_table_get_size(struct dm_table *t); +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); +unsigned int dm_table_get_num_targets(struct dm_table *t); +struct list_head *dm_table_get_devices(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_postsuspend_targets(struct dm_table *t); +void dm_table_resume_targets(struct dm_table *t); +int dm_table_any_congested(struct dm_table *t, int bdi_bits); +void dm_table_unplug_all(struct dm_table *t); +int dm_table_flush_all(struct dm_table *t); + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *t); +int dm_target_iterate(void (*iter_func)(struct target_type *tt, +					void *param), void *param); + + +/*----------------------------------------------------------------- + * Useful inlines. + *---------------------------------------------------------------*/ +static inline int array_too_big(unsigned long fixed, unsigned long obj, +				unsigned long num) +{ +	return (num > (ULONG_MAX - fixed) / obj); +} + +/* + * Ceiling(n / sz) + */ +#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) + +#define dm_sector_div_up(n, sz) ( \ +{ \ +	sector_t _r = ((n) + (sz) - 1); \ +	sector_div(_r, (sz)); \ +	_r; \ +} \ +) + +/* + * ceiling(n / size) * size + */ +#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) + +static inline sector_t to_sector(unsigned long n) +{ +	return (n >> 9); +} + +static inline unsigned long to_bytes(sector_t n) +{ +	return (n << 9); +} + +int dm_split_args(int *argc, char ***argvp, char *input); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. 
+ */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); +union map_info *dm_get_mapinfo(struct bio *bio); + +#endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c new file mode 100644 index 00000000000..0248f8e7eac --- /dev/null +++ b/drivers/md/faulty.c @@ -0,0 +1,343 @@ +/* + * faulty.c : Multiple Devices driver for Linux + * + * Copyright (C) 2004 Neil Brown + * + * fautly-device-simulator personality for md + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +/* + * The "faulty" personality causes some requests to fail. + * + * Possible failure modes are: + *   reads fail "randomly" but succeed on retry + *   writes fail "randomly" but succeed on retry + *   reads for some address fail and then persist until a write + *   reads for some address fail and then persist irrespective of write + *   writes for some address fail and persist + *   all writes fail + * + * Different modes can be active at a time, but only + * one can be set at array creation.  Others can be added later. + * A mode can be one-shot or recurrent with the recurrance being + * once in every N requests. + * The bottom 5 bits of the "layout" indicate the mode.  The + * remainder indicate a period, or 0 for one-shot. + * + * There is an implementation limit on the number of concurrently + * persisting-faulty blocks. When a new fault is requested that would + * exceed the limit, it is ignored. + * All current faults can be clear using a layout of "0". + * + * Requests are always sent to the device.  If they are to fail, + * we clone the bio and insert a new b_end_io into the chain. 
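+ *
+ * For example (illustrative values): with ModeShift 5 and ModeMask 0x1f
+ * below, a layout of (10 << 5) | ReadTransient selects the "reads fail
+ * randomly" mode with a period of 10, i.e. roughly one read in every
+ * ten fails; a layout of just WritePersistent (period 0) arms a
+ * one-shot trigger, so the next write picks up a persistent fault at
+ * its sector.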
+ */ + +#define	WriteTransient	0 +#define	ReadTransient	1 +#define	WritePersistent	2 +#define	ReadPersistent	3 +#define	WriteAll	4 /* doesn't go to device */ +#define	ReadFixable	5 +#define	Modes	6 + +#define	ClearErrors	31 +#define	ClearFaults	30 + +#define AllPersist	100 /* internal use only */ +#define	NoPersist	101 + +#define	ModeMask	0x1f +#define	ModeShift	5 + +#define MaxFault	50 +#include <linux/raid/md.h> + + +static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error) +{ +	struct bio *b = bio->bi_private; + +	b->bi_size = bio->bi_size; +	b->bi_sector = bio->bi_sector; + +	if (bio->bi_size == 0) +		bio_put(bio); + +	clear_bit(BIO_UPTODATE, &b->bi_flags); +	return (b->bi_end_io)(b, bytes_done, -EIO); +} + +typedef struct faulty_conf { +	int period[Modes]; +	atomic_t counters[Modes]; +	sector_t faults[MaxFault]; +	int	modes[MaxFault]; +	int nfaults; +	mdk_rdev_t *rdev; +} conf_t; + +static int check_mode(conf_t *conf, int mode) +{ +	if (conf->period[mode] == 0 && +	    atomic_read(&conf->counters[mode]) <= 0) +		return 0; /* no failure, no decrement */ + + +	if (atomic_dec_and_test(&conf->counters[mode])) { +		if (conf->period[mode]) +			atomic_set(&conf->counters[mode], conf->period[mode]); +		return 1; +	} +	return 0; +} + +static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir) +{ +	/* If we find a ReadFixable sector, we fix it ... */ +	int i; +	for (i=0; i<conf->nfaults; i++) +		if (conf->faults[i] >= start && +		    conf->faults[i] < end) { +			/* found it ... */ +			switch (conf->modes[i] * 2 + dir) { +			case WritePersistent*2+WRITE: return 1; +			case ReadPersistent*2+READ: return 1; +			case ReadFixable*2+READ: return 1; +			case ReadFixable*2+WRITE: +				conf->modes[i] = NoPersist; +				return 0; +			case AllPersist*2+READ: +			case AllPersist*2+WRITE: return 1; +			default: +				return 0; +			} +		} +	return 0; +} + +static void add_sector(conf_t *conf, sector_t start, int mode) +{ +	int i; +	int n = conf->nfaults; +	for (i=0; i<conf->nfaults; i++) +		if (conf->faults[i] == start) { +			switch(mode) { +			case NoPersist: conf->modes[i] = mode; return; +			case WritePersistent: +				if (conf->modes[i] == ReadPersistent || +				    conf->modes[i] == ReadFixable) +					conf->modes[i] = AllPersist; +				else +					conf->modes[i] = WritePersistent; +				return; +			case ReadPersistent: +				if (conf->modes[i] == WritePersistent) +					conf->modes[i] = AllPersist; +				else +					conf->modes[i] = ReadPersistent; +				return; +			case ReadFixable: +				if (conf->modes[i] == WritePersistent || +				    conf->modes[i] == ReadPersistent) +					conf->modes[i] = AllPersist; +				else +					conf->modes[i] = ReadFixable; +				return; +			} +		} else if (conf->modes[i] == NoPersist) +			n = i; + +	if (n >= MaxFault) +		return; +	conf->faults[n] = start; +	conf->modes[n] = mode; +	if (conf->nfaults == n) +		conf->nfaults = n+1; +} + +static int make_request(request_queue_t *q, struct bio *bio) +{ +	mddev_t *mddev = q->queuedata; +	conf_t *conf = (conf_t*)mddev->private; +	int failit = 0; + +	if (bio->bi_rw & 1) { +		/* write request */ +		if (atomic_read(&conf->counters[WriteAll])) { +			/* special case - don't decrement, don't generic_make_request, +			 * just fail immediately +			 */ +			bio_endio(bio, bio->bi_size, -EIO); +			return 0; +		} + +		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), +				 WRITE)) +			failit = 1; +		if (check_mode(conf, WritePersistent)) { +			add_sector(conf, bio->bi_sector, WritePersistent); +			
failit = 1; +		} +		if (check_mode(conf, WriteTransient)) +			failit = 1; +	} else { +		/* read request */ +		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), +				 READ)) +			failit = 1; +		if (check_mode(conf, ReadTransient)) +			failit = 1; +		if (check_mode(conf, ReadPersistent)) { +			add_sector(conf, bio->bi_sector, ReadPersistent); +			failit = 1; +		} +		if (check_mode(conf, ReadFixable)) { +			add_sector(conf, bio->bi_sector, ReadFixable); +			failit = 1; +		} +	} +	if (failit) { +		struct bio *b = bio_clone(bio, GFP_NOIO); +		b->bi_bdev = conf->rdev->bdev; +		b->bi_private = bio; +		b->bi_end_io = faulty_fail; +		generic_make_request(b); +		return 0; +	} else { +		bio->bi_bdev = conf->rdev->bdev; +		return 1; +	} +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ +	conf_t *conf = (conf_t*)mddev->private; +	int n; + +	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) +		seq_printf(seq, " WriteTransient=%d(%d)", +			   n, conf->period[WriteTransient]); + +	if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) +		seq_printf(seq, " ReadTransient=%d(%d)", +			   n, conf->period[ReadTransient]); + +	if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) +		seq_printf(seq, " WritePersistent=%d(%d)", +			   n, conf->period[WritePersistent]); + +	if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) +		seq_printf(seq, " ReadPersistent=%d(%d)", +			   n, conf->period[ReadPersistent]); + + +	if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) +		seq_printf(seq, " ReadFixable=%d(%d)", +			   n, conf->period[ReadFixable]); + +	if ((n=atomic_read(&conf->counters[WriteAll])) != 0) +		seq_printf(seq, " WriteAll"); + +	seq_printf(seq, " nfaults=%d", conf->nfaults); +} + + +static int reconfig(mddev_t *mddev, int layout, int chunk_size) +{ +	int mode = layout & ModeMask; +	int count = layout >> ModeShift; +	conf_t *conf = mddev->private; + +	if (chunk_size != -1) +		return -EINVAL; + +	/* new layout */ +	if (mode == ClearFaults) +		conf->nfaults = 0; +	else if (mode == ClearErrors) { +		int i; +		for (i=0 ; i < Modes ; i++) { +			conf->period[i] = 0; +			atomic_set(&conf->counters[i], 0); +		} +	} else if (mode < Modes) { +		conf->period[mode] = count; +		if (!count) count++; +		atomic_set(&conf->counters[mode], count); +	} else +		return -EINVAL; +	mddev->layout = -1; /* makes sure further changes come through */ +	return 0; +} + +static int run(mddev_t *mddev) +{ +	mdk_rdev_t *rdev; +	struct list_head *tmp; +	int i; + +	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); + +	for (i=0; i<Modes; i++) { +		atomic_set(&conf->counters[i], 0); +		conf->period[i] = 0; +	} +	conf->nfaults = 0; + +	ITERATE_RDEV(mddev, rdev, tmp) +		conf->rdev = rdev; + +	mddev->array_size = mddev->size; +	mddev->private = conf; + +	reconfig(mddev, mddev->layout, -1); + +	return 0; +} + +static int stop(mddev_t *mddev) +{ +	conf_t *conf = (conf_t *)mddev->private; + +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + +static mdk_personality_t faulty_personality = +{ +	.name		= "faulty", +	.owner		= THIS_MODULE, +	.make_request	= make_request, +	.run		= run, +	.stop		= stop, +	.status		= status, +	.reconfig	= reconfig, +}; + +static int __init raid_init(void) +{ +	return register_md_personality(FAULTY, &faulty_personality); +} + +static void raid_exit(void) +{ +	unregister_md_personality(FAULTY); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-10"); /* faulty */ diff --git 
a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c new file mode 100644 index 00000000000..eb703648597 --- /dev/null +++ b/drivers/md/kcopyd.c @@ -0,0 +1,687 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + * + * Kcopyd provides a simple interface for copying an area of one + * block-device to one or more other block-devices, with an asynchronous + * completion notification. + */ + +#include <asm/atomic.h> + +#include <linux/blkdev.h> +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include "kcopyd.h" + +static struct workqueue_struct *_kcopyd_wq; +static struct work_struct _kcopyd_work; + +static inline void wake(void) +{ +	queue_work(_kcopyd_wq, &_kcopyd_work); +} + +/*----------------------------------------------------------------- + * Each kcopyd client has its own little pool of preallocated + * pages for kcopyd io. + *---------------------------------------------------------------*/ +struct kcopyd_client { +	struct list_head list; + +	spinlock_t lock; +	struct page_list *pages; +	unsigned int nr_pages; +	unsigned int nr_free_pages; +}; + +static struct page_list *alloc_pl(void) +{ +	struct page_list *pl; + +	pl = kmalloc(sizeof(*pl), GFP_KERNEL); +	if (!pl) +		return NULL; + +	pl->page = alloc_page(GFP_KERNEL); +	if (!pl->page) { +		kfree(pl); +		return NULL; +	} + +	return pl; +} + +static void free_pl(struct page_list *pl) +{ +	__free_page(pl->page); +	kfree(pl); +} + +static int kcopyd_get_pages(struct kcopyd_client *kc, +			    unsigned int nr, struct page_list **pages) +{ +	struct page_list *pl; + +	spin_lock(&kc->lock); +	if (kc->nr_free_pages < nr) { +		spin_unlock(&kc->lock); +		return -ENOMEM; +	} + +	kc->nr_free_pages -= nr; +	for (*pages = pl = kc->pages; --nr; pl = pl->next) +		; + +	kc->pages = pl->next; +	pl->next = NULL; + +	spin_unlock(&kc->lock); + +	return 0; +} + +static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl) +{ +	struct page_list *cursor; + +	spin_lock(&kc->lock); +	for (cursor = pl; cursor->next; cursor = cursor->next) +		kc->nr_free_pages++; + +	kc->nr_free_pages++; +	cursor->next = kc->pages; +	kc->pages = pl; +	spin_unlock(&kc->lock); +} + +/* + * These three functions resize the page pool. + */ +static void drop_pages(struct page_list *pl) +{ +	struct page_list *next; + +	while (pl) { +		next = pl->next; +		free_pl(pl); +		pl = next; +	} +} + +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) +{ +	unsigned int i; +	struct page_list *pl = NULL, *next; + +	for (i = 0; i < nr; i++) { +		next = alloc_pl(); +		if (!next) { +			if (pl) +				drop_pages(pl); +			return -ENOMEM; +		} +		next->next = pl; +		pl = next; +	} + +	kcopyd_put_pages(kc, pl); +	kc->nr_pages += nr; +	return 0; +} + +static void client_free_pages(struct kcopyd_client *kc) +{ +	BUG_ON(kc->nr_free_pages != kc->nr_pages); +	drop_pages(kc->pages); +	kc->pages = NULL; +	kc->nr_free_pages = kc->nr_pages = 0; +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a deadlock). 
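+ *
+ * The job pool below is primed with MIN_JOBS preallocated entries in
+ * jobs_init(), so the GFP_NOIO mempool_alloc() in kcopyd_copy() can
+ * always fall back to a reserved entry when memory is tight.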
+ *---------------------------------------------------------------*/ +struct kcopyd_job { +	struct kcopyd_client *kc; +	struct list_head list; +	unsigned long flags; + +	/* +	 * Error state of the job. +	 */ +	int read_err; +	unsigned int write_err; + +	/* +	 * Either READ or WRITE +	 */ +	int rw; +	struct io_region source; + +	/* +	 * The destinations for the transfer. +	 */ +	unsigned int num_dests; +	struct io_region dests[KCOPYD_MAX_REGIONS]; + +	sector_t offset; +	unsigned int nr_pages; +	struct page_list *pages; + +	/* +	 * Set this to ensure you are notified when the job has +	 * completed.  'context' is for callback to use. +	 */ +	kcopyd_notify_fn fn; +	void *context; + +	/* +	 * These fields are only used if the job has been split +	 * into more manageable parts. +	 */ +	struct semaphore lock; +	atomic_t sub_jobs; +	sector_t progress; +}; + +/* FIXME: this should scale with the number of pages */ +#define MIN_JOBS 512 + +static kmem_cache_t *_job_cache; +static mempool_t *_job_pool; + +/* + * We maintain three lists of jobs: + * + * i)   jobs waiting for pages + * ii)  jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ +static DEFINE_SPINLOCK(_job_lock); + +static LIST_HEAD(_complete_jobs); +static LIST_HEAD(_io_jobs); +static LIST_HEAD(_pages_jobs); + +static int jobs_init(void) +{ +	_job_cache = kmem_cache_create("kcopyd-jobs", +				       sizeof(struct kcopyd_job), +				       __alignof__(struct kcopyd_job), +				       0, NULL, NULL); +	if (!_job_cache) +		return -ENOMEM; + +	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, +				   mempool_free_slab, _job_cache); +	if (!_job_pool) { +		kmem_cache_destroy(_job_cache); +		return -ENOMEM; +	} + +	return 0; +} + +static void jobs_exit(void) +{ +	BUG_ON(!list_empty(&_complete_jobs)); +	BUG_ON(!list_empty(&_io_jobs)); +	BUG_ON(!list_empty(&_pages_jobs)); + +	mempool_destroy(_job_pool); +	kmem_cache_destroy(_job_cache); +	_job_pool = NULL; +	_job_cache = NULL; +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. + */ +static inline struct kcopyd_job *pop(struct list_head *jobs) +{ +	struct kcopyd_job *job = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&_job_lock, flags); + +	if (!list_empty(jobs)) { +		job = list_entry(jobs->next, struct kcopyd_job, list); +		list_del(&job->list); +	} +	spin_unlock_irqrestore(&_job_lock, flags); + +	return job; +} + +static inline void push(struct list_head *jobs, struct kcopyd_job *job) +{ +	unsigned long flags; + +	spin_lock_irqsave(&_job_lock, flags); +	list_add_tail(&job->list, jobs); +	spin_unlock_irqrestore(&_job_lock, flags); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + *   0: success + * > 0: can't process yet. 
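+ *
+ * process_jobs() below acts on these values: a negative return fails
+ * the job and completes it with an error, zero counts as success, and
+ * a positive return pushes the job back and stops walking the list
+ * until the next wake-up.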
+ */ +static int run_complete_job(struct kcopyd_job *job) +{ +	void *context = job->context; +	int read_err = job->read_err; +	unsigned int write_err = job->write_err; +	kcopyd_notify_fn fn = job->fn; + +	kcopyd_put_pages(job->kc, job->pages); +	mempool_free(job, _job_pool); +	fn(read_err, write_err, context); +	return 0; +} + +static void complete_io(unsigned long error, void *context) +{ +	struct kcopyd_job *job = (struct kcopyd_job *) context; + +	if (error) { +		if (job->rw == WRITE) +			job->write_err &= error; +		else +			job->read_err = 1; + +		if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { +			push(&_complete_jobs, job); +			wake(); +			return; +		} +	} + +	if (job->rw == WRITE) +		push(&_complete_jobs, job); + +	else { +		job->rw = WRITE; +		push(&_io_jobs, job); +	} + +	wake(); +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ +	int r; + +	if (job->rw == READ) +		r = dm_io_async(1, &job->source, job->rw, +				job->pages, +				job->offset, complete_io, job); + +	else +		r = dm_io_async(job->num_dests, job->dests, job->rw, +				job->pages, +				job->offset, complete_io, job); + +	return r; +} + +static int run_pages_job(struct kcopyd_job *job) +{ +	int r; + +	job->nr_pages = dm_div_up(job->dests[0].count + job->offset, +				  PAGE_SIZE >> 9); +	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); +	if (!r) { +		/* this job is ready for io */ +		push(&_io_jobs, job); +		return 0; +	} + +	if (r == -ENOMEM) +		/* can't complete now */ +		return 1; + +	return r; +} + +/* + * Run through a list for as long as possible.  Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) +{ +	struct kcopyd_job *job; +	int r, count = 0; + +	while ((job = pop(jobs))) { + +		r = fn(job); + +		if (r < 0) { +			/* error this rogue job */ +			if (job->rw == WRITE) +				job->write_err = (unsigned int) -1; +			else +				job->read_err = 1; +			push(&_complete_jobs, job); +			break; +		} + +		if (r > 0) { +			/* +			 * We couldn't service this job ATM, so +			 * push this job back onto the list. +			 */ +			push(jobs, job); +			break; +		} + +		count++; +	} + +	return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(void *ignored) +{ +	/* +	 * The order that these are called is *very* important. +	 * complete jobs can free some pages for pages jobs. +	 * Pages jobs when successful will jump onto the io jobs +	 * list.  io jobs call wake when they complete and it all +	 * starts again. +	 */ +	process_jobs(&_complete_jobs, run_complete_job); +	process_jobs(&_pages_jobs, run_pages_job); +	process_jobs(&_io_jobs, run_io_job); +} + +/* + * If we are copying a small region we just dispatch a single job + * to do the copy, otherwise the io has to be split up into many + * jobs. + */ +static void dispatch_job(struct kcopyd_job *job) +{ +	push(&_pages_jobs, job); +	wake(); +} + +#define SUB_JOB_SIZE 128 +static void segment_complete(int read_err, +			     unsigned int write_err, void *context) +{ +	/* FIXME: tidy this function */ +	sector_t progress = 0; +	sector_t count = 0; +	struct kcopyd_job *job = (struct kcopyd_job *) context; + +	down(&job->lock); + +	/* update the error */ +	if (read_err) +		job->read_err = 1; + +	if (write_err) +		job->write_err &= write_err; + +	/* +	 * Only dispatch more work if there hasn't been an error. 
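+	 * Each sub-job covers at most SUB_JOB_SIZE (128) sectors and
+	 * split_job() keeps SPLIT_COUNT (8) of them in flight;
+	 * job->progress records how far into the source the next
+	 * sub-job should start.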
+	 */ +	if ((!job->read_err && !job->write_err) || +	    test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { +		/* get the next chunk of work */ +		progress = job->progress; +		count = job->source.count - progress; +		if (count) { +			if (count > SUB_JOB_SIZE) +				count = SUB_JOB_SIZE; + +			job->progress += count; +		} +	} +	up(&job->lock); + +	if (count) { +		int i; +		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); + +		*sub_job = *job; +		sub_job->source.sector += progress; +		sub_job->source.count = count; + +		for (i = 0; i < job->num_dests; i++) { +			sub_job->dests[i].sector += progress; +			sub_job->dests[i].count = count; +		} + +		sub_job->fn = segment_complete; +		sub_job->context = job; +		dispatch_job(sub_job); + +	} else if (atomic_dec_and_test(&job->sub_jobs)) { + +		/* +		 * To avoid a race we must keep the job around +		 * until after the notify function has completed. +		 * Otherwise the client may try and stop the job +		 * after we've completed. +		 */ +		job->fn(read_err, write_err, job->context); +		mempool_free(job, _job_pool); +	} +} + +/* + * Create some little jobs that will do the move between + * them. + */ +#define SPLIT_COUNT 8 +static void split_job(struct kcopyd_job *job) +{ +	int i; + +	atomic_set(&job->sub_jobs, SPLIT_COUNT); +	for (i = 0; i < SPLIT_COUNT; i++) +		segment_complete(0, 0u, job); +} + +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, +		unsigned int num_dests, struct io_region *dests, +		unsigned int flags, kcopyd_notify_fn fn, void *context) +{ +	struct kcopyd_job *job; + +	/* +	 * Allocate a new job. +	 */ +	job = mempool_alloc(_job_pool, GFP_NOIO); + +	/* +	 * set up for the read. +	 */ +	job->kc = kc; +	job->flags = flags; +	job->read_err = 0; +	job->write_err = 0; +	job->rw = READ; + +	job->source = *from; + +	job->num_dests = num_dests; +	memcpy(&job->dests, dests, sizeof(*dests) * num_dests); + +	job->offset = 0; +	job->nr_pages = 0; +	job->pages = NULL; + +	job->fn = fn; +	job->context = context; + +	if (job->source.count < SUB_JOB_SIZE) +		dispatch_job(job); + +	else { +		init_MUTEX(&job->lock); +		job->progress = 0; +		split_job(job); +	} + +	return 0; +} + +/* + * Cancels a kcopyd job, eg. someone might be deactivating a + * mirror. + */ +int kcopyd_cancel(struct kcopyd_job *job, int block) +{ +	/* FIXME: finish */ +	return -1; +} + +/*----------------------------------------------------------------- + * Unit setup + *---------------------------------------------------------------*/ +static DECLARE_MUTEX(_client_lock); +static LIST_HEAD(_clients); + +static void client_add(struct kcopyd_client *kc) +{ +	down(&_client_lock); +	list_add(&kc->list, &_clients); +	up(&_client_lock); +} + +static void client_del(struct kcopyd_client *kc) +{ +	down(&_client_lock); +	list_del(&kc->list); +	up(&_client_lock); +} + +static DECLARE_MUTEX(kcopyd_init_lock); +static int kcopyd_clients = 0; + +static int kcopyd_init(void) +{ +	int r; + +	down(&kcopyd_init_lock); + +	if (kcopyd_clients) { +		/* Already initialized. 
*/ +		kcopyd_clients++; +		up(&kcopyd_init_lock); +		return 0; +	} + +	r = jobs_init(); +	if (r) { +		up(&kcopyd_init_lock); +		return r; +	} + +	_kcopyd_wq = create_singlethread_workqueue("kcopyd"); +	if (!_kcopyd_wq) { +		jobs_exit(); +		up(&kcopyd_init_lock); +		return -ENOMEM; +	} + +	kcopyd_clients++; +	INIT_WORK(&_kcopyd_work, do_work, NULL); +	up(&kcopyd_init_lock); +	return 0; +} + +static void kcopyd_exit(void) +{ +	down(&kcopyd_init_lock); +	kcopyd_clients--; +	if (!kcopyd_clients) { +		jobs_exit(); +		destroy_workqueue(_kcopyd_wq); +		_kcopyd_wq = NULL; +	} +	up(&kcopyd_init_lock); +} + +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) +{ +	int r = 0; +	struct kcopyd_client *kc; + +	r = kcopyd_init(); +	if (r) +		return r; + +	kc = kmalloc(sizeof(*kc), GFP_KERNEL); +	if (!kc) { +		kcopyd_exit(); +		return -ENOMEM; +	} + +	spin_lock_init(&kc->lock); +	kc->pages = NULL; +	kc->nr_pages = kc->nr_free_pages = 0; +	r = client_alloc_pages(kc, nr_pages); +	if (r) { +		kfree(kc); +		kcopyd_exit(); +		return r; +	} + +	r = dm_io_get(nr_pages); +	if (r) { +		client_free_pages(kc); +		kfree(kc); +		kcopyd_exit(); +		return r; +	} + +	client_add(kc); +	*result = kc; +	return 0; +} + +void kcopyd_client_destroy(struct kcopyd_client *kc) +{ +	dm_io_put(kc->nr_pages); +	client_free_pages(kc); +	client_del(kc); +	kfree(kc); +	kcopyd_exit(); +} + +EXPORT_SYMBOL(kcopyd_client_create); +EXPORT_SYMBOL(kcopyd_client_destroy); +EXPORT_SYMBOL(kcopyd_copy); +EXPORT_SYMBOL(kcopyd_cancel); diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h new file mode 100644 index 00000000000..4621ea055c0 --- /dev/null +++ b/drivers/md/kcopyd.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2001 Sistina Software + * + * This file is released under the GPL. + * + * Kcopyd provides a simple interface for copying an area of one + * block-device to one or more other block-devices, with an asynchronous + * completion notification. + */ + +#ifndef DM_KCOPYD_H +#define DM_KCOPYD_H + +#include "dm-io.h" + +/* FIXME: make this configurable */ +#define KCOPYD_MAX_REGIONS 8 + +#define KCOPYD_IGNORE_ERROR 1 + +/* + * To use kcopyd you must first create a kcopyd client object. + */ +struct kcopyd_client; +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); +void kcopyd_client_destroy(struct kcopyd_client *kc); + +/* + * Submit a copy job to kcopyd.  This is built on top of the + * previous three fns. + * + * read_err is a boolean, + * write_err is a bitset, with 1 bit for each destination region + */ +typedef void (*kcopyd_notify_fn)(int read_err, +				 unsigned int write_err, void *context); + +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, +		unsigned int num_dests, struct io_region *dests, +		unsigned int flags, kcopyd_notify_fn fn, void *context); + +#endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c new file mode 100644 index 00000000000..161e9aa8729 --- /dev/null +++ b/drivers/md/linear.c @@ -0,0 +1,343 @@ +/* +   linear.c : Multiple Devices driver for Linux +	      Copyright (C) 1994-96 Marc ZYNGIER +	      <zyngier@ufr-info-p7.ibp.fr> or +	      <maz@gloups.fdn.fr> + +   Linear mode management functions. + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 2, or (at your option) +   any later version. 
+    +   You should have received a copy of the GNU General Public License +   (for example /usr/src/linux/COPYING); if not, write to the Free +   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   +*/ + +#include <linux/module.h> + +#include <linux/raid/md.h> +#include <linux/slab.h> +#include <linux/raid/linear.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +/* + * find which device holds a particular offset  + */ +static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) +{ +	dev_info_t *hash; +	linear_conf_t *conf = mddev_to_conf(mddev); +	sector_t block = sector >> 1; + +	/* +	 * sector_div(a,b) returns the remainer and sets a to a/b +	 */ +	(void)sector_div(block, conf->smallest->size); +	hash = conf->hash_table[block]; + +	while ((sector>>1) >= (hash->size + hash->offset)) +		hash++; +	return hash; +} + +/** + *	linear_mergeable_bvec -- tell bio layer if a two requests can be merged + *	@q: request queue + *	@bio: the buffer head that's been built up so far + *	@biovec: the request that could be merged to it. + * + *	Return amount of bytes we can take at this offset + */ +static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ +	mddev_t *mddev = q->queuedata; +	dev_info_t *dev0; +	unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; +	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + +	dev0 = which_dev(mddev, sector); +	maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); + +	if (maxsectors < bio_sectors) +		maxsectors = 0; +	else +		maxsectors -= bio_sectors; + +	if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) +		return biovec->bv_len; +	/* The bytes available at this offset could be really big, +	 * so we cap at 2^31 to avoid overflow */ +	if (maxsectors > (1 << (31-9))) +		return 1<<31; +	return maxsectors << 9; +} + +static void linear_unplug(request_queue_t *q) +{ +	mddev_t *mddev = q->queuedata; +	linear_conf_t *conf = mddev_to_conf(mddev); +	int i; + +	for (i=0; i < mddev->raid_disks; i++) { +		request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); +		if (r_queue->unplug_fn) +			r_queue->unplug_fn(r_queue); +	} +} + +static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, +			      sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	linear_conf_t *conf = mddev_to_conf(mddev); +	int i, ret = 0; + +	for (i=0; i < mddev->raid_disks && ret == 0; i++) { +		struct block_device *bdev = conf->disks[i].rdev->bdev; +		request_queue_t *r_queue = bdev_get_queue(bdev); + +		if (!r_queue->issue_flush_fn) +			ret = -EOPNOTSUPP; +		else +			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); +	} +	return ret; +} + +static int linear_run (mddev_t *mddev) +{ +	linear_conf_t *conf; +	dev_info_t **table; +	mdk_rdev_t *rdev; +	int i, nb_zone, cnt; +	sector_t start; +	sector_t curr_offset; +	struct list_head *tmp; + +	conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), +			GFP_KERNEL); +	if (!conf) +		goto out; +	memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); +	mddev->private = conf; + +	/* +	 * Find the smallest device. +	 */ + +	conf->smallest = NULL; +	cnt = 0; +	mddev->array_size = 0; + +	ITERATE_RDEV(mddev,rdev,tmp) { +		int j = rdev->raid_disk; +		dev_info_t *disk = conf->disks + j; + +		if (j < 0 || j > mddev->raid_disks || disk->rdev) { +			printk("linear: disk numbering problem. 
Aborting!\n"); +			goto out; +		} + +		disk->rdev = rdev; + +		blk_queue_stack_limits(mddev->queue, +				       rdev->bdev->bd_disk->queue); +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, so limit ->max_sector to one PAGE, as +		 * a one page request is never in violation. +		 */ +		if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +		    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +		disk->size = rdev->size; +		mddev->array_size += rdev->size; + +		if (!conf->smallest || (disk->size < conf->smallest->size)) +			conf->smallest = disk; +		cnt++; +	} +	if (cnt != mddev->raid_disks) { +		printk("linear: not enough drives present. Aborting!\n"); +		goto out; +	} + +	/* +	 * This code was restructured to work around a gcc-2.95.3 internal +	 * compiler error.  Alter it with care. +	 */ +	{ +		sector_t sz; +		unsigned round; +		unsigned long base; + +		sz = mddev->array_size; +		base = conf->smallest->size; +		round = sector_div(sz, base); +		nb_zone = conf->nr_zones = sz + (round ? 1 : 0); +	} +			 +	conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, +					GFP_KERNEL); +	if (!conf->hash_table) +		goto out; + +	/* +	 * Here we generate the linear hash table +	 */ +	table = conf->hash_table; +	start = 0; +	curr_offset = 0; +	for (i = 0; i < cnt; i++) { +		dev_info_t *disk = conf->disks + i; + +		disk->offset = curr_offset; +		curr_offset += disk->size; + +		/* 'curr_offset' is the end of this disk +		 * 'start' is the start of table +		 */ +		while (start < curr_offset) { +			*table++ = disk; +			start += conf->smallest->size; +		} +	} +	if (table-conf->hash_table != nb_zone) +		BUG(); + +	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); +	mddev->queue->unplug_fn = linear_unplug; +	mddev->queue->issue_flush_fn = linear_issue_flush; +	return 0; + +out: +	if (conf) +		kfree(conf); +	return 1; +} + +static int linear_stop (mddev_t *mddev) +{ +	linear_conf_t *conf = mddev_to_conf(mddev); +   +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	kfree(conf->hash_table); +	kfree(conf); + +	return 0; +} + +static int linear_make_request (request_queue_t *q, struct bio *bio) +{ +	mddev_t *mddev = q->queuedata; +	dev_info_t *tmp_dev; +	sector_t block; + +	if (bio_data_dir(bio)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); +	} + +	tmp_dev = which_dev(mddev, bio->bi_sector); +	block = bio->bi_sector >> 1; +     +	if (unlikely(block >= (tmp_dev->size + tmp_dev->offset) +		     || block < tmp_dev->offset)) { +		char b[BDEVNAME_SIZE]; + +		printk("linear_make_request: Block %llu out of bounds on " +			"dev %s size %llu offset %llu\n", +			(unsigned long long)block, +			bdevname(tmp_dev->rdev->bdev, b), +			(unsigned long long)tmp_dev->size, +		        (unsigned long long)tmp_dev->offset); +		bio_io_error(bio, bio->bi_size); +		return 0; +	} +	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > +		     (tmp_dev->offset + tmp_dev->size)<<1)) { +		/* This bio crosses a device boundary, so we have to +		 * split it. 
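+		 * Each half produced by bio_split() is fed back through
+		 * linear_make_request() (and then generic_make_request())
+		 * rather than being remapped here directly.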
+		 */ +		struct bio_pair *bp; +		bp = bio_split(bio, bio_split_pool,  +			       (bio->bi_sector + (bio->bi_size >> 9) - +				(tmp_dev->offset + tmp_dev->size))<<1); +		if (linear_make_request(q, &bp->bio1)) +			generic_make_request(&bp->bio1); +		if (linear_make_request(q, &bp->bio2)) +			generic_make_request(&bp->bio2); +		bio_pair_release(bp); +		return 0; +	} +		     +	bio->bi_bdev = tmp_dev->rdev->bdev; +	bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset; + +	return 1; +} + +static void linear_status (struct seq_file *seq, mddev_t *mddev) +{ + +#undef MD_DEBUG +#ifdef MD_DEBUG +	int j; +	linear_conf_t *conf = mddev_to_conf(mddev); +	sector_t s = 0; +   +	seq_printf(seq, "      "); +	for (j = 0; j < conf->nr_zones; j++) +	{ +		char b[BDEVNAME_SIZE]; +		s += conf->smallest_size; +		seq_printf(seq, "[%s", +			   bdevname(conf->hash_table[j][0].rdev->bdev,b)); + +		while (s > conf->hash_table[j][0].offset + +		           conf->hash_table[j][0].size) +			seq_printf(seq, "/%s] ", +				   bdevname(conf->hash_table[j][1].rdev->bdev,b)); +		else +			seq_printf(seq, "] "); +	} +	seq_printf(seq, "\n"); +#endif +	seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); +} + + +static mdk_personality_t linear_personality= +{ +	.name		= "linear", +	.owner		= THIS_MODULE, +	.make_request	= linear_make_request, +	.run		= linear_run, +	.stop		= linear_stop, +	.status		= linear_status, +}; + +static int __init linear_init (void) +{ +	return register_md_personality (LINEAR, &linear_personality); +} + +static void linear_exit (void) +{ +	unregister_md_personality (LINEAR); +} + + +module_init(linear_init); +module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-1"); /* LINEAR */ diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 00000000000..04562add192 --- /dev/null +++ b/drivers/md/md.c @@ -0,0 +1,3766 @@ +/* +   md.c : Multiple Devices driver for Linux +	  Copyright (C) 1998, 1999, 2000 Ingo Molnar + +     completely rewritten, based on the MD driver code from Marc Zyngier + +   Changes: + +   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar +   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> +   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> +   - kerneld support by Boris Tobotras <boris@xtalk.msk.su> +   - kmod support by: Cyrus Durgin +   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> +   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> + +   - lots of fixes and improvements to the RAID1/RAID5 and generic +     RAID code (such as request based resynchronization): + +     Neil Brown <neilb@cse.unsw.edu.au>. + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 2, or (at your option) +   any later version. + +   You should have received a copy of the GNU General Public License +   (for example /usr/src/linux/COPYING); if not, write to the Free +   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/suspend.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 + +#define DEBUG 0 +#define dprintk(x...) ((void)(DEBUG && printk(x))) + + +#ifndef MODULE +static void autostart_arrays (int part); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; +static DEFINE_SPINLOCK(pers_lock); + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { +	{ +		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN, +		.procname	= "speed_limit_min", +		.data		= &sysctl_speed_limit_min, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX, +		.procname	= "speed_limit_max", +		.data		= &sysctl_speed_limit_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ .ctl_name = 0 } +}; + +static ctl_table raid_dir_table[] = { +	{ +		.ctl_name	= DEV_RAID, +		.procname	= "raid", +		.maxlen		= 0, +		.mode		= 0555, +		.child		= raid_table, +	}, +	{ .ctl_name = 0 } +}; + +static ctl_table raid_root_table[] = { +	{ +		.ctl_name	= CTL_DEV, +		.procname	= "dev", +		.maxlen		= 0, +		.mode		= 0555, +		.child		= raid_dir_table, +	}, +	{ .ctl_name = 0 } +}; + +static struct block_device_operations md_fops; + +/* + * Enables to iterate over all existing md arrays + * all_mddevs_lock protects this list. + */ +static LIST_HEAD(all_mddevs); +static DEFINE_SPINLOCK(all_mddevs_lock); + + +/* + * iterates through all used mddevs in the system. + * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while own + * a reference to the current mddev and must mddev_put it. 
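+ * In other words: if you break out of the loop early you still hold a
+ * reference to the current mddev and must drop it with mddev_put()
+ * yourself.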
+ */ +#define ITERATE_MDDEV(mddev,tmp)					\ +									\ +	for (({ spin_lock(&all_mddevs_lock); 				\ +		tmp = all_mddevs.next;					\ +		mddev = NULL;});					\ +	     ({ if (tmp != &all_mddevs)					\ +			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ +		spin_unlock(&all_mddevs_lock);				\ +		if (mddev) mddev_put(mddev);				\ +		mddev = list_entry(tmp, mddev_t, all_mddevs);		\ +		tmp != &all_mddevs;});					\ +	     ({ spin_lock(&all_mddevs_lock);				\ +		tmp = tmp->next;})					\ +		) + + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ +	bio_io_error(bio, bio->bi_size); +	return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ +	atomic_inc(&mddev->active); +	return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ +	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) +		return; +	if (!mddev->raid_disks && list_empty(&mddev->disks)) { +		list_del(&mddev->all_mddevs); +		blk_put_queue(mddev->queue); +		kfree(mddev); +	} +	spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(dev_t unit) +{ +	mddev_t *mddev, *new = NULL; + + retry: +	spin_lock(&all_mddevs_lock); +	list_for_each_entry(mddev, &all_mddevs, all_mddevs) +		if (mddev->unit == unit) { +			mddev_get(mddev); +			spin_unlock(&all_mddevs_lock); +			if (new) +				kfree(new); +			return mddev; +		} + +	if (new) { +		list_add(&new->all_mddevs, &all_mddevs); +		spin_unlock(&all_mddevs_lock); +		return new; +	} +	spin_unlock(&all_mddevs_lock); + +	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); +	if (!new) +		return NULL; + +	memset(new, 0, sizeof(*new)); + +	new->unit = unit; +	if (MAJOR(unit) == MD_MAJOR) +		new->md_minor = MINOR(unit); +	else +		new->md_minor = MINOR(unit) >> MdpMinorShift; + +	init_MUTEX(&new->reconfig_sem); +	INIT_LIST_HEAD(&new->disks); +	INIT_LIST_HEAD(&new->all_mddevs); +	init_timer(&new->safemode_timer); +	atomic_set(&new->active, 1); + +	new->queue = blk_alloc_queue(GFP_KERNEL); +	if (!new->queue) { +		kfree(new); +		return NULL; +	} + +	blk_queue_make_request(new->queue, md_fail_request); + +	goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ +	return down_interruptible(&mddev->reconfig_sem); +} + +static inline void mddev_lock_uninterruptible(mddev_t * mddev) +{ +	down(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ +	return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ +	up(&mddev->reconfig_sem); + +	if (mddev->thread) +		md_wakeup_thread(mddev->thread); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ +	mdk_rdev_t * rdev; +	struct list_head *tmp; + +	ITERATE_RDEV(mddev,rdev,tmp) { +		if (rdev->desc_nr == nr) +			return rdev; +	} +	return NULL; +} + +static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +{ +	struct list_head *tmp; +	mdk_rdev_t *rdev; + +	ITERATE_RDEV(mddev,rdev,tmp) { +		if (rdev->bdev->bd_dev == dev) +			return rdev; +	} +	return NULL; +} + +inline static sector_t calc_dev_sboffset(struct block_device *bdev) +{ +	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; +	return MD_NEW_SIZE_BLOCKS(size); +} + +static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +{ +	sector_t size; + +	size = rdev->sb_offset; + +	if (chunk_size) +		size &= ~((sector_t)chunk_size/1024 - 1); +	return size; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ +	if (rdev->sb_page) +		MD_BUG(); + +	rdev->sb_page = alloc_page(GFP_KERNEL); +	if (!rdev->sb_page) { +		printk(KERN_ALERT "md: out of memory.\n"); +		return -EINVAL; +	} 
+ +	return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ +	if (rdev->sb_page) { +		page_cache_release(rdev->sb_page); +		rdev->sb_loaded = 0; +		rdev->sb_page = NULL; +		rdev->sb_offset = 0; +		rdev->size = 0; +	} +} + + +static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +{ +	if (bio->bi_size) +		return 1; + +	complete((struct completion*)bio->bi_private); +	return 0; +} + +static int sync_page_io(struct block_device *bdev, sector_t sector, int size, +		   struct page *page, int rw) +{ +	struct bio *bio = bio_alloc(GFP_KERNEL, 1); +	struct completion event; +	int ret; + +	rw |= (1 << BIO_RW_SYNC); + +	bio->bi_bdev = bdev; +	bio->bi_sector = sector; +	bio_add_page(bio, page, size, 0); +	init_completion(&event); +	bio->bi_private = &event; +	bio->bi_end_io = bi_complete; +	submit_bio(rw, bio); +	wait_for_completion(&event); + +	ret = test_bit(BIO_UPTODATE, &bio->bi_flags); +	bio_put(bio); +	return ret; +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ +	char b[BDEVNAME_SIZE]; +	if (!rdev->sb_page) { +		MD_BUG(); +		return -EINVAL; +	} +	if (rdev->sb_loaded) +		return 0; + + +	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) +		goto fail; +	rdev->sb_loaded = 1; +	return 0; + +fail: +	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", +		bdevname(rdev->bdev,b)); +	return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ +	if (	(sb1->set_uuid0 == sb2->set_uuid0) && +		(sb1->set_uuid1 == sb2->set_uuid1) && +		(sb1->set_uuid2 == sb2->set_uuid2) && +		(sb1->set_uuid3 == sb2->set_uuid3)) + +		return 1; + +	return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ +	int ret; +	mdp_super_t *tmp1, *tmp2; + +	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); +	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + +	if (!tmp1 || !tmp2) { +		ret = 0; +		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); +		goto abort; +	} + +	*tmp1 = *sb1; +	*tmp2 = *sb2; + +	/* +	 * nr_disks is not constant +	 */ +	tmp1->nr_disks = 0; +	tmp2->nr_disks = 0; + +	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) +		ret = 0; +	else +		ret = 1; + +abort: +	if (tmp1) +		kfree(tmp1); +	if (tmp2) +		kfree(tmp2); + +	return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ +	unsigned int disk_csum, csum; + +	disk_csum = sb->sb_csum; +	sb->sb_csum = 0; +	csum = csum_partial((void *)sb, MD_SB_BYTES, 0); +	sb->sb_csum = disk_csum; +	return csum; +} + + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + *      loads and validates a superblock on dev. + *      if refdev != NULL, compare superblocks on both devices + *    Return: + *      0 - dev has a superblock that is compatible with refdev + *      1 - dev has a superblock that is compatible and newer than refdev + *          so dev should be used as the refdev in future + *     -EINVAL superblock incompatible or invalid + *     -othererror e.g. -EIO + * + *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + *      Verify that dev is acceptable into mddev. + *       The first time, mddev->raid_disks will be 0, and data from + *       dev should be merged in.  
Subsequent calls check that dev + *       is new enough.  Return 0 or -EINVAL + * + *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + *     Update the superblock for rdev with data in mddev + *     This does not write to disc. + * + */ + +struct super_type  { +	char 		*name; +	struct module	*owner; +	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); +	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); +	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0  + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ +	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; +	mdp_super_t *sb; +	int ret; +	sector_t sb_offset; + +	/* +	 * Calculate the position of the superblock, +	 * it's at the end of the disk. +	 * +	 * It also happens to be a multiple of 4Kb. +	 */ +	sb_offset = calc_dev_sboffset(rdev->bdev); +	rdev->sb_offset = sb_offset; + +	ret = read_disk_sb(rdev); +	if (ret) return ret; + +	ret = -EINVAL; + +	bdevname(rdev->bdev, b); +	sb = (mdp_super_t*)page_address(rdev->sb_page); + +	if (sb->md_magic != MD_SB_MAGIC) { +		printk(KERN_ERR "md: invalid raid superblock magic on %s\n", +		       b); +		goto abort; +	} + +	if (sb->major_version != 0 || +	    sb->minor_version != 90) { +		printk(KERN_WARNING "Bad version number %d.%d on %s\n", +			sb->major_version, sb->minor_version, +			b); +		goto abort; +	} + +	if (sb->raid_disks <= 0) +		goto abort; + +	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { +		printk(KERN_WARNING "md: invalid superblock checksum on %s\n", +			b); +		goto abort; +	} + +	rdev->preferred_minor = sb->md_minor; +	rdev->data_offset = 0; + +	if (sb->level == LEVEL_MULTIPATH) +		rdev->desc_nr = -1; +	else +		rdev->desc_nr = sb->this_disk.number; + +	if (refdev == 0) +		ret = 1; +	else { +		__u64 ev1, ev2; +		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); +		if (!uuid_equal(refsb, sb)) { +			printk(KERN_WARNING "md: %s has different UUID to %s\n", +				b, bdevname(refdev->bdev,b2)); +			goto abort; +		} +		if (!sb_equal(refsb, sb)) { +			printk(KERN_WARNING "md: %s has same UUID" +			       " but different superblock to %s\n", +			       b, bdevname(refdev->bdev, b2)); +			goto abort; +		} +		ev1 = md_event(sb); +		ev2 = md_event(refsb); +		if (ev1 > ev2) +			ret = 1; +		else  +			ret = 0; +	} +	rdev->size = calc_dev_size(rdev, sb->chunk_size); + + abort: +	return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	mdp_disk_t *desc; +	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + +	if (mddev->raid_disks == 0) { +		mddev->major_version = 0; +		mddev->minor_version = sb->minor_version; +		mddev->patch_version = sb->patch_version; +		mddev->persistent = ! 
sb->not_persistent; +		mddev->chunk_size = sb->chunk_size; +		mddev->ctime = sb->ctime; +		mddev->utime = sb->utime; +		mddev->level = sb->level; +		mddev->layout = sb->layout; +		mddev->raid_disks = sb->raid_disks; +		mddev->size = sb->size; +		mddev->events = md_event(sb); + +		if (sb->state & (1<<MD_SB_CLEAN)) +			mddev->recovery_cp = MaxSector; +		else { +			if (sb->events_hi == sb->cp_events_hi &&  +				sb->events_lo == sb->cp_events_lo) { +				mddev->recovery_cp = sb->recovery_cp; +			} else +				mddev->recovery_cp = 0; +		} + +		memcpy(mddev->uuid+0, &sb->set_uuid0, 4); +		memcpy(mddev->uuid+4, &sb->set_uuid1, 4); +		memcpy(mddev->uuid+8, &sb->set_uuid2, 4); +		memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + +		mddev->max_disks = MD_SB_DISKS; +	} else { +		__u64 ev1; +		ev1 = md_event(sb); +		++ev1; +		if (ev1 < mddev->events)  +			return -EINVAL; +	} +	if (mddev->level != LEVEL_MULTIPATH) { +		rdev->raid_disk = -1; +		rdev->in_sync = rdev->faulty = 0; +		desc = sb->disks + rdev->desc_nr; + +		if (desc->state & (1<<MD_DISK_FAULTY)) +			rdev->faulty = 1; +		else if (desc->state & (1<<MD_DISK_SYNC) && +			 desc->raid_disk < mddev->raid_disks) { +			rdev->in_sync = 1; +			rdev->raid_disk = desc->raid_disk; +		} +	} +	return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	mdp_super_t *sb; +	struct list_head *tmp; +	mdk_rdev_t *rdev2; +	int next_spare = mddev->raid_disks; + +	/* make rdev->sb match mddev data.. +	 * +	 * 1/ zero out disks +	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); +	 * 3/ any empty disks < next_spare become removed +	 * +	 * disks[0] gets initialised to REMOVED because +	 * we cannot be sure from other fields if it has +	 * been initialised or not. 
+	 */ +	int i; +	int active=0, working=0,failed=0,spare=0,nr_disks=0; + +	sb = (mdp_super_t*)page_address(rdev->sb_page); + +	memset(sb, 0, sizeof(*sb)); + +	sb->md_magic = MD_SB_MAGIC; +	sb->major_version = mddev->major_version; +	sb->minor_version = mddev->minor_version; +	sb->patch_version = mddev->patch_version; +	sb->gvalid_words  = 0; /* ignored */ +	memcpy(&sb->set_uuid0, mddev->uuid+0, 4); +	memcpy(&sb->set_uuid1, mddev->uuid+4, 4); +	memcpy(&sb->set_uuid2, mddev->uuid+8, 4); +	memcpy(&sb->set_uuid3, mddev->uuid+12,4); + +	sb->ctime = mddev->ctime; +	sb->level = mddev->level; +	sb->size  = mddev->size; +	sb->raid_disks = mddev->raid_disks; +	sb->md_minor = mddev->md_minor; +	sb->not_persistent = !mddev->persistent; +	sb->utime = mddev->utime; +	sb->state = 0; +	sb->events_hi = (mddev->events>>32); +	sb->events_lo = (u32)mddev->events; + +	if (mddev->in_sync) +	{ +		sb->recovery_cp = mddev->recovery_cp; +		sb->cp_events_hi = (mddev->events>>32); +		sb->cp_events_lo = (u32)mddev->events; +		if (mddev->recovery_cp == MaxSector) +			sb->state = (1<< MD_SB_CLEAN); +	} else +		sb->recovery_cp = 0; + +	sb->layout = mddev->layout; +	sb->chunk_size = mddev->chunk_size; + +	sb->disks[0].state = (1<<MD_DISK_REMOVED); +	ITERATE_RDEV(mddev,rdev2,tmp) { +		mdp_disk_t *d; +		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) +			rdev2->desc_nr = rdev2->raid_disk; +		else +			rdev2->desc_nr = next_spare++; +		d = &sb->disks[rdev2->desc_nr]; +		nr_disks++; +		d->number = rdev2->desc_nr; +		d->major = MAJOR(rdev2->bdev->bd_dev); +		d->minor = MINOR(rdev2->bdev->bd_dev); +		if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) +			d->raid_disk = rdev2->raid_disk; +		else +			d->raid_disk = rdev2->desc_nr; /* compatibility */ +		if (rdev2->faulty) { +			d->state = (1<<MD_DISK_FAULTY); +			failed++; +		} else if (rdev2->in_sync) { +			d->state = (1<<MD_DISK_ACTIVE); +			d->state |= (1<<MD_DISK_SYNC); +			active++; +			working++; +		} else { +			d->state = 0; +			spare++; +			working++; +		} +	} +	 +	/* now set the "removed" and "faulty" bits on any missing devices */ +	for (i=0 ; i < mddev->raid_disks ; i++) { +		mdp_disk_t *d = &sb->disks[i]; +		if (d->state == 0 && d->number == 0) { +			d->number = i; +			d->raid_disk = i; +			d->state = (1<<MD_DISK_REMOVED); +			d->state |= (1<<MD_DISK_FAULTY); +			failed++; +		} +	} +	sb->nr_disks = nr_disks; +	sb->active_disks = active; +	sb->working_disks = working; +	sb->failed_disks = failed; +	sb->spare_disks = spare; + +	sb->this_disk = sb->disks[rdev->desc_nr]; +	sb->sb_csum = calc_sb_csum(sb); +} + +/* + * version 1 superblock + */ + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ +	unsigned int disk_csum, csum; +	unsigned long long newcsum; +	int size = 256 + le32_to_cpu(sb->max_dev)*2; +	unsigned int *isuper = (unsigned int*)sb; +	int i; + +	disk_csum = sb->sb_csum; +	sb->sb_csum = 0; +	newcsum = 0; +	for (i=0; size>=4; size -= 4 ) +		newcsum += le32_to_cpu(*isuper++); + +	if (size == 2) +		newcsum += le16_to_cpu(*(unsigned short*) isuper); + +	csum = (newcsum & 0xffffffff) + (newcsum >> 32); +	sb->sb_csum = disk_csum; +	return cpu_to_le32(csum); +} + +static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ +	struct mdp_superblock_1 *sb; +	int ret; +	sector_t sb_offset; +	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + +	/* +	 * Calculate the position of the superblock. 
+	 * It is always aligned to a 4K boundary and
+	 * depending on minor_version, it can be:
+	 * 0: At least 8K, but less than 12K, from end of device
+	 * 1: At start of device
+	 * 2: 4K from start of device.
+	 */
+	switch(minor_version) {
+	case 0:
+		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+		sb_offset -= 8*2;
+		sb_offset &= ~(4*2-1);
+		/* convert from sectors to K */
+		sb_offset /= 2;
+		break;
+	case 1:
+		sb_offset = 0;
+		break;
+	case 2:
+		sb_offset = 4;
+		break;
+	default:
+		return -EINVAL;
+	}
+	rdev->sb_offset = sb_offset;
+
+	ret = read_disk_sb(rdev);
+	if (ret) return ret;
+
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+	    sb->major_version != cpu_to_le32(1) ||
+	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    sb->feature_map != 0)
+		return -EINVAL;
+
+	if (calc_sb_1_csum(sb) != sb->sb_csum) {
+		printk("md: invalid superblock checksum on %s\n",
+			bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	if (le64_to_cpu(sb->data_size) < 10) {
+		printk("md: data_size too small on %s\n",
+		       bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	rdev->preferred_minor = 0xffff;
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+	if (refdev == 0)
+		return 1;
+	else {
+		__u64 ev1, ev2;
+		struct mdp_superblock_1 *refsb =
+			(struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+		    sb->level != refsb->level ||
+		    sb->layout != refsb->layout ||
+		    sb->chunksize != refsb->chunksize) {
+			printk(KERN_WARNING "md: %s has strangely different"
+				" superblock to %s\n",
+				bdevname(rdev->bdev,b),
+				bdevname(refdev->bdev,b2));
+			return -EINVAL;
+		}
+		ev1 = le64_to_cpu(sb->events);
+		ev2 = le64_to_cpu(refsb->events);
+
+		if (ev1 > ev2)
+			return 1;
+	}
+	if (minor_version)
+		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+	else
+		rdev->size = rdev->sb_offset;
+	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		return -EINVAL;
+	rdev->size = le64_to_cpu(sb->data_size)/2;
+	if (le32_to_cpu(sb->chunksize))
+		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+	return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 1;
+		mddev->patch_version = 0;
+		mddev->persistent = 1;
+		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->events = le64_to_cpu(sb->events);
+
+		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+		memcpy(mddev->uuid, sb->set_uuid, 16);
+
+		mddev->max_disks =  (4096-256)/2;
+	} else {
+		__u64 ev1;
+		ev1 = le64_to_cpu(sb->events);
+		++ev1;
+		if (ev1 < mddev->events)
+			return -EINVAL;
+	}
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		int role;
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		switch(role) {
+		case 0xffff: /* spare */
+			rdev->in_sync = 0;
+			rdev->faulty = 0;
+			rdev->raid_disk = -1;
+			break;
+		case 0xfffe: /* faulty */
+			
rdev->in_sync = 0; +			rdev->faulty = 1; +			rdev->raid_disk = -1; +			break; +		default: +			rdev->in_sync = 1; +			rdev->faulty = 0; +			rdev->raid_disk = role; +			break; +		} +	} +	return 0; +} + +static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	struct mdp_superblock_1 *sb; +	struct list_head *tmp; +	mdk_rdev_t *rdev2; +	int max_dev, i; +	/* make rdev->sb match mddev and rdev data. */ + +	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + +	sb->feature_map = 0; +	sb->pad0 = 0; +	memset(sb->pad1, 0, sizeof(sb->pad1)); +	memset(sb->pad2, 0, sizeof(sb->pad2)); +	memset(sb->pad3, 0, sizeof(sb->pad3)); + +	sb->utime = cpu_to_le64((__u64)mddev->utime); +	sb->events = cpu_to_le64(mddev->events); +	if (mddev->in_sync) +		sb->resync_offset = cpu_to_le64(mddev->recovery_cp); +	else +		sb->resync_offset = cpu_to_le64(0); + +	max_dev = 0; +	ITERATE_RDEV(mddev,rdev2,tmp) +		if (rdev2->desc_nr+1 > max_dev) +			max_dev = rdev2->desc_nr+1; +	 +	sb->max_dev = cpu_to_le32(max_dev); +	for (i=0; i<max_dev;i++) +		sb->dev_roles[i] = cpu_to_le16(0xfffe); +	 +	ITERATE_RDEV(mddev,rdev2,tmp) { +		i = rdev2->desc_nr; +		if (rdev2->faulty) +			sb->dev_roles[i] = cpu_to_le16(0xfffe); +		else if (rdev2->in_sync) +			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); +		else +			sb->dev_roles[i] = cpu_to_le16(0xffff); +	} + +	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ +	sb->sb_csum = calc_sb_1_csum(sb); +} + + +struct super_type super_types[] = { +	[0] = { +		.name	= "0.90.0", +		.owner	= THIS_MODULE, +		.load_super	= super_90_load, +		.validate_super	= super_90_validate, +		.sync_super	= super_90_sync, +	}, +	[1] = { +		.name	= "md-1", +		.owner	= THIS_MODULE, +		.load_super	= super_1_load, +		.validate_super	= super_1_validate, +		.sync_super	= super_1_sync, +	}, +}; +	 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) +{ +	struct list_head *tmp; +	mdk_rdev_t *rdev; + +	ITERATE_RDEV(mddev,rdev,tmp) +		if (rdev->bdev->bd_contains == dev->bdev->bd_contains) +			return rdev; + +	return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ +	struct list_head *tmp; +	mdk_rdev_t *rdev; + +	ITERATE_RDEV(mddev1,rdev,tmp) +		if (match_dev_unit(mddev2, rdev)) +			return 1; + +	return 0; +} + +static LIST_HEAD(pending_raid_disks); + +static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ +	mdk_rdev_t *same_pdev; +	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + +	if (rdev->mddev) { +		MD_BUG(); +		return -EINVAL; +	} +	same_pdev = match_dev_unit(mddev, rdev); +	if (same_pdev) +		printk(KERN_WARNING +			"%s: WARNING: %s appears to be on the same physical" +	 		" disk as %s. True\n     protection against single-disk" +			" failure might be compromised.\n", +			mdname(mddev), bdevname(rdev->bdev,b), +			bdevname(same_pdev->bdev,b2)); + +	/* Verify rdev->desc_nr is unique. 
+	 * If it is -1, assign a free number, else +	 * check number is not in use +	 */ +	if (rdev->desc_nr < 0) { +		int choice = 0; +		if (mddev->pers) choice = mddev->raid_disks; +		while (find_rdev_nr(mddev, choice)) +			choice++; +		rdev->desc_nr = choice; +	} else { +		if (find_rdev_nr(mddev, rdev->desc_nr)) +			return -EBUSY; +	} +			 +	list_add(&rdev->same_set, &mddev->disks); +	rdev->mddev = mddev; +	printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); +	return 0; +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ +	char b[BDEVNAME_SIZE]; +	if (!rdev->mddev) { +		MD_BUG(); +		return; +	} +	list_del_init(&rdev->same_set); +	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); +	rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by bd_claiming the device. + */ +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +{ +	int err = 0; +	struct block_device *bdev; +	char b[BDEVNAME_SIZE]; + +	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); +	if (IS_ERR(bdev)) { +		printk(KERN_ERR "md: could not open %s.\n", +			__bdevname(dev, b)); +		return PTR_ERR(bdev); +	} +	err = bd_claim(bdev, rdev); +	if (err) { +		printk(KERN_ERR "md: could not bd_claim %s.\n", +			bdevname(bdev, b)); +		blkdev_put(bdev); +		return err; +	} +	rdev->bdev = bdev; +	return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ +	struct block_device *bdev = rdev->bdev; +	rdev->bdev = NULL; +	if (!bdev) +		MD_BUG(); +	bd_release(bdev); +	blkdev_put(bdev); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ +	char b[BDEVNAME_SIZE]; +	printk(KERN_INFO "md: export_rdev(%s)\n", +		bdevname(rdev->bdev,b)); +	if (rdev->mddev) +		MD_BUG(); +	free_disk_sb(rdev); +	list_del_init(&rdev->same_set); +#ifndef MODULE +	md_autodetect_dev(rdev->bdev->bd_dev); +#endif +	unlock_rdev(rdev); +	kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ +	unbind_rdev_from_array(rdev); +	export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ +	struct list_head *tmp; +	mdk_rdev_t *rdev; + +	ITERATE_RDEV(mddev,rdev,tmp) { +		if (!rdev->mddev) { +			MD_BUG(); +			continue; +		} +		kick_rdev_from_array(rdev); +	} +	if (!list_empty(&mddev->disks)) +		MD_BUG(); +	mddev->raid_disks = 0; +	mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ +	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, +		desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ +	int i; + +	printk(KERN_INFO  +		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", +		sb->major_version, sb->minor_version, sb->patch_version, +		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, +		sb->ctime); +	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", +		sb->level, sb->size, sb->nr_disks, sb->raid_disks, +		sb->md_minor, sb->layout, sb->chunk_size); +	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d" +		" FD:%d SD:%d CSUM:%08x E:%08lx\n", +		sb->utime, sb->state, sb->active_disks, sb->working_disks, +		sb->failed_disks, sb->spare_disks, +		sb->sb_csum, (unsigned long)sb->events_lo); + +	printk(KERN_INFO); +	for (i = 0; i < MD_SB_DISKS; i++) { +		mdp_disk_t *desc; + +		desc = sb->disks + i; +		if (desc->number || desc->major || desc->minor || +		    desc->raid_disk || (desc->state && (desc->state != 4))) { +			printk("     D %2d: ", i); +			print_desc(desc); +		} +	} +	
printk(KERN_INFO "md:     THIS: "); +	print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ +	char b[BDEVNAME_SIZE]; +	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", +		bdevname(rdev->bdev,b), (unsigned long long)rdev->size, +	       	rdev->faulty, rdev->in_sync, rdev->desc_nr); +	if (rdev->sb_loaded) { +		printk(KERN_INFO "md: rdev superblock:\n"); +		print_sb((mdp_super_t*)page_address(rdev->sb_page)); +	} else +		printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ +	struct list_head *tmp, *tmp2; +	mdk_rdev_t *rdev; +	mddev_t *mddev; +	char b[BDEVNAME_SIZE]; + +	printk("\n"); +	printk("md:	**********************************\n"); +	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n"); +	printk("md:	**********************************\n"); +	ITERATE_MDDEV(mddev,tmp) { +		printk("%s: ", mdname(mddev)); + +		ITERATE_RDEV(mddev,rdev,tmp2) +			printk("<%s>", bdevname(rdev->bdev,b)); +		printk("\n"); + +		ITERATE_RDEV(mddev,rdev,tmp2) +			print_rdev(rdev); +	} +	printk("md:	**********************************\n"); +	printk("\n"); +} + + +static int write_disk_sb(mdk_rdev_t * rdev) +{ +	char b[BDEVNAME_SIZE]; +	if (!rdev->sb_loaded) { +		MD_BUG(); +		return 1; +	} +	if (rdev->faulty) { +		MD_BUG(); +		return 1; +	} + +	dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", +		bdevname(rdev->bdev,b), +	       (unsigned long long)rdev->sb_offset); +   +	if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) +		return 0; + +	printk("md: write_disk_sb failed for device %s\n",  +		bdevname(rdev->bdev,b)); +	return 1; +} + +static void sync_sbs(mddev_t * mddev) +{ +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	ITERATE_RDEV(mddev,rdev,tmp) { +		super_types[mddev->major_version]. +			sync_super(mddev, rdev); +		rdev->sb_loaded = 1; +	} +} + +static void md_update_sb(mddev_t * mddev) +{ +	int err, count = 100; +	struct list_head *tmp; +	mdk_rdev_t *rdev; + +	mddev->sb_dirty = 0; +repeat: +	mddev->utime = get_seconds(); +	mddev->events ++; + +	if (!mddev->events) { +		/* +		 * oops, this 64-bit counter should never wrap. +		 * Either we are in around ~1 trillion A.C., assuming +		 * 1 reboot per second, or we have a bug: +		 */ +		MD_BUG(); +		mddev->events --; +	} +	sync_sbs(mddev); + +	/* +	 * do not write anything to disk if using +	 * nonpersistent superblocks +	 */ +	if (!mddev->persistent) +		return; + +	dprintk(KERN_INFO  +		"md: updating %s RAID superblock on device (in sync %d)\n", +		mdname(mddev),mddev->in_sync); + +	err = 0; +	ITERATE_RDEV(mddev,rdev,tmp) { +		char b[BDEVNAME_SIZE]; +		dprintk(KERN_INFO "md: "); +		if (rdev->faulty) +			dprintk("(skipping faulty "); + +		dprintk("%s ", bdevname(rdev->bdev,b)); +		if (!rdev->faulty) { +			err += write_disk_sb(rdev); +		} else +			dprintk(")\n"); +		if (!err && mddev->level == LEVEL_MULTIPATH) +			/* only need to write one superblock... */ +			break; +	} +	if (err) { +		if (--count) { +			printk(KERN_ERR "md: errors occurred during superblock" +				" update, repeating\n"); +			goto repeat; +		} +		printk(KERN_ERR \ +			"md: excessive errors occurred during superblock update, exiting\n"); +	} +} + +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + *   - the device is nonexistent (zero size) + *   - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
+ */ +static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +{ +	char b[BDEVNAME_SIZE]; +	int err; +	mdk_rdev_t *rdev; +	sector_t size; + +	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); +	if (!rdev) { +		printk(KERN_ERR "md: could not alloc mem for new device!\n"); +		return ERR_PTR(-ENOMEM); +	} +	memset(rdev, 0, sizeof(*rdev)); + +	if ((err = alloc_disk_sb(rdev))) +		goto abort_free; + +	err = lock_rdev(rdev, newdev); +	if (err) +		goto abort_free; + +	rdev->desc_nr = -1; +	rdev->faulty = 0; +	rdev->in_sync = 0; +	rdev->data_offset = 0; +	atomic_set(&rdev->nr_pending, 0); + +	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; +	if (!size) { +		printk(KERN_WARNING  +			"md: %s has zero or unknown size, marking faulty!\n", +			bdevname(rdev->bdev,b)); +		err = -EINVAL; +		goto abort_free; +	} + +	if (super_format >= 0) { +		err = super_types[super_format]. +			load_super(rdev, NULL, super_minor); +		if (err == -EINVAL) { +			printk(KERN_WARNING  +				"md: %s has invalid sb, not importing!\n", +				bdevname(rdev->bdev,b)); +			goto abort_free; +		} +		if (err < 0) { +			printk(KERN_WARNING  +				"md: could not read %s's sb, not importing!\n", +				bdevname(rdev->bdev,b)); +			goto abort_free; +		} +	} +	INIT_LIST_HEAD(&rdev->same_set); + +	return rdev; + +abort_free: +	if (rdev->sb_page) { +		if (rdev->bdev) +			unlock_rdev(rdev); +		free_disk_sb(rdev); +	} +	kfree(rdev); +	return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static int analyze_sbs(mddev_t * mddev) +{ +	int i; +	struct list_head *tmp; +	mdk_rdev_t *rdev, *freshest; +	char b[BDEVNAME_SIZE]; + +	freshest = NULL; +	ITERATE_RDEV(mddev,rdev,tmp) +		switch (super_types[mddev->major_version]. +			load_super(rdev, freshest, mddev->minor_version)) { +		case 1: +			freshest = rdev; +			break; +		case 0: +			break; +		default: +			printk( KERN_ERR \ +				"md: fatal superblock inconsistency in %s" +				" -- removing from array\n",  +				bdevname(rdev->bdev,b)); +			kick_rdev_from_array(rdev); +		} + + +	super_types[mddev->major_version]. +		validate_super(mddev, freshest); + +	i = 0; +	ITERATE_RDEV(mddev,rdev,tmp) { +		if (rdev != freshest) +			if (super_types[mddev->major_version]. +			    validate_super(mddev, rdev)) { +				printk(KERN_WARNING "md: kicking non-fresh %s" +					" from array!\n", +					bdevname(rdev->bdev,b)); +				kick_rdev_from_array(rdev); +				continue; +			} +		if (mddev->level == LEVEL_MULTIPATH) { +			rdev->desc_nr = i++; +			rdev->raid_disk = rdev->desc_nr; +			rdev->in_sync = 1; +		} +	} + + + +	if (mddev->recovery_cp != MaxSector && +	    mddev->level >= 1) +		printk(KERN_ERR "md: %s: raid array is not clean" +		       " -- starting background reconstruction\n", +		       mdname(mddev)); + +	return 0; +} + +int mdp_major = 0; + +static struct kobject *md_probe(dev_t dev, int *part, void *data) +{ +	static DECLARE_MUTEX(disks_sem); +	mddev_t *mddev = mddev_find(dev); +	struct gendisk *disk; +	int partitioned = (MAJOR(dev) != MD_MAJOR); +	int shift = partitioned ? 
MdpMinorShift : 0; +	int unit = MINOR(dev) >> shift; + +	if (!mddev) +		return NULL; + +	down(&disks_sem); +	if (mddev->gendisk) { +		up(&disks_sem); +		mddev_put(mddev); +		return NULL; +	} +	disk = alloc_disk(1 << shift); +	if (!disk) { +		up(&disks_sem); +		mddev_put(mddev); +		return NULL; +	} +	disk->major = MAJOR(dev); +	disk->first_minor = unit << shift; +	if (partitioned) { +		sprintf(disk->disk_name, "md_d%d", unit); +		sprintf(disk->devfs_name, "md/d%d", unit); +	} else { +		sprintf(disk->disk_name, "md%d", unit); +		sprintf(disk->devfs_name, "md/%d", unit); +	} +	disk->fops = &md_fops; +	disk->private_data = mddev; +	disk->queue = mddev->queue; +	add_disk(disk); +	mddev->gendisk = disk; +	up(&disks_sem); +	return NULL; +} + +void md_wakeup_thread(mdk_thread_t *thread); + +static void md_safemode_timeout(unsigned long data) +{ +	mddev_t *mddev = (mddev_t *) data; + +	mddev->safemode = 1; +	md_wakeup_thread(mddev->thread); +} + + +static int do_md_run(mddev_t * mddev) +{ +	int pnum, err; +	int chunk_size; +	struct list_head *tmp; +	mdk_rdev_t *rdev; +	struct gendisk *disk; +	char b[BDEVNAME_SIZE]; + +	if (list_empty(&mddev->disks)) { +		MD_BUG(); +		return -EINVAL; +	} + +	if (mddev->pers) +		return -EBUSY; + +	/* +	 * Analyze all RAID superblock(s) +	 */ +	if (!mddev->raid_disks && analyze_sbs(mddev)) { +		MD_BUG(); +		return -EINVAL; +	} + +	chunk_size = mddev->chunk_size; +	pnum = level_to_pers(mddev->level); + +	if ((pnum != MULTIPATH) && (pnum != RAID1)) { +		if (!chunk_size) { +			/* +			 * 'default chunksize' in the old md code used to +			 * be PAGE_SIZE, baaad. +			 * we abort here to be on the safe side. We don't +			 * want to continue the bad practice. +			 */ +			printk(KERN_ERR  +				"no chunksize specified, see 'man raidtab'\n"); +			return -EINVAL; +		} +		if (chunk_size > MAX_CHUNK_SIZE) { +			printk(KERN_ERR "too big chunk_size: %d > %d\n", +				chunk_size, MAX_CHUNK_SIZE); +			return -EINVAL; +		} +		/* +		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE +		 */ +		if ( (1 << ffz(~chunk_size)) != chunk_size) { +			MD_BUG(); +			return -EINVAL; +		} +		if (chunk_size < PAGE_SIZE) { +			printk(KERN_ERR "too small chunk_size: %d < %ld\n", +				chunk_size, PAGE_SIZE); +			return -EINVAL; +		} + +		/* devices must have minimum size of one chunk */ +		ITERATE_RDEV(mddev,rdev,tmp) { +			if (rdev->faulty) +				continue; +			if (rdev->size < chunk_size / 1024) { +				printk(KERN_WARNING +					"md: Dev %s smaller than chunk_size:" +					" %lluk < %dk\n", +					bdevname(rdev->bdev,b), +					(unsigned long long)rdev->size, +					chunk_size / 1024); +				return -EINVAL; +			} +		} +	} + +	if (pnum >= MAX_PERSONALITY) { +		MD_BUG(); +		return -EINVAL; +	} + +#ifdef CONFIG_KMOD +	if (!pers[pnum]) +	{ +		request_module("md-personality-%d", pnum); +	} +#endif + +	/* +	 * Drop all container device buffers, from now on +	 * the only valid external interface is through the md +	 * device. 
+	 * Also find largest hardsector size +	 */ +	ITERATE_RDEV(mddev,rdev,tmp) { +		if (rdev->faulty) +			continue; +		sync_blockdev(rdev->bdev); +		invalidate_bdev(rdev->bdev, 0); +	} + +	md_probe(mddev->unit, NULL, NULL); +	disk = mddev->gendisk; +	if (!disk) +		return -ENOMEM; + +	spin_lock(&pers_lock); +	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { +		spin_unlock(&pers_lock); +		printk(KERN_WARNING "md: personality %d is not loaded!\n", +		       pnum); +		return -EINVAL; +	} + +	mddev->pers = pers[pnum]; +	spin_unlock(&pers_lock); + +	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + +	err = mddev->pers->run(mddev); +	if (err) { +		printk(KERN_ERR "md: pers->run() failed ...\n"); +		module_put(mddev->pers->owner); +		mddev->pers = NULL; +		return -EINVAL; +	} + 	atomic_set(&mddev->writes_pending,0); +	mddev->safemode = 0; +	mddev->safemode_timer.function = md_safemode_timeout; +	mddev->safemode_timer.data = (unsigned long) mddev; +	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ +	mddev->in_sync = 1; +	 +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	 +	if (mddev->sb_dirty) +		md_update_sb(mddev); + +	set_capacity(disk, mddev->array_size<<1); + +	/* If we call blk_queue_make_request here, it will +	 * re-initialise max_sectors etc which may have been +	 * refined inside -> run.  So just set the bits we need to set. +	 * Most initialisation happended when we called +	 * blk_queue_make_request(..., md_fail_request) +	 * earlier. +	 */ +	mddev->queue->queuedata = mddev; +	mddev->queue->make_request_fn = mddev->pers->make_request; + +	mddev->changed = 1; +	return 0; +} + +static int restart_array(mddev_t *mddev) +{ +	struct gendisk *disk = mddev->gendisk; +	int err; + +	/* +	 * Complain if it has no devices +	 */ +	err = -ENXIO; +	if (list_empty(&mddev->disks)) +		goto out; + +	if (mddev->pers) { +		err = -EBUSY; +		if (!mddev->ro) +			goto out; + +		mddev->safemode = 0; +		mddev->ro = 0; +		set_disk_ro(disk, 0); + +		printk(KERN_INFO "md: %s switched to read-write mode.\n", +			mdname(mddev)); +		/* +		 * Kick recovery or resync if necessary +		 */ +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +		md_wakeup_thread(mddev->thread); +		err = 0; +	} else { +		printk(KERN_ERR "md: %s has no personality assigned.\n", +			mdname(mddev)); +		err = -EINVAL; +	} + +out: +	return err; +} + +static int do_md_stop(mddev_t * mddev, int ro) +{ +	int err = 0; +	struct gendisk *disk = mddev->gendisk; + +	if (mddev->pers) { +		if (atomic_read(&mddev->active)>2) { +			printk("md: %s still in use.\n",mdname(mddev)); +			return -EBUSY; +		} + +		if (mddev->sync_thread) { +			set_bit(MD_RECOVERY_INTR, &mddev->recovery); +			md_unregister_thread(mddev->sync_thread); +			mddev->sync_thread = NULL; +		} + +		del_timer_sync(&mddev->safemode_timer); + +		invalidate_partition(disk, 0); + +		if (ro) { +			err  = -ENXIO; +			if (mddev->ro) +				goto out; +			mddev->ro = 1; +		} else { +			if (mddev->ro) +				set_disk_ro(disk, 0); +			blk_queue_make_request(mddev->queue, md_fail_request); +			mddev->pers->stop(mddev); +			module_put(mddev->pers->owner); +			mddev->pers = NULL; +			if (mddev->ro) +				mddev->ro = 0; +		} +		if (!mddev->in_sync) { +			/* mark array as shutdown cleanly */ +			mddev->in_sync = 1; +			md_update_sb(mddev); +		} +		if (ro) +			set_disk_ro(disk, 1); +	} +	/* +	 * Free resources if final stop +	 */ +	if (!ro) { +		struct gendisk *disk; +		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); + +		export_array(mddev); + +		
mddev->array_size = 0; +		disk = mddev->gendisk; +		if (disk) +			set_capacity(disk, 0); +		mddev->changed = 1; +	} else +		printk(KERN_INFO "md: %s switched to read-only mode.\n", +			mdname(mddev)); +	err = 0; +out: +	return err; +} + +static void autorun_array(mddev_t *mddev) +{ +	mdk_rdev_t *rdev; +	struct list_head *tmp; +	int err; + +	if (list_empty(&mddev->disks)) { +		MD_BUG(); +		return; +	} + +	printk(KERN_INFO "md: running: "); + +	ITERATE_RDEV(mddev,rdev,tmp) { +		char b[BDEVNAME_SIZE]; +		printk("<%s>", bdevname(rdev->bdev,b)); +	} +	printk("\n"); + +	err = do_md_run (mddev); +	if (err) { +		printk(KERN_WARNING "md: do_md_run() returned %d\n", err); +		do_md_stop (mddev, 0); +	} +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(int part) +{ +	struct list_head candidates; +	struct list_head *tmp; +	mdk_rdev_t *rdev0, *rdev; +	mddev_t *mddev; +	char b[BDEVNAME_SIZE]; + +	printk(KERN_INFO "md: autorun ...\n"); +	while (!list_empty(&pending_raid_disks)) { +		dev_t dev; +		rdev0 = list_entry(pending_raid_disks.next, +					 mdk_rdev_t, same_set); + +		printk(KERN_INFO "md: considering %s ...\n", +			bdevname(rdev0->bdev,b)); +		INIT_LIST_HEAD(&candidates); +		ITERATE_RDEV_PENDING(rdev,tmp) +			if (super_90_load(rdev, rdev0, 0) >= 0) { +				printk(KERN_INFO "md:  adding %s ...\n", +					bdevname(rdev->bdev,b)); +				list_move(&rdev->same_set, &candidates); +			} +		/* +		 * now we have a set of devices, with all of them having +		 * mostly sane superblocks. It's time to allocate the +		 * mddev. +		 */ +		if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { +			printk(KERN_INFO "md: unit number in %s is bad: %d\n", +			       bdevname(rdev0->bdev, b), rdev0->preferred_minor); +			break; +		} +		if (part) +			dev = MKDEV(mdp_major, +				    rdev0->preferred_minor << MdpMinorShift); +		else +			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + +		md_probe(dev, NULL, NULL); +		mddev = mddev_find(dev); +		if (!mddev) { +			printk(KERN_ERR  +				"md: cannot allocate memory for md drive.\n"); +			break; +		} +		if (mddev_lock(mddev))  +			printk(KERN_WARNING "md: %s locked, cannot run\n", +			       mdname(mddev)); +		else if (mddev->raid_disks || mddev->major_version +			 || !list_empty(&mddev->disks)) { +			printk(KERN_WARNING  +				"md: %s already running, cannot run %s\n", +				mdname(mddev), bdevname(rdev0->bdev,b)); +			mddev_unlock(mddev); +		} else { +			printk(KERN_INFO "md: created %s\n", mdname(mddev)); +			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { +				list_del_init(&rdev->same_set); +				if (bind_rdev_to_array(rdev, mddev)) +					export_rdev(rdev); +			} +			autorun_array(mddev); +			mddev_unlock(mddev); +		} +		/* on success, candidates will be empty, on error +		 * it won't... +		 */ +		ITERATE_RDEV_GENERIC(candidates,rdev,tmp) +			export_rdev(rdev); +		mddev_put(mddev); +	} +	printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
+ */ + +static int autostart_array(dev_t startdev) +{ +	char b[BDEVNAME_SIZE]; +	int err = -EINVAL, i; +	mdp_super_t *sb = NULL; +	mdk_rdev_t *start_rdev = NULL, *rdev; + +	start_rdev = md_import_device(startdev, 0, 0); +	if (IS_ERR(start_rdev)) +		return err; + + +	/* NOTE: this can only work for 0.90.0 superblocks */ +	sb = (mdp_super_t*)page_address(start_rdev->sb_page); +	if (sb->major_version != 0 || +	    sb->minor_version != 90 ) { +		printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); +		export_rdev(start_rdev); +		return err; +	} + +	if (start_rdev->faulty) { +		printk(KERN_WARNING  +			"md: can not autostart based on faulty %s!\n", +			bdevname(start_rdev->bdev,b)); +		export_rdev(start_rdev); +		return err; +	} +	list_add(&start_rdev->same_set, &pending_raid_disks); + +	for (i = 0; i < MD_SB_DISKS; i++) { +		mdp_disk_t *desc = sb->disks + i; +		dev_t dev = MKDEV(desc->major, desc->minor); + +		if (!dev) +			continue; +		if (dev == startdev) +			continue; +		if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) +			continue; +		rdev = md_import_device(dev, 0, 0); +		if (IS_ERR(rdev)) +			continue; + +		list_add(&rdev->same_set, &pending_raid_disks); +	} + +	/* +	 * possibly return codes +	 */ +	autorun_devices(0); +	return 0; + +} + + +static int get_version(void __user * arg) +{ +	mdu_version_t ver; + +	ver.major = MD_MAJOR_VERSION; +	ver.minor = MD_MINOR_VERSION; +	ver.patchlevel = MD_PATCHLEVEL_VERSION; + +	if (copy_to_user(arg, &ver, sizeof(ver))) +		return -EFAULT; + +	return 0; +} + +static int get_array_info(mddev_t * mddev, void __user * arg) +{ +	mdu_array_info_t info; +	int nr,working,active,failed,spare; +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	nr=working=active=failed=spare=0; +	ITERATE_RDEV(mddev,rdev,tmp) { +		nr++; +		if (rdev->faulty) +			failed++; +		else { +			working++; +			if (rdev->in_sync) +				active++;	 +			else +				spare++; +		} +	} + +	info.major_version = mddev->major_version; +	info.minor_version = mddev->minor_version; +	info.patch_version = MD_PATCHLEVEL_VERSION; +	info.ctime         = mddev->ctime; +	info.level         = mddev->level; +	info.size          = mddev->size; +	info.nr_disks      = nr; +	info.raid_disks    = mddev->raid_disks; +	info.md_minor      = mddev->md_minor; +	info.not_persistent= !mddev->persistent; + +	info.utime         = mddev->utime; +	info.state         = 0; +	if (mddev->in_sync) +		info.state = (1<<MD_SB_CLEAN); +	info.active_disks  = active; +	info.working_disks = working; +	info.failed_disks  = failed; +	info.spare_disks   = spare; + +	info.layout        = mddev->layout; +	info.chunk_size    = mddev->chunk_size; + +	if (copy_to_user(arg, &info, sizeof(info))) +		return -EFAULT; + +	return 0; +} + +static int get_disk_info(mddev_t * mddev, void __user * arg) +{ +	mdu_disk_info_t info; +	unsigned int nr; +	mdk_rdev_t *rdev; + +	if (copy_from_user(&info, arg, sizeof(info))) +		return -EFAULT; + +	nr = info.number; + +	rdev = find_rdev_nr(mddev, nr); +	if (rdev) { +		info.major = MAJOR(rdev->bdev->bd_dev); +		info.minor = MINOR(rdev->bdev->bd_dev); +		info.raid_disk = rdev->raid_disk; +		info.state = 0; +		if (rdev->faulty) +			info.state |= (1<<MD_DISK_FAULTY); +		else if (rdev->in_sync) { +			info.state |= (1<<MD_DISK_ACTIVE); +			info.state |= (1<<MD_DISK_SYNC); +		} +	} else { +		info.major = info.minor = 0; +		info.raid_disk = -1; +		info.state = (1<<MD_DISK_REMOVED); +	} + +	if (copy_to_user(arg, &info, sizeof(info))) +		return -EFAULT; + +	return 0; +} + +static int add_new_disk(mddev_t * 
mddev, mdu_disk_info_t *info) +{ +	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; +	mdk_rdev_t *rdev; +	dev_t dev = MKDEV(info->major,info->minor); + +	if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) +		return -EOVERFLOW; + +	if (!mddev->raid_disks) { +		int err; +		/* expecting a device which has a superblock */ +		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); +		if (IS_ERR(rdev)) { +			printk(KERN_WARNING  +				"md: md_import_device returned %ld\n", +				PTR_ERR(rdev)); +			return PTR_ERR(rdev); +		} +		if (!list_empty(&mddev->disks)) { +			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, +							mdk_rdev_t, same_set); +			int err = super_types[mddev->major_version] +				.load_super(rdev, rdev0, mddev->minor_version); +			if (err < 0) { +				printk(KERN_WARNING  +					"md: %s has different UUID to %s\n", +					bdevname(rdev->bdev,b),  +					bdevname(rdev0->bdev,b2)); +				export_rdev(rdev); +				return -EINVAL; +			} +		} +		err = bind_rdev_to_array(rdev, mddev); +		if (err) +			export_rdev(rdev); +		return err; +	} + +	/* +	 * add_new_disk can be used once the array is assembled +	 * to add "hot spares".  They must already have a superblock +	 * written +	 */ +	if (mddev->pers) { +		int err; +		if (!mddev->pers->hot_add_disk) { +			printk(KERN_WARNING  +				"%s: personality does not support diskops!\n", +			       mdname(mddev)); +			return -EINVAL; +		} +		rdev = md_import_device(dev, mddev->major_version, +					mddev->minor_version); +		if (IS_ERR(rdev)) { +			printk(KERN_WARNING  +				"md: md_import_device returned %ld\n", +				PTR_ERR(rdev)); +			return PTR_ERR(rdev); +		} +		rdev->in_sync = 0; /* just to be sure */ +		rdev->raid_disk = -1; +		err = bind_rdev_to_array(rdev, mddev); +		if (err) +			export_rdev(rdev); +		if (mddev->thread) +			md_wakeup_thread(mddev->thread); +		return err; +	} + +	/* otherwise, add_new_disk is only allowed +	 * for major_version==0 superblocks +	 */ +	if (mddev->major_version != 0) { +		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", +		       mdname(mddev)); +		return -EINVAL; +	} + +	if (!(info->state & (1<<MD_DISK_FAULTY))) { +		int err; +		rdev = md_import_device (dev, -1, 0); +		if (IS_ERR(rdev)) { +			printk(KERN_WARNING  +				"md: error, md_import_device() returned %ld\n", +				PTR_ERR(rdev)); +			return PTR_ERR(rdev); +		} +		rdev->desc_nr = info->number; +		if (info->raid_disk < mddev->raid_disks) +			rdev->raid_disk = info->raid_disk; +		else +			rdev->raid_disk = -1; + +		rdev->faulty = 0; +		if (rdev->raid_disk < mddev->raid_disks) +			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); +		else +			rdev->in_sync = 0; + +		err = bind_rdev_to_array(rdev, mddev); +		if (err) { +			export_rdev(rdev); +			return err; +		} + +		if (!mddev->persistent) { +			printk(KERN_INFO "md: nonpersistent superblock ...\n"); +			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; +		} else  +			rdev->sb_offset = calc_dev_sboffset(rdev->bdev); +		rdev->size = calc_dev_size(rdev, mddev->chunk_size); + +		if (!mddev->size || (mddev->size > rdev->size)) +			mddev->size = rdev->size; +	} + +	return 0; +} + +static int hot_remove_disk(mddev_t * mddev, dev_t dev) +{ +	char b[BDEVNAME_SIZE]; +	mdk_rdev_t *rdev; + +	if (!mddev->pers) +		return -ENODEV; + +	rdev = find_rdev(mddev, dev); +	if (!rdev) +		return -ENXIO; + +	if (rdev->raid_disk >= 0) +		goto busy; + +	kick_rdev_from_array(rdev); +	md_update_sb(mddev); + +	return 0; +busy: +	printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", +		bdevname(rdev->bdev,b), mdname(mddev)); +	return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev) +{ +	char b[BDEVNAME_SIZE]; +	int err; +	unsigned int size; +	mdk_rdev_t *rdev; + +	if (!mddev->pers) +		return -ENODEV; + +	if (mddev->major_version != 0) { +		printk(KERN_WARNING "%s: HOT_ADD may only be used with" +			" version-0 superblocks.\n", +			mdname(mddev)); +		return -EINVAL; +	} +	if (!mddev->pers->hot_add_disk) { +		printk(KERN_WARNING  +			"%s: personality does not support diskops!\n", +			mdname(mddev)); +		return -EINVAL; +	} + +	rdev = md_import_device (dev, -1, 0); +	if (IS_ERR(rdev)) { +		printk(KERN_WARNING  +			"md: error, md_import_device() returned %ld\n", +			PTR_ERR(rdev)); +		return -EINVAL; +	} + +	if (mddev->persistent) +		rdev->sb_offset = calc_dev_sboffset(rdev->bdev); +	else +		rdev->sb_offset = +			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + +	size = calc_dev_size(rdev, mddev->chunk_size); +	rdev->size = size; + +	if (size < mddev->size) { +		printk(KERN_WARNING  +			"%s: disk size %llu blocks < array size %llu\n", +			mdname(mddev), (unsigned long long)size, +			(unsigned long long)mddev->size); +		err = -ENOSPC; +		goto abort_export; +	} + +	if (rdev->faulty) { +		printk(KERN_WARNING  +			"md: can not hot-add faulty %s disk to %s!\n", +			bdevname(rdev->bdev,b), mdname(mddev)); +		err = -EINVAL; +		goto abort_export; +	} +	rdev->in_sync = 0; +	rdev->desc_nr = -1; +	bind_rdev_to_array(rdev, mddev); + +	/* +	 * The rest should better be atomic, we can have disk failures +	 * noticed in interrupt contexts ... +	 */ + +	if (rdev->desc_nr == mddev->max_disks) { +		printk(KERN_WARNING "%s: can not hot-add to full array!\n", +			mdname(mddev)); +		err = -EBUSY; +		goto abort_unbind_export; +	} + +	rdev->raid_disk = -1; + +	md_update_sb(mddev); + +	/* +	 * Kick recovery, maybe this spare has to be added to the +	 * array immediately. +	 */ +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	md_wakeup_thread(mddev->thread); + +	return 0; + +abort_unbind_export: +	unbind_rdev_from_array(rdev); + +abort_export: +	export_rdev(rdev); +	return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > 0 and it together with + *  level, size, not_persistent,layout,chunksize determine the + *  shape of the array. + *  This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + *  In this case raid_disks will be 0, and the major_version field is + *  use to determine which style super-blocks are to be found on the devices. + *  The minor and patch _version numbers are also kept incase the + *  super_block handler wishes to interpret them. + */ +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + +	if (info->raid_disks == 0) { +		/* just setting version number for superblock loading */ +		if (info->major_version < 0 || +		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || +		    super_types[info->major_version].name == NULL) { +			/* maybe try to auto-load a module? 
*/ +			printk(KERN_INFO  +				"md: superblock version %d not known\n", +				info->major_version); +			return -EINVAL; +		} +		mddev->major_version = info->major_version; +		mddev->minor_version = info->minor_version; +		mddev->patch_version = info->patch_version; +		return 0; +	} +	mddev->major_version = MD_MAJOR_VERSION; +	mddev->minor_version = MD_MINOR_VERSION; +	mddev->patch_version = MD_PATCHLEVEL_VERSION; +	mddev->ctime         = get_seconds(); + +	mddev->level         = info->level; +	mddev->size          = info->size; +	mddev->raid_disks    = info->raid_disks; +	/* don't set md_minor, it is determined by which /dev/md* was +	 * openned +	 */ +	if (info->state & (1<<MD_SB_CLEAN)) +		mddev->recovery_cp = MaxSector; +	else +		mddev->recovery_cp = 0; +	mddev->persistent    = ! info->not_persistent; + +	mddev->layout        = info->layout; +	mddev->chunk_size    = info->chunk_size; + +	mddev->max_disks     = MD_SB_DISKS; + +	mddev->sb_dirty      = 1; + +	/* +	 * Generate a 128 bit UUID +	 */ +	get_random_bytes(mddev->uuid, 16); + +	return 0; +} + +/* + * update_array_info is used to change the configuration of an + * on-line array. + * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size + * fields in the info are checked against the array. + * Any differences that cannot be handled will cause an error. + * Normally, only one change can be managed at a time. + */ +static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) +{ +	int rv = 0; +	int cnt = 0; + +	if (mddev->major_version != info->major_version || +	    mddev->minor_version != info->minor_version || +/*	    mddev->patch_version != info->patch_version || */ +	    mddev->ctime         != info->ctime         || +	    mddev->level         != info->level         || +/*	    mddev->layout        != info->layout        || */ +	    !mddev->persistent	 != info->not_persistent|| +	    mddev->chunk_size    != info->chunk_size    ) +		return -EINVAL; +	/* Check there is only one change */ +	if (mddev->size != info->size) cnt++; +	if (mddev->raid_disks != info->raid_disks) cnt++; +	if (mddev->layout != info->layout) cnt++; +	if (cnt == 0) return 0; +	if (cnt > 1) return -EINVAL; + +	if (mddev->layout != info->layout) { +		/* Change layout +		 * we don't need to do anything at the md level, the +		 * personality will take care of it all. +		 */ +		if (mddev->pers->reconfig == NULL) +			return -EINVAL; +		else +			return mddev->pers->reconfig(mddev, info->layout, -1); +	} +	if (mddev->size != info->size) { +		mdk_rdev_t * rdev; +		struct list_head *tmp; +		if (mddev->pers->resize == NULL) +			return -EINVAL; +		/* The "size" is the amount of each device that is used. +		 * This can only make sense for arrays with redundancy. +		 * linear and raid0 always use whatever space is available +		 * We can only consider changing the size if no resync +		 * or reconstruction is happening, and if the new size +		 * is acceptable. It must fit before the sb_offset or, +		 * if that is <data_offset, it must fit before the +		 * size of each device. +		 * If size is zero, we find the largest size that fits. 
+		 */ +		if (mddev->sync_thread) +			return -EBUSY; +		ITERATE_RDEV(mddev,rdev,tmp) { +			sector_t avail; +			int fit = (info->size == 0); +			if (rdev->sb_offset > rdev->data_offset) +				avail = (rdev->sb_offset*2) - rdev->data_offset; +			else +				avail = get_capacity(rdev->bdev->bd_disk) +					- rdev->data_offset; +			if (fit && (info->size == 0 || info->size > avail/2)) +				info->size = avail/2; +			if (avail < ((sector_t)info->size << 1)) +				return -ENOSPC; +		} +		rv = mddev->pers->resize(mddev, (sector_t)info->size *2); +		if (!rv) { +			struct block_device *bdev; + +			bdev = bdget_disk(mddev->gendisk, 0); +			if (bdev) { +				down(&bdev->bd_inode->i_sem); +				i_size_write(bdev->bd_inode, mddev->array_size << 10); +				up(&bdev->bd_inode->i_sem); +				bdput(bdev); +			} +		} +	} +	if (mddev->raid_disks    != info->raid_disks) { +		/* change the number of raid disks */ +		if (mddev->pers->reshape == NULL) +			return -EINVAL; +		if (info->raid_disks <= 0 || +		    info->raid_disks >= mddev->max_disks) +			return -EINVAL; +		if (mddev->sync_thread) +			return -EBUSY; +		rv = mddev->pers->reshape(mddev, info->raid_disks); +		if (!rv) { +			struct block_device *bdev; + +			bdev = bdget_disk(mddev->gendisk, 0); +			if (bdev) { +				down(&bdev->bd_inode->i_sem); +				i_size_write(bdev->bd_inode, mddev->array_size << 10); +				up(&bdev->bd_inode->i_sem); +				bdput(bdev); +			} +		} +	} +	md_update_sb(mddev); +	return rv; +} + +static int set_disk_faulty(mddev_t *mddev, dev_t dev) +{ +	mdk_rdev_t *rdev; + +	if (mddev->pers == NULL) +		return -ENODEV; + +	rdev = find_rdev(mddev, dev); +	if (!rdev) +		return -ENODEV; + +	md_error(mddev, rdev); +	return 0; +} + +static int md_ioctl(struct inode *inode, struct file *file, +			unsigned int cmd, unsigned long arg) +{ +	int err = 0; +	void __user *argp = (void __user *)arg; +	struct hd_geometry __user *loc = argp; +	mddev_t *mddev = NULL; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EACCES; + +	/* +	 * Commands dealing with the RAID driver but not any +	 * particular array: +	 */ +	switch (cmd) +	{ +		case RAID_VERSION: +			err = get_version(argp); +			goto done; + +		case PRINT_RAID_DEBUG: +			err = 0; +			md_print_devices(); +			goto done; + +#ifndef MODULE +		case RAID_AUTORUN: +			err = 0; +			autostart_arrays(arg); +			goto done; +#endif +		default:; +	} + +	/* +	 * Commands creating/starting a new array: +	 */ + +	mddev = inode->i_bdev->bd_disk->private_data; + +	if (!mddev) { +		BUG(); +		goto abort; +	} + + +	if (cmd == START_ARRAY) { +		/* START_ARRAY doesn't need to lock the array as autostart_array +		 * does the locking, and it could even be a different array +		 */ +		static int cnt = 3; +		if (cnt > 0 ) { +			printk(KERN_WARNING +			       "md: %s(pid %d) used deprecated START_ARRAY ioctl. 
" +			       "This will not be supported beyond 2.6\n", +			       current->comm, current->pid); +			cnt--; +		} +		err = autostart_array(new_decode_dev(arg)); +		if (err) { +			printk(KERN_WARNING "md: autostart failed!\n"); +			goto abort; +		} +		goto done; +	} + +	err = mddev_lock(mddev); +	if (err) { +		printk(KERN_INFO  +			"md: ioctl lock interrupted, reason %d, cmd %d\n", +			err, cmd); +		goto abort; +	} + +	switch (cmd) +	{ +		case SET_ARRAY_INFO: +			{ +				mdu_array_info_t info; +				if (!arg) +					memset(&info, 0, sizeof(info)); +				else if (copy_from_user(&info, argp, sizeof(info))) { +					err = -EFAULT; +					goto abort_unlock; +				} +				if (mddev->pers) { +					err = update_array_info(mddev, &info); +					if (err) { +						printk(KERN_WARNING "md: couldn't update" +						       " array info. %d\n", err); +						goto abort_unlock; +					} +					goto done_unlock; +				} +				if (!list_empty(&mddev->disks)) { +					printk(KERN_WARNING +					       "md: array %s already has disks!\n", +					       mdname(mddev)); +					err = -EBUSY; +					goto abort_unlock; +				} +				if (mddev->raid_disks) { +					printk(KERN_WARNING +					       "md: array %s already initialised!\n", +					       mdname(mddev)); +					err = -EBUSY; +					goto abort_unlock; +				} +				err = set_array_info(mddev, &info); +				if (err) { +					printk(KERN_WARNING "md: couldn't set" +					       " array info. %d\n", err); +					goto abort_unlock; +				} +			} +			goto done_unlock; + +		default:; +	} + +	/* +	 * Commands querying/configuring an existing array: +	 */ +	/* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ +	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { +		err = -ENODEV; +		goto abort_unlock; +	} + +	/* +	 * Commands even a read-only array can execute: +	 */ +	switch (cmd) +	{ +		case GET_ARRAY_INFO: +			err = get_array_info(mddev, argp); +			goto done_unlock; + +		case GET_DISK_INFO: +			err = get_disk_info(mddev, argp); +			goto done_unlock; + +		case RESTART_ARRAY_RW: +			err = restart_array(mddev); +			goto done_unlock; + +		case STOP_ARRAY: +			err = do_md_stop (mddev, 0); +			goto done_unlock; + +		case STOP_ARRAY_RO: +			err = do_md_stop (mddev, 1); +			goto done_unlock; + +	/* +	 * We have a problem here : there is no easy way to give a CHS +	 * virtual geometry. We currently pretend that we have a 2 heads +	 * 4 sectors (with a BIG number of cylinders...). This drives +	 * dosfs just mad... 
;-) +	 */ +		case HDIO_GETGEO: +			if (!loc) { +				err = -EINVAL; +				goto abort_unlock; +			} +			err = put_user (2, (char __user *) &loc->heads); +			if (err) +				goto abort_unlock; +			err = put_user (4, (char __user *) &loc->sectors); +			if (err) +				goto abort_unlock; +			err = put_user(get_capacity(mddev->gendisk)/8, +					(short __user *) &loc->cylinders); +			if (err) +				goto abort_unlock; +			err = put_user (get_start_sect(inode->i_bdev), +						(long __user *) &loc->start); +			goto done_unlock; +	} + +	/* +	 * The remaining ioctls are changing the state of the +	 * superblock, so we do not allow read-only arrays +	 * here: +	 */ +	if (mddev->ro) { +		err = -EROFS; +		goto abort_unlock; +	} + +	switch (cmd) +	{ +		case ADD_NEW_DISK: +		{ +			mdu_disk_info_t info; +			if (copy_from_user(&info, argp, sizeof(info))) +				err = -EFAULT; +			else +				err = add_new_disk(mddev, &info); +			goto done_unlock; +		} + +		case HOT_REMOVE_DISK: +			err = hot_remove_disk(mddev, new_decode_dev(arg)); +			goto done_unlock; + +		case HOT_ADD_DISK: +			err = hot_add_disk(mddev, new_decode_dev(arg)); +			goto done_unlock; + +		case SET_DISK_FAULTY: +			err = set_disk_faulty(mddev, new_decode_dev(arg)); +			goto done_unlock; + +		case RUN_ARRAY: +			err = do_md_run (mddev); +			goto done_unlock; + +		default: +			if (_IOC_TYPE(cmd) == MD_MAJOR) +				printk(KERN_WARNING "md: %s(pid %d) used" +					" obsolete MD ioctl, upgrade your" +					" software to use new ictls.\n", +					current->comm, current->pid); +			err = -EINVAL; +			goto abort_unlock; +	} + +done_unlock: +abort_unlock: +	mddev_unlock(mddev); + +	return err; +done: +	if (err) +		MD_BUG(); +abort: +	return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ +	/* +	 * Succeed if we can lock the mddev, which confirms that +	 * it isn't being stopped right now. +	 */ +	mddev_t *mddev = inode->i_bdev->bd_disk->private_data; +	int err; + +	if ((err = mddev_lock(mddev))) +		goto out; + +	err = 0; +	mddev_get(mddev); +	mddev_unlock(mddev); + +	check_disk_change(inode->i_bdev); + out: +	return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + 	mddev_t *mddev = inode->i_bdev->bd_disk->private_data; + +	if (!mddev) +		BUG(); +	mddev_put(mddev); + +	return 0; +} + +static int md_media_changed(struct gendisk *disk) +{ +	mddev_t *mddev = disk->private_data; + +	return mddev->changed; +} + +static int md_revalidate(struct gendisk *disk) +{ +	mddev_t *mddev = disk->private_data; + +	mddev->changed = 0; +	return 0; +} +static struct block_device_operations md_fops = +{ +	.owner		= THIS_MODULE, +	.open		= md_open, +	.release	= md_release, +	.ioctl		= md_ioctl, +	.media_changed	= md_media_changed, +	.revalidate_disk= md_revalidate, +}; + +int md_thread(void * arg) +{ +	mdk_thread_t *thread = arg; + +	lock_kernel(); + +	/* +	 * Detach thread +	 */ + +	daemonize(thread->name, mdname(thread->mddev)); + +	current->exit_signal = SIGCHLD; +	allow_signal(SIGKILL); +	thread->tsk = current; + +	/* +	 * md_thread is a 'system-thread', it's priority should be very +	 * high. We avoid resource deadlocks individually in each +	 * raid personality. (RAID5 does preallocation) We also use RR and +	 * the very same RT priority as kswapd, thus we will never get +	 * into a priority inversion deadlock. +	 * +	 * we definitely have to have equal or higher priority than +	 * bdflush, otherwise bdflush will deadlock if there are too +	 * many dirty RAID5 blocks. 
+	 */ +	unlock_kernel(); + +	complete(thread->event); +	while (thread->run) { +		void (*run)(mddev_t *); + +		wait_event_interruptible(thread->wqueue, +					 test_bit(THREAD_WAKEUP, &thread->flags)); +		if (current->flags & PF_FREEZE) +			refrigerator(PF_FREEZE); + +		clear_bit(THREAD_WAKEUP, &thread->flags); + +		run = thread->run; +		if (run) +			run(thread->mddev); + +		if (signal_pending(current)) +			flush_signals(current); +	} +	complete(thread->event); +	return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ +	if (thread) { +		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); +		set_bit(THREAD_WAKEUP, &thread->flags); +		wake_up(&thread->wqueue); +	} +} + +mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, +				 const char *name) +{ +	mdk_thread_t *thread; +	int ret; +	struct completion event; + +	thread = (mdk_thread_t *) kmalloc +				(sizeof(mdk_thread_t), GFP_KERNEL); +	if (!thread) +		return NULL; + +	memset(thread, 0, sizeof(mdk_thread_t)); +	init_waitqueue_head(&thread->wqueue); + +	init_completion(&event); +	thread->event = &event; +	thread->run = run; +	thread->mddev = mddev; +	thread->name = name; +	ret = kernel_thread(md_thread, thread, 0); +	if (ret < 0) { +		kfree(thread); +		return NULL; +	} +	wait_for_completion(&event); +	return thread; +} + +static void md_interrupt_thread(mdk_thread_t *thread) +{ +	if (!thread->tsk) { +		MD_BUG(); +		return; +	} +	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); +	send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ +	struct completion event; + +	init_completion(&event); + +	thread->event = &event; +	thread->run = NULL; +	thread->name = NULL; +	md_interrupt_thread(thread); +	wait_for_completion(&event); +	kfree(thread); +} + +void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	if (!mddev) { +		MD_BUG(); +		return; +	} + +	if (!rdev || rdev->faulty) +		return; + +	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", +		mdname(mddev), +		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), +		__builtin_return_address(0),__builtin_return_address(1), +		__builtin_return_address(2),__builtin_return_address(3)); + +	if (!mddev->pers->error_handler) +		return; +	mddev->pers->error_handler(mddev,rdev); +	set_bit(MD_RECOVERY_INTR, &mddev->recovery); +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	md_wakeup_thread(mddev->thread); +} + +/* seq_file implementation /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ +	int i = 0; +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	seq_printf(seq, "unused devices: "); + +	ITERATE_RDEV_PENDING(rdev,tmp) { +		char b[BDEVNAME_SIZE]; +		i++; +		seq_printf(seq, "%s ", +			      bdevname(rdev->bdev,b)); +	} +	if (!i) +		seq_printf(seq, "<none>"); + +	seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ +	unsigned long max_blocks, resync, res, dt, db, rt; + +	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + +	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) +		max_blocks = mddev->resync_max_sectors >> 1; +	else +		max_blocks = mddev->size; + +	/* +	 * Should not happen. 
+	 */ +	if (!max_blocks) { +		MD_BUG(); +		return; +	} +	res = (resync/1024)*1000/(max_blocks/1024 + 1); +	{ +		int i, x = res/50, y = 20-x; +		seq_printf(seq, "["); +		for (i = 0; i < x; i++) +			seq_printf(seq, "="); +		seq_printf(seq, ">"); +		for (i = 0; i < y; i++) +			seq_printf(seq, "."); +		seq_printf(seq, "] "); +	} +	seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", +		      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? +		       "resync" : "recovery"), +		      res/10, res % 10, resync, max_blocks); + +	/* +	 * We do not want to overflow, so the order of operands and +	 * the * 100 / 100 trick are important. We do a +1 to be +	 * safe against division by zero. We only estimate anyway. +	 * +	 * dt: time from mark until now +	 * db: blocks written from mark until now +	 * rt: remaining time +	 */ +	dt = ((jiffies - mddev->resync_mark) / HZ); +	if (!dt) dt++; +	db = resync - (mddev->resync_mark_cnt/2); +	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + +	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + +	seq_printf(seq, " speed=%ldK/sec", db/dt); +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ +	struct list_head *tmp; +	loff_t l = *pos; +	mddev_t *mddev; + +	if (l >= 0x10000) +		return NULL; +	if (!l--) +		/* header */ +		return (void*)1; + +	spin_lock(&all_mddevs_lock); +	list_for_each(tmp,&all_mddevs) +		if (!l--) { +			mddev = list_entry(tmp, mddev_t, all_mddevs); +			mddev_get(mddev); +			spin_unlock(&all_mddevs_lock); +			return mddev; +		} +	spin_unlock(&all_mddevs_lock); +	if (!l--) +		return (void*)2;/* tail */ +	return NULL; +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct list_head *tmp; +	mddev_t *next_mddev, *mddev = v; +	 +	++*pos; +	if (v == (void*)2) +		return NULL; + +	spin_lock(&all_mddevs_lock); +	if (v == (void*)1) +		tmp = all_mddevs.next; +	else +		tmp = mddev->all_mddevs.next; +	if (tmp != &all_mddevs) +		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); +	else { +		next_mddev = (void*)2; +		*pos = 0x10000; +	}		 +	spin_unlock(&all_mddevs_lock); + +	if (v != (void*)1) +		mddev_put(mddev); +	return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ +	mddev_t *mddev = v; + +	if (mddev && v != (void*)1 && v != (void*)2) +		mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ +	mddev_t *mddev = v; +	sector_t size; +	struct list_head *tmp2; +	mdk_rdev_t *rdev; +	int i; + +	if (v == (void*)1) { +		seq_printf(seq, "Personalities : "); +		spin_lock(&pers_lock); +		for (i = 0; i < MAX_PERSONALITY; i++) +			if (pers[i]) +				seq_printf(seq, "[%s] ", pers[i]->name); + +		spin_unlock(&pers_lock); +		seq_printf(seq, "\n"); +		return 0; +	} +	if (v == (void*)2) { +		status_unused(seq); +		return 0; +	} + +	if (mddev_lock(mddev)!=0)  +		return -EINTR; +	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { +		seq_printf(seq, "%s : %sactive", mdname(mddev), +						mddev->pers ? 
"" : "in"); +		if (mddev->pers) { +			if (mddev->ro) +				seq_printf(seq, " (read-only)"); +			seq_printf(seq, " %s", mddev->pers->name); +		} + +		size = 0; +		ITERATE_RDEV(mddev,rdev,tmp2) { +			char b[BDEVNAME_SIZE]; +			seq_printf(seq, " %s[%d]", +				bdevname(rdev->bdev,b), rdev->desc_nr); +			if (rdev->faulty) { +				seq_printf(seq, "(F)"); +				continue; +			} +			size += rdev->size; +		} + +		if (!list_empty(&mddev->disks)) { +			if (mddev->pers) +				seq_printf(seq, "\n      %llu blocks", +					(unsigned long long)mddev->array_size); +			else +				seq_printf(seq, "\n      %llu blocks", +					(unsigned long long)size); +		} + +		if (mddev->pers) { +			mddev->pers->status (seq, mddev); +	 		seq_printf(seq, "\n      "); +			if (mddev->curr_resync > 2) +				status_resync (seq, mddev); +			else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) +				seq_printf(seq, "	resync=DELAYED"); +		} + +		seq_printf(seq, "\n"); +	} +	mddev_unlock(mddev); +	 +	return 0; +} + +static struct seq_operations md_seq_ops = { +	.start  = md_seq_start, +	.next   = md_seq_next, +	.stop   = md_seq_stop, +	.show   = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ +	int error; + +	error = seq_open(file, &md_seq_ops); +	return error; +} + +static struct file_operations md_seq_fops = { +	.open           = md_seq_open, +	.read           = seq_read, +	.llseek         = seq_lseek, +	.release	= seq_release, +}; + +int register_md_personality(int pnum, mdk_personality_t *p) +{ +	if (pnum >= MAX_PERSONALITY) { +		printk(KERN_ERR +		       "md: tried to install personality %s as nr %d, but max is %lu\n", +		       p->name, pnum, MAX_PERSONALITY-1); +		return -EINVAL; +	} + +	spin_lock(&pers_lock); +	if (pers[pnum]) { +		spin_unlock(&pers_lock); +		MD_BUG(); +		return -EBUSY; +	} + +	pers[pnum] = p; +	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); +	spin_unlock(&pers_lock); +	return 0; +} + +int unregister_md_personality(int pnum) +{ +	if (pnum >= MAX_PERSONALITY) { +		MD_BUG(); +		return -EINVAL; +	} + +	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); +	spin_lock(&pers_lock); +	pers[pnum] = NULL; +	spin_unlock(&pers_lock); +	return 0; +} + +static int is_mddev_idle(mddev_t *mddev) +{ +	mdk_rdev_t * rdev; +	struct list_head *tmp; +	int idle; +	unsigned long curr_events; + +	idle = 1; +	ITERATE_RDEV(mddev,rdev,tmp) { +		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; +		curr_events = disk_stat_read(disk, read_sectors) +  +				disk_stat_read(disk, write_sectors) -  +				atomic_read(&disk->sync_io); +		/* Allow some slack between valud of curr_events and last_events, +		 * as there are some uninteresting races. +		 * Note: the following is an unsigned comparison. +		 */ +		if ((curr_events - rdev->last_events + 32) > 64) { +			rdev->last_events = curr_events; +			idle = 0; +		} +	} +	return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ +	/* another "blocks" (512byte) blocks have been synced */ +	atomic_sub(blocks, &mddev->recovery_active); +	wake_up(&mddev->recovery_wait); +	if (!ok) { +		set_bit(MD_RECOVERY_ERR, &mddev->recovery); +		md_wakeup_thread(mddev->thread); +		// stop recovery, signal do_sync .... 
+	} +} + + +void md_write_start(mddev_t *mddev) +{ +	if (!atomic_read(&mddev->writes_pending)) { +		mddev_lock_uninterruptible(mddev); +		if (mddev->in_sync) { +			mddev->in_sync = 0; + 			del_timer(&mddev->safemode_timer); +			md_update_sb(mddev); +		} +		atomic_inc(&mddev->writes_pending); +		mddev_unlock(mddev); +	} else +		atomic_inc(&mddev->writes_pending); +} + +void md_write_end(mddev_t *mddev) +{ +	if (atomic_dec_and_test(&mddev->writes_pending)) { +		if (mddev->safemode == 2) +			md_wakeup_thread(mddev->thread); +		else +			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); +	} +} + +static inline void md_enter_safemode(mddev_t *mddev) +{ +	if (!mddev->safemode) return; +	if (mddev->safemode == 2 && +	    (atomic_read(&mddev->writes_pending) || mddev->in_sync || +		    mddev->recovery_cp != MaxSector)) +		return; /* avoid the lock */ +	mddev_lock_uninterruptible(mddev); +	if (mddev->safemode && !atomic_read(&mddev->writes_pending) && +	    !mddev->in_sync && mddev->recovery_cp == MaxSector) { +		mddev->in_sync = 1; +		md_update_sb(mddev); +	} +	mddev_unlock(mddev); + +	if (mddev->safemode == 1) +		mddev->safemode = 0; +} + +void md_handle_safemode(mddev_t *mddev) +{ +	if (signal_pending(current)) { +		printk(KERN_INFO "md: %s in immediate safe mode\n", +			mdname(mddev)); +		mddev->safemode = 2; +		flush_signals(current); +	} +	md_enter_safemode(mddev); +} + + +DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define SYNC_MARKS	10 +#define	SYNC_MARK_STEP	(3*HZ) +static void md_do_sync(mddev_t *mddev) +{ +	mddev_t *mddev2; +	unsigned int currspeed = 0, +		 window; +	sector_t max_sectors,j; +	unsigned long mark[SYNC_MARKS]; +	sector_t mark_cnt[SYNC_MARKS]; +	int last_mark,m; +	struct list_head *tmp; +	sector_t last_check; + +	/* just incase thread restarts... */ +	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) +		return; + +	/* we overload curr_resync somewhat here. +	 * 0 == not engaged in resync at all +	 * 2 == checking that there is no conflict with another sync +	 * 1 == like 2, but have yielded to allow conflicting resync to +	 *		commense +	 * other == active in resync - this many blocks +	 * +	 * Before starting a resync we must have set curr_resync to +	 * 2, and then checked that every "conflicting" array has curr_resync +	 * less than ours.  When we find one that is the same or higher +	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync +	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). +	 * This will mean we have to start checking from the beginning again. 
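md_write_start() and md_write_end() above are meant to bracket every write a personality issues: the first writer marks the superblock dirty (in_sync = 0) before any data hits disk, and the last writer to drain re-arms the safemode timer so the array can later be marked clean again. A minimal sketch of the calling convention, with hypothetical names (my_make_request, my_write_done):

static int my_make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;

	if (bio_data_dir(bio) == WRITE)
		md_write_start(mddev);	/* array is about to go non-in_sync */

	/* ... map the bio onto member devices and submit it ... */
	return 0;
}

/* and in the completion path, once a WRITE bio has fully finished: */
static void my_write_done(mddev_t *mddev)
{
	md_write_end(mddev);		/* last writer re-arms the safemode timer */
}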
+	 * +	 */ + +	do { +		mddev->curr_resync = 2; + +	try_again: +		if (signal_pending(current)) { +			flush_signals(current); +			goto skip; +		} +		ITERATE_MDDEV(mddev2,tmp) { +			printk("."); +			if (mddev2 == mddev) +				continue; +			if (mddev2->curr_resync &&  +			    match_mddev_units(mddev,mddev2)) { +				DEFINE_WAIT(wq); +				if (mddev < mddev2 && mddev->curr_resync == 2) { +					/* arbitrarily yield */ +					mddev->curr_resync = 1; +					wake_up(&resync_wait); +				} +				if (mddev > mddev2 && mddev->curr_resync == 1) +					/* no need to wait here, we can wait the next +					 * time 'round when curr_resync == 2 +					 */ +					continue; +				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); +				if (!signal_pending(current) +				    && mddev2->curr_resync >= mddev->curr_resync) { +					printk(KERN_INFO "md: delaying resync of %s" +					       " until %s has finished resync (they" +					       " share one or more physical units)\n", +					       mdname(mddev), mdname(mddev2)); +					mddev_put(mddev2); +					schedule(); +					finish_wait(&resync_wait, &wq); +					goto try_again; +				} +				finish_wait(&resync_wait, &wq); +			} +		} +	} while (mddev->curr_resync < 2); + +	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) +		/* resync follows the size requested by the personality, +		 * which default to physical size, but can be virtual size +		 */ +		max_sectors = mddev->resync_max_sectors; +	else +		/* recovery follows the physical size of devices */ +		max_sectors = mddev->size << 1; + +	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); +	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" +		" %d KB/sec/disc.\n", sysctl_speed_limit_min); +	printk(KERN_INFO "md: using maximum available idle IO bandwith " +	       "(but not more than %d KB/sec) for reconstruction.\n", +	       sysctl_speed_limit_max); + +	is_mddev_idle(mddev); /* this also initializes IO event counters */ +	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) +		j = mddev->recovery_cp; +	else +		j = 0; +	for (m = 0; m < SYNC_MARKS; m++) { +		mark[m] = jiffies; +		mark_cnt[m] = j; +	} +	last_mark = 0; +	mddev->resync_mark = mark[last_mark]; +	mddev->resync_mark_cnt = mark_cnt[last_mark]; + +	/* +	 * Tune reconstruction: +	 */ +	window = 32*(PAGE_SIZE/512); +	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", +		window/2,(unsigned long long) max_sectors/2); + +	atomic_set(&mddev->recovery_active, 0); +	init_waitqueue_head(&mddev->recovery_wait); +	last_check = 0; + +	if (j>2) { +		printk(KERN_INFO  +			"md: resuming recovery of %s from checkpoint.\n", +			mdname(mddev)); +		mddev->curr_resync = j; +	} + +	while (j < max_sectors) { +		int sectors; + +		sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); +		if (sectors < 0) { +			set_bit(MD_RECOVERY_ERR, &mddev->recovery); +			goto out; +		} +		atomic_add(sectors, &mddev->recovery_active); +		j += sectors; +		if (j>1) mddev->curr_resync = j; + +		if (last_check + window > j || j == max_sectors) +			continue; + +		last_check = j; + +		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || +		    test_bit(MD_RECOVERY_ERR, &mddev->recovery)) +			break; + +	repeat: +		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { +			/* step marks */ +			int next = (last_mark+1) % SYNC_MARKS; + +			mddev->resync_mark = mark[next]; +			mddev->resync_mark_cnt = mark_cnt[next]; +			mark[next] = jiffies; +			mark_cnt[next] = j - atomic_read(&mddev->recovery_active); +			last_mark = next; +		} + + 
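Two things drive the throttling a few lines below: currspeed, a moving average over roughly the last SYNC_MARKS * SYNC_MARK_STEP = 30 seconds taken from the mark ring stepped above, and is_mddev_idle(), whose unsigned slack test was shown earlier. A worked illustration, assuming HZ=1000 and a hypothetical helper name for the slack test:

/* j and resync_mark_cnt are 512-byte sectors.  Example: 122880 sectors
 * synced in the 30000 jiffies (30 s at HZ=1000) since the retained mark:
 *
 *	currspeed = (122880/2) / (30000/1000 + 1) + 1
 *	          = 61440 / 31 + 1 = 1982 KB/sec
 *
 * At or below sysctl_speed_limit_min md never throttles; between min and
 * max it keeps going only while is_mddev_idle() sees no foreign I/O on
 * the members; above max it always sleeps 250 ms and re-checks.
 */

/* is_mddev_idle()'s "(curr_events - last_events + 32) > 64" is a two-sided window: */
static int moved_more_than_32(unsigned long curr, unsigned long last)
{
	return (curr - last + 32) > 64;	/* |curr - last| > 32, unsigned-safe */
}
/* moved_more_than_32(1030, 1000) == 0	-> within the slack, still idle
 * moved_more_than_32(1100, 1000) == 1	-> foreign I/O, not idle
 * moved_more_than_32( 990, 1040) == 1	-> drifted backwards by more than 32
 */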
+		if (signal_pending(current)) { +			/* +			 * got a signal, exit. +			 */ +			printk(KERN_INFO  +				"md: md_do_sync() got signal ... exiting\n"); +			flush_signals(current); +			set_bit(MD_RECOVERY_INTR, &mddev->recovery); +			goto out; +		} + +		/* +		 * this loop exits only if either when we are slower than +		 * the 'hard' speed limit, or the system was IO-idle for +		 * a jiffy. +		 * the system might be non-idle CPU-wise, but we only care +		 * about not overloading the IO subsystem. (things like an +		 * e2fsck being done on the RAID array should execute fast) +		 */ +		mddev->queue->unplug_fn(mddev->queue); +		cond_resched(); + +		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + +		if (currspeed > sysctl_speed_limit_min) { +			if ((currspeed > sysctl_speed_limit_max) || +					!is_mddev_idle(mddev)) { +				msleep_interruptible(250); +				goto repeat; +			} +		} +	} +	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); +	/* +	 * this also signals 'finished resyncing' to md_stop +	 */ + out: +	mddev->queue->unplug_fn(mddev->queue); + +	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + +	/* tell personality that we are finished */ +	mddev->pers->sync_request(mddev, max_sectors, 1); + +	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && +	    mddev->curr_resync > 2 && +	    mddev->curr_resync >= mddev->recovery_cp) { +		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { +			printk(KERN_INFO  +				"md: checkpointing recovery of %s.\n", +				mdname(mddev)); +			mddev->recovery_cp = mddev->curr_resync; +		} else +			mddev->recovery_cp = MaxSector; +	} + +	md_enter_safemode(mddev); + skip: +	mddev->curr_resync = 0; +	wake_up(&resync_wait); +	set_bit(MD_RECOVERY_DONE, &mddev->recovery); +	md_wakeup_thread(mddev->thread); +} + + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * and wakeups up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). + * + * The overall approach is: + *  1/ if the superblock needs updating, update it. + *  2/ If a recovery thread is running, don't do anything else. + *  3/ If recovery has finished, clean up, possibly marking spares active. + *  4/ If there are any faulty devices, remove them. + *  5/ If array is degraded, try to add spares devices + *  6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(mddev_t *mddev) +{ +	mdk_rdev_t *rdev; +	struct list_head *rtmp; + + +	dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + +	if (mddev->ro) +		return; +	if ( ! 
( +		mddev->sb_dirty || +		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || +		test_bit(MD_RECOVERY_DONE, &mddev->recovery) +		)) +		return; +	if (mddev_trylock(mddev)==0) { +		int spares =0; +		if (mddev->sb_dirty) +			md_update_sb(mddev); +		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && +		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { +			/* resync/recovery still happening */ +			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +			goto unlock; +		} +		if (mddev->sync_thread) { +			/* resync has finished, collect result */ +			md_unregister_thread(mddev->sync_thread); +			mddev->sync_thread = NULL; +			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && +			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { +				/* success...*/ +				/* activate any spares */ +				mddev->pers->spare_active(mddev); +			} +			md_update_sb(mddev); +			mddev->recovery = 0; +			/* flag recovery needed just to double check */ +			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +			goto unlock; +		} +		if (mddev->recovery) +			/* probably just the RECOVERY_NEEDED flag */ +			mddev->recovery = 0; + +		/* no recovery is running. +		 * remove any failed drives, then +		 * add spares if possible. +		 * Spare are also removed and re-added, to allow +		 * the personality to fail the re-add. +		 */ +		ITERATE_RDEV(mddev,rdev,rtmp) +			if (rdev->raid_disk >= 0 && +			    (rdev->faulty || ! rdev->in_sync) && +			    atomic_read(&rdev->nr_pending)==0) { +				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) +					rdev->raid_disk = -1; +			} + +		if (mddev->degraded) { +			ITERATE_RDEV(mddev,rdev,rtmp) +				if (rdev->raid_disk < 0 +				    && !rdev->faulty) { +					if (mddev->pers->hot_add_disk(mddev,rdev)) +						spares++; +					else +						break; +				} +		} + +		if (!spares && (mddev->recovery_cp == MaxSector )) { +			/* nothing we can do ... */ +			goto unlock; +		} +		if (mddev->pers->sync_request) { +			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +			if (!spares) +				set_bit(MD_RECOVERY_SYNC, &mddev->recovery); +			mddev->sync_thread = md_register_thread(md_do_sync, +								mddev, +								"%s_resync"); +			if (!mddev->sync_thread) { +				printk(KERN_ERR "%s: could not start resync" +					" thread...\n",  +					mdname(mddev)); +				/* leave the spares where they are, it shouldn't hurt */ +				mddev->recovery = 0; +			} else { +				md_wakeup_thread(mddev->sync_thread); +			} +		} +	unlock: +		mddev_unlock(mddev); +	} +} + +int md_notify_reboot(struct notifier_block *this, +					unsigned long code, void *x) +{ +	struct list_head *tmp; +	mddev_t *mddev; + +	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + +		printk(KERN_INFO "md: stopping all md devices.\n"); + +		ITERATE_MDDEV(mddev,tmp) +			if (mddev_trylock(mddev)==0) +				do_md_stop (mddev, 1); +		/* +		 * certain more exotic SCSI devices are known to be +		 * volatile wrt too early system reboots. While the +		 * right place to handle this issue is the given +		 * driver, we do want to have a safe RAID driver ... 
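As the comment above md_check_recovery() says, the per-array personality threads drive it: calling it at the top of every wakeup is what gets the superblock written, spares shuffled, and the "%s_resync" thread forked and later reaped. Filling in the body of the hypothetical my_raidd from the earlier sketch (multipathd later in this patch does exactly this):

static void my_raidd(mddev_t *mddev)
{
	md_check_recovery(mddev);	/* sb update, spare add/remove,
					 * start or reap the resync thread */

	/* ... then drain this personality's own retry/work list ... */
}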
+		 */ +		mdelay(1000*1); +	} +	return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { +	.notifier_call	= md_notify_reboot, +	.next		= NULL, +	.priority	= INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ +	struct proc_dir_entry *p; + +	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +	p = create_proc_entry("mdstat", S_IRUGO, NULL); +	if (p) +		p->proc_fops = &md_seq_fops; +} + +int __init md_init(void) +{ +	int minor; + +	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," +			" MD_SB_DISKS=%d\n", +			MD_MAJOR_VERSION, MD_MINOR_VERSION, +			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + +	if (register_blkdev(MAJOR_NR, "md")) +		return -1; +	if ((mdp_major=register_blkdev(0, "mdp"))<=0) { +		unregister_blkdev(MAJOR_NR, "md"); +		return -1; +	} +	devfs_mk_dir("md"); +	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, +				md_probe, NULL, NULL); +	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, +			    md_probe, NULL, NULL); + +	for (minor=0; minor < MAX_MD_DEVS; ++minor) +		devfs_mk_bdev(MKDEV(MAJOR_NR, minor), +				S_IFBLK|S_IRUSR|S_IWUSR, +				"md/%d", minor); + +	for (minor=0; minor < MAX_MD_DEVS; ++minor) +		devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), +			      S_IFBLK|S_IRUSR|S_IWUSR, +			      "md/mdp%d", minor); + + +	register_reboot_notifier(&md_notifier); +	raid_table_header = register_sysctl_table(raid_root_table, 1); + +	md_geninit(); +	return (0); +} + + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static dev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(dev_t dev) +{ +	if (dev_cnt >= 0 && dev_cnt < 127) +		detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(int part) +{ +	mdk_rdev_t *rdev; +	int i; + +	printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + +	for (i = 0; i < dev_cnt; i++) { +		dev_t dev = detected_devices[i]; + +		rdev = md_import_device(dev,0, 0); +		if (IS_ERR(rdev)) +			continue; + +		if (rdev->faulty) { +			MD_BUG(); +			continue; +		} +		list_add(&rdev->same_set, &pending_raid_disks); +	} +	dev_cnt = 0; + +	autorun_devices(part); +} + +#endif + +static __exit void md_exit(void) +{ +	mddev_t *mddev; +	struct list_head *tmp; +	int i; +	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); +	blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); +	for (i=0; i < MAX_MD_DEVS; i++) +		devfs_remove("md/%d", i); +	for (i=0; i < MAX_MD_DEVS; i++) +		devfs_remove("md/d%d", i); + +	devfs_remove("md"); + +	unregister_blkdev(MAJOR_NR,"md"); +	unregister_blkdev(mdp_major, "mdp"); +	unregister_reboot_notifier(&md_notifier); +	unregister_sysctl_table(raid_table_header); +	remove_proc_entry("mdstat", NULL); +	ITERATE_MDDEV(mddev,tmp) { +		struct gendisk *disk = mddev->gendisk; +		if (!disk) +			continue; +		export_array(mddev); +		del_gendisk(disk); +		put_disk(disk); +		mddev->gendisk = NULL; +		mddev_put(mddev); +	} +} + +module_init(md_init) +module_exit(md_exit) + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_handle_safemode); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_print_devices); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); diff --git 
a/drivers/md/mktables.c b/drivers/md/mktables.c new file mode 100644 index 00000000000..adef299908c --- /dev/null +++ b/drivers/md/mktables.c @@ -0,0 +1,125 @@ +#ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $" +/* ----------------------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * mktables.c + * + * Make RAID-6 tables.  This is a host user space program to be run at + * compile time. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <stdlib.h> +#include <time.h> + +static uint8_t gfmul(uint8_t a, uint8_t b) +{ +  uint8_t v = 0; + +  while ( b ) { +    if ( b & 1 ) v ^= a; +    a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); +    b >>= 1; +  } +  return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ +  uint8_t v = 1; + +  b %= 255; +  if ( b < 0 ) +    b += 255; + +  while ( b ) { +    if ( b & 1 ) v = gfmul(v,a); +    a = gfmul(a,a); +    b >>= 1; +  } +  return v; +} + +int main(int argc, char *argv[]) +{ +  int i, j, k; +  uint8_t v; +  uint8_t exptbl[256], invtbl[256]; + +  printf("#include \"raid6.h\"\n"); + +  /* Compute multiplication table */ +  printf("\nconst u8  __attribute__((aligned(256)))\n" +	 "raid6_gfmul[256][256] =\n" +	 "{\n"); +  for ( i = 0 ; i < 256 ; i++ ) { +    printf("\t{\n"); +    for ( j = 0 ; j < 256 ; j += 8 ) { +      printf("\t\t"); +      for ( k = 0 ; k < 8 ; k++ ) { +	printf("0x%02x, ", gfmul(i,j+k)); +      } +      printf("\n"); +    } +    printf("\t},\n"); +  } +  printf("};\n"); + +  /* Compute power-of-2 table (exponent) */ +  v = 1; +  printf("\nconst u8 __attribute__((aligned(256)))\n" +	 "raid6_gfexp[256] =\n" +	 "{\n"); +  for ( i = 0 ; i < 256 ; i += 8 ) { +    printf("\t"); +    for ( j = 0 ; j < 8 ; j++ ) { +      exptbl[i+j] = v; +      printf("0x%02x, ", v); +      v = gfmul(v,2); +      if ( v == 1 ) v = 0;	/* For entry 255, not a real entry */ +    } +    printf("\n"); +  } +  printf("};\n"); + +  /* Compute inverse table x^-1 == x^254 */ +  printf("\nconst u8 __attribute__((aligned(256)))\n" +	 "raid6_gfinv[256] =\n" +	 "{\n"); +  for ( i = 0 ; i < 256 ; i += 8 ) { +    printf("\t"); +    for ( j = 0 ; j < 8 ; j++ ) { +      invtbl[i+j] = v = gfpow(i+j,254); +      printf("0x%02x, ", v); +    } +    printf("\n"); +  } +  printf("};\n"); + +  /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ +  printf("\nconst u8 __attribute__((aligned(256)))\n" +	 "raid6_gfexi[256] =\n" +	 "{\n"); +  for ( i = 0 ; i < 256 ; i += 8 ) { +    printf("\t"); +    for ( j = 0 ; j < 8 ; j++ ) { +      printf("0x%02x, ", invtbl[exptbl[i+j]^1]); +    } +    printf("\n"); +  } +  printf("};\n\n"); + +  return 0; +} diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c new file mode 100644 index 00000000000..c9b134cd153 --- /dev/null +++ b/drivers/md/multipath.c @@ -0,0 +1,584 @@ +/* + * multipath.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * MULTIPATH management 
functions. + * + * derived from raid1.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/raid/multipath.h> +#include <linux/buffer_head.h> +#include <asm/atomic.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define MAX_WORK_PER_DISK 128 + +#define	NR_RESERVED_BUFS	32 + + +static mdk_personality_t multipath_personality; + + +static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ +	struct multipath_bh *mpb; +	mpb = kmalloc(sizeof(*mpb), gfp_flags); +	if (mpb)  +		memset(mpb, 0, sizeof(*mpb)); +	return mpb; +} + +static void mp_pool_free(void *mpb, void *data) +{ +	kfree(mpb); +} + +static int multipath_map (multipath_conf_t *conf) +{ +	int i, disks = conf->raid_disks; + +	/* +	 * Later we do read balancing on the read side  +	 * now we use the first available disk. +	 */ + +	rcu_read_lock(); +	for (i = 0; i < disks; i++) { +		mdk_rdev_t *rdev = conf->multipaths[i].rdev; +		if (rdev && rdev->in_sync) { +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); +			return i; +		} +	} +	rcu_read_unlock(); + +	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); +	return (-1); +} + +static void multipath_reschedule_retry (struct multipath_bh *mp_bh) +{ +	unsigned long flags; +	mddev_t *mddev = mp_bh->mddev; +	multipath_conf_t *conf = mddev_to_conf(mddev); + +	spin_lock_irqsave(&conf->device_lock, flags); +	list_add(&mp_bh->retry_list, &conf->retry_list); +	spin_unlock_irqrestore(&conf->device_lock, flags); +	md_wakeup_thread(mddev->thread); +} + + +/* + * multipath_end_bh_io() is called when we have finished servicing a multipathed + * operation and are ready to return a success/failure code to the buffer + * cache layer. 
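mktables.c above is plain GF(2^8) arithmetic over the RAID-6 polynomial 0x11d: gfmul() folds the 0x1d feedback in on every doubling, and the inverse table is gfpow(x, 254) because every non-zero element satisfies x^255 = 1. A hypothetical self-check (not part of the patch) that could sit just before the final return 0 in its main(), after invtbl[] has been filled:

	/* every non-zero element times its computed inverse must be 1,
	 * and doubling 0x80 must fold in the feedback term:
	 * 0x80 << 1 = 0x100, 0x100 ^ 0x11d = 0x1d
	 */
	for (i = 1; i < 256; i++) {
		if (gfmul(i, invtbl[i]) != 1) {
			fprintf(stderr, "bad inverse for 0x%02x\n", i);
			return 1;
		}
	}
	if (gfmul(0x80, 2) != 0x1d)
		return 1;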
+ */ +static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) +{ +	struct bio *bio = mp_bh->master_bio; +	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + +	bio_endio(bio, bio->bi_size, err); +	mempool_free(mp_bh, conf->pool); +} + +int multipath_end_request(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); +	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); +	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; + +	if (bio->bi_size) +		return 1; + +	if (uptodate) +		multipath_end_bh_io(mp_bh, 0); +	else if (!bio_rw_ahead(bio)) { +		/* +		 * oops, IO error: +		 */ +		char b[BDEVNAME_SIZE]; +		md_error (mp_bh->mddev, rdev); +		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",  +		       bdevname(rdev->bdev,b),  +		       (unsigned long long)bio->bi_sector); +		multipath_reschedule_retry(mp_bh); +	} else +		multipath_end_bh_io(mp_bh, error); +	rdev_dec_pending(rdev, conf->mddev); +	return 0; +} + +static void unplug_slaves(mddev_t *mddev) +{ +	multipath_conf_t *conf = mddev_to_conf(mddev); +	int i; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks; i++) { +		mdk_rdev_t *rdev = conf->multipaths[i].rdev; +		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { +			request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); + +			if (r_queue->unplug_fn) +				r_queue->unplug_fn(r_queue); + +			rdev_dec_pending(rdev, mddev); +			rcu_read_lock(); +		} +	} +	rcu_read_unlock(); +} + +static void multipath_unplug(request_queue_t *q) +{ +	unplug_slaves(q->queuedata); +} + + +static int multipath_make_request (request_queue_t *q, struct bio * bio) +{ +	mddev_t *mddev = q->queuedata; +	multipath_conf_t *conf = mddev_to_conf(mddev); +	struct multipath_bh * mp_bh; +	struct multipath_info *multipath; + +	mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + +	mp_bh->master_bio = bio; +	mp_bh->mddev = mddev; + +	if (bio_data_dir(bio)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); +	} + +	mp_bh->path = multipath_map(conf); +	if (mp_bh->path < 0) { +		bio_endio(bio, bio->bi_size, -EIO); +		mempool_free(mp_bh, conf->pool); +		return 0; +	} +	multipath = conf->multipaths + mp_bh->path; + +	mp_bh->bio = *bio; +	mp_bh->bio.bi_sector += multipath->rdev->data_offset; +	mp_bh->bio.bi_bdev = multipath->rdev->bdev; +	mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); +	mp_bh->bio.bi_end_io = multipath_end_request; +	mp_bh->bio.bi_private = mp_bh; +	generic_make_request(&mp_bh->bio); +	return 0; +} + +static void multipath_status (struct seq_file *seq, mddev_t *mddev) +{ +	multipath_conf_t *conf = mddev_to_conf(mddev); +	int i; +	 +	seq_printf (seq, " [%d/%d] [", conf->raid_disks, +						 conf->working_disks); +	for (i = 0; i < conf->raid_disks; i++) +		seq_printf (seq, "%s", +			       conf->multipaths[i].rdev &&  +			       conf->multipaths[i].rdev->in_sync ? 
"U" : "_"); +	seq_printf (seq, "]"); +} + +static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, +				 sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	multipath_conf_t *conf = mddev_to_conf(mddev); +	int i, ret = 0; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		mdk_rdev_t *rdev = conf->multipaths[i].rdev; +		if (rdev && !rdev->faulty) { +			struct block_device *bdev = rdev->bdev; +			request_queue_t *r_queue = bdev_get_queue(bdev); + +			if (!r_queue->issue_flush_fn) +				ret = -EOPNOTSUPP; +			else { +				atomic_inc(&rdev->nr_pending); +				rcu_read_unlock(); +				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, +							      error_sector); +				rdev_dec_pending(rdev, mddev); +				rcu_read_lock(); +			} +		} +	} +	rcu_read_unlock(); +	return ret; +} + +/* + * Careful, this can execute in IRQ contexts as well! + */ +static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) +{ +	multipath_conf_t *conf = mddev_to_conf(mddev); + +	if (conf->working_disks <= 1) { +		/* +		 * Uh oh, we can do nothing if this is our last path, but +		 * first check if this is a queued request for a device +		 * which has just failed. +		 */ +		printk(KERN_ALERT  +			"multipath: only one IO path left and IO error.\n"); +		/* leave it active... it's all we have */ +	} else { +		/* +		 * Mark disk as unusable +		 */ +		if (!rdev->faulty) { +			char b[BDEVNAME_SIZE]; +			rdev->in_sync = 0; +			rdev->faulty = 1; +			mddev->sb_dirty = 1; +			conf->working_disks--; +			printk(KERN_ALERT "multipath: IO failure on %s," +				" disabling IO path. \n	Operation continuing" +				" on %d IO paths.\n", +				bdevname (rdev->bdev,b), +				conf->working_disks); +		} +	} +} + +static void print_multipath_conf (multipath_conf_t *conf) +{ +	int i; +	struct multipath_info *tmp; + +	printk("MULTIPATH conf printout:\n"); +	if (!conf) { +		printk("(conf==NULL)\n"); +		return; +	} +	printk(" --- wd:%d rd:%d\n", conf->working_disks, +			 conf->raid_disks); + +	for (i = 0; i < conf->raid_disks; i++) { +		char b[BDEVNAME_SIZE]; +		tmp = conf->multipaths + i; +		if (tmp->rdev) +			printk(" disk%d, o:%d, dev:%s\n", +				i,!tmp->rdev->faulty, +			       bdevname(tmp->rdev->bdev,b)); +	} +} + + +static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	multipath_conf_t *conf = mddev->private; +	int found = 0; +	int path; +	struct multipath_info *p; + +	print_multipath_conf(conf); + +	for (path=0; path<mddev->raid_disks; path++)  +		if ((p=conf->multipaths+path)->rdev == NULL) { +			blk_queue_stack_limits(mddev->queue, +					       rdev->bdev->bd_disk->queue); + +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, so limit ->max_sector to one PAGE, as +		 * a one page request is never in violation. +		 * (Note: it is very unlikely that a device with +		 * merge_bvec_fn will be involved in multipath.) 
+		 */ +			if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +			    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +			conf->working_disks++; +			rdev->raid_disk = path; +			rdev->in_sync = 1; +			p->rdev = rdev; +			found = 1; +		} + +	print_multipath_conf(conf); +	return found; +} + +static int multipath_remove_disk(mddev_t *mddev, int number) +{ +	multipath_conf_t *conf = mddev->private; +	int err = 0; +	mdk_rdev_t *rdev; +	struct multipath_info *p = conf->multipaths + number; + +	print_multipath_conf(conf); + +	rdev = p->rdev; +	if (rdev) { +		if (rdev->in_sync || +		    atomic_read(&rdev->nr_pending)) { +			printk(KERN_ERR "hot-remove-disk, slot %d is identified"				" but is still operational!\n", number); +			err = -EBUSY; +			goto abort; +		} +		p->rdev = NULL; +		synchronize_kernel(); +		if (atomic_read(&rdev->nr_pending)) { +			/* lost the race, try later */ +			err = -EBUSY; +			p->rdev = rdev; +		} +	} +abort: + +	print_multipath_conf(conf); +	return err; +} + + + +/* + * This is a kernel thread which: + * + *	1.	Retries failed read operations on working multipaths. + *	2.	Updates the raid superblock when problems encounter. + *	3.	Performs writes following reads for array syncronising. + */ + +static void multipathd (mddev_t *mddev) +{ +	struct multipath_bh *mp_bh; +	struct bio *bio; +	unsigned long flags; +	multipath_conf_t *conf = mddev_to_conf(mddev); +	struct list_head *head = &conf->retry_list; + +	md_check_recovery(mddev); +	for (;;) { +		char b[BDEVNAME_SIZE]; +		spin_lock_irqsave(&conf->device_lock, flags); +		if (list_empty(head)) +			break; +		mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); +		list_del(head->prev); +		spin_unlock_irqrestore(&conf->device_lock, flags); + +		bio = &mp_bh->bio; +		bio->bi_sector = mp_bh->master_bio->bi_sector; +		 +		if ((mp_bh->path = multipath_map (conf))<0) { +			printk(KERN_ALERT "multipath: %s: unrecoverable IO read" +				" error for block %llu\n", +				bdevname(bio->bi_bdev,b), +				(unsigned long long)bio->bi_sector); +			multipath_end_bh_io(mp_bh, -EIO); +		} else { +			printk(KERN_ERR "multipath: %s: redirecting sector %llu" +				" to another IO path\n", +				bdevname(bio->bi_bdev,b), +				(unsigned long long)bio->bi_sector); +			*bio = *(mp_bh->master_bio); +			bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; +			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; +			bio->bi_rw |= (1 << BIO_RW_FAILFAST); +			bio->bi_end_io = multipath_end_request; +			bio->bi_private = mp_bh; +			generic_make_request(bio); +		} +	} +	spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static int multipath_run (mddev_t *mddev) +{ +	multipath_conf_t *conf; +	int disk_idx; +	struct multipath_info *disk; +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	if (mddev->level != LEVEL_MULTIPATH) { +		printk("multipath: %s: raid level not set to multipath IO (%d)\n", +		       mdname(mddev), mddev->level); +		goto out; +	} +	/* +	 * copy the already verified devices into our private MULTIPATH +	 * bookkeeping area. 
[whatever we allocate in multipath_run(), +	 * should be freed in multipath_stop()] +	 */ + +	conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); +	mddev->private = conf; +	if (!conf) { +		printk(KERN_ERR  +			"multipath: couldn't allocate memory for %s\n", +			mdname(mddev)); +		goto out; +	} +	memset(conf, 0, sizeof(*conf)); + +	conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, +				   GFP_KERNEL); +	if (!conf->multipaths) { +		printk(KERN_ERR  +			"multipath: couldn't allocate memory for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} +	memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); + +	mddev->queue->unplug_fn = multipath_unplug; + +	mddev->queue->issue_flush_fn = multipath_issue_flush; + +	conf->working_disks = 0; +	ITERATE_RDEV(mddev,rdev,tmp) { +		disk_idx = rdev->raid_disk; +		if (disk_idx < 0 || +		    disk_idx >= mddev->raid_disks) +			continue; + +		disk = conf->multipaths + disk_idx; +		disk->rdev = rdev; + +		blk_queue_stack_limits(mddev->queue, +				       rdev->bdev->bd_disk->queue); +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, not that we ever expect a device with +		 * a merge_bvec_fn to be involved in multipath */ +		if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +		    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +		if (!rdev->faulty)  +			conf->working_disks++; +	} + +	conf->raid_disks = mddev->raid_disks; +	mddev->sb_dirty = 1; +	conf->mddev = mddev; +	spin_lock_init(&conf->device_lock); +	INIT_LIST_HEAD(&conf->retry_list); + +	if (!conf->working_disks) { +		printk(KERN_ERR "multipath: no operational IO paths for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} +	mddev->degraded = conf->raid_disks = conf->working_disks; + +	conf->pool = mempool_create(NR_RESERVED_BUFS, +				    mp_pool_alloc, mp_pool_free, +				    NULL); +	if (conf->pool == NULL) { +		printk(KERN_ERR  +			"multipath: couldn't allocate memory for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} + +	{ +		mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); +		if (!mddev->thread) { +			printk(KERN_ERR "multipath: couldn't allocate thread" +				" for %s\n", mdname(mddev)); +			goto out_free_conf; +		} +	} + +	printk(KERN_INFO  +		"multipath: array %s active with %d out of %d IO paths\n", +		mdname(mddev), conf->working_disks, mddev->raid_disks); +	/* +	 * Ok, everything is just fine now +	 */ +	mddev->array_size = mddev->size; +	return 0; + +out_free_conf: +	if (conf->pool) +		mempool_destroy(conf->pool); +	if (conf->multipaths) +		kfree(conf->multipaths); +	kfree(conf); +	mddev->private = NULL; +out: +	return -EIO; +} + + +static int multipath_stop (mddev_t *mddev) +{ +	multipath_conf_t *conf = mddev_to_conf(mddev); + +	md_unregister_thread(mddev->thread); +	mddev->thread = NULL; +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	mempool_destroy(conf->pool); +	kfree(conf->multipaths); +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + +static mdk_personality_t multipath_personality= +{ +	.name		= "multipath", +	.owner		= THIS_MODULE, +	.make_request	= multipath_make_request, +	.run		= multipath_run, +	.stop		= multipath_stop, +	.status		= multipath_status, +	.error_handler	= multipath_error, +	.hot_add_disk	= multipath_add_disk, +	.hot_remove_disk= multipath_remove_disk, +}; + +static int __init multipath_init (void) +{ +	return register_md_personality (MULTIPATH, &multipath_personality); +} + 
+static void __exit multipath_exit (void) +{ +	unregister_md_personality (MULTIPATH); +} + +module_init(multipath_init); +module_exit(multipath_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 00000000000..e7d934eca06 --- /dev/null +++ b/drivers/md/raid0.c @@ -0,0 +1,539 @@ +/* +   raid0.c : Multiple Devices driver for Linux +             Copyright (C) 1994-96 Marc ZYNGIER +	     <zyngier@ufr-info-p7.ibp.fr> or +	     <maz@gloups.fdn.fr> +             Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + + +   RAID-0 management functions. + +   This program is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 2, or (at your option) +   any later version. +    +   You should have received a copy of the GNU General Public License +   (for example /usr/src/linux/COPYING); if not, write to the Free +   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   +*/ + +#include <linux/module.h> +#include <linux/raid/raid0.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static void raid0_unplug(request_queue_t *q) +{ +	mddev_t *mddev = q->queuedata; +	raid0_conf_t *conf = mddev_to_conf(mddev); +	mdk_rdev_t **devlist = conf->strip_zone[0].dev; +	int i; + +	for (i=0; i<mddev->raid_disks; i++) { +		request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev); + +		if (r_queue->unplug_fn) +			r_queue->unplug_fn(r_queue); +	} +} + +static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, +			     sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	raid0_conf_t *conf = mddev_to_conf(mddev); +	mdk_rdev_t **devlist = conf->strip_zone[0].dev; +	int i, ret = 0; + +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		struct block_device *bdev = devlist[i]->bdev; +		request_queue_t *r_queue = bdev_get_queue(bdev); + +		if (!r_queue->issue_flush_fn) +			ret = -EOPNOTSUPP; +		else +			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); +	} +	return ret; +} + + +static int create_strip_zones (mddev_t *mddev) +{ +	int i, c, j; +	sector_t current_offset, curr_zone_offset; +	sector_t min_spacing; +	raid0_conf_t *conf = mddev_to_conf(mddev); +	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; +	struct list_head *tmp1, *tmp2; +	struct strip_zone *zone; +	int cnt; +	char b[BDEVNAME_SIZE]; +  +	/* +	 * The number of 'same size groups' +	 */ +	conf->nr_strip_zones = 0; +  +	ITERATE_RDEV(mddev,rdev1,tmp1) { +		printk("raid0: looking at %s\n", +			bdevname(rdev1->bdev,b)); +		c = 0; +		ITERATE_RDEV(mddev,rdev2,tmp2) { +			printk("raid0:   comparing %s(%llu)", +			       bdevname(rdev1->bdev,b), +			       (unsigned long long)rdev1->size); +			printk(" with %s(%llu)\n", +			       bdevname(rdev2->bdev,b), +			       (unsigned long long)rdev2->size); +			if (rdev2 == rdev1) { +				printk("raid0:   END\n"); +				break; +			} +			if (rdev2->size == rdev1->size) +			{ +				/* +				 * Not unique, don't count it as a new +				 * group +				 */ +				printk("raid0:   EQUAL\n"); +				c = 1; +				break; +			} +			printk("raid0:   NOT EQUAL\n"); +		} +		if (!c) { +			printk("raid0:   ==> UNIQUE\n"); +			conf->nr_strip_zones++; +			printk("raid0: %d zones\n", conf->nr_strip_zones); +		} +	} +	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); + +	conf->strip_zone = kmalloc(sizeof(struct strip_zone)* +				conf->nr_strip_zones, 
GFP_KERNEL); +	if (!conf->strip_zone) +		return 1; +	conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* +				conf->nr_strip_zones*mddev->raid_disks, +				GFP_KERNEL); +	if (!conf->devlist) +		return 1; + +	memset(conf->strip_zone, 0,sizeof(struct strip_zone)* +				   conf->nr_strip_zones); +	memset(conf->devlist, 0, +	       sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks); + +	/* The first zone must contain all devices, so here we check that +	 * there is a proper alignment of slots to devices and find them all +	 */ +	zone = &conf->strip_zone[0]; +	cnt = 0; +	smallest = NULL; +	zone->dev = conf->devlist; +	ITERATE_RDEV(mddev, rdev1, tmp1) { +		int j = rdev1->raid_disk; + +		if (j < 0 || j >= mddev->raid_disks) { +			printk("raid0: bad disk number %d - aborting!\n", j); +			goto abort; +		} +		if (zone->dev[j]) { +			printk("raid0: multiple devices for %d - aborting!\n", +				j); +			goto abort; +		} +		zone->dev[j] = rdev1; + +		blk_queue_stack_limits(mddev->queue, +				       rdev1->bdev->bd_disk->queue); +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, so limit ->max_sector to one PAGE, as +		 * a one page request is never in violation. +		 */ + +		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && +		    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +		if (!smallest || (rdev1->size <smallest->size)) +			smallest = rdev1; +		cnt++; +	} +	if (cnt != mddev->raid_disks) { +		printk("raid0: too few disks (%d of %d) - aborting!\n", +			cnt, mddev->raid_disks); +		goto abort; +	} +	zone->nb_dev = cnt; +	zone->size = smallest->size * cnt; +	zone->zone_offset = 0; + +	current_offset = smallest->size; +	curr_zone_offset = zone->size; + +	/* now do the other zones */ +	for (i = 1; i < conf->nr_strip_zones; i++) +	{ +		zone = conf->strip_zone + i; +		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; + +		printk("raid0: zone %d\n", i); +		zone->dev_offset = current_offset; +		smallest = NULL; +		c = 0; + +		for (j=0; j<cnt; j++) { +			char b[BDEVNAME_SIZE]; +			rdev = conf->strip_zone[0].dev[j]; +			printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); +			if (rdev->size > current_offset) +			{ +				printk(" contained as device %d\n", c); +				zone->dev[c] = rdev; +				c++; +				if (!smallest || (rdev->size <smallest->size)) { +					smallest = rdev; +					printk("  (%llu) is smallest!.\n",  +						(unsigned long long)rdev->size); +				} +			} else +				printk(" nope.\n"); +		} + +		zone->nb_dev = c; +		zone->size = (smallest->size - current_offset) * c; +		printk("raid0: zone->nb_dev: %d, size: %llu\n", +			zone->nb_dev, (unsigned long long)zone->size); + +		zone->zone_offset = curr_zone_offset; +		curr_zone_offset += zone->size; + +		current_offset = smallest->size; +		printk("raid0: current zone offset: %llu\n", +			(unsigned long long)current_offset); +	} + +	/* Now find appropriate hash spacing. +	 * We want a number which causes most hash entries to cover +	 * at most two strips, but the hash table must be at most +	 * 1 PAGE.  We choose the smallest strip, or contiguous collection +	 * of strips, that has big enough size.  We never consider the last +	 * strip though as it's size has no bearing on the efficacy of the hash +	 * table. 
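A worked example of the zone layout built above, assuming two member devices of 100 GB and 150 GB (rdev->size and zone sizes are in 1 KB units in the code; GB is used here only for readability):

/* Two distinct sizes -> nr_strip_zones = 2.
 *
 *	zone 0: nb_dev = 2, dev_offset = 0,       size = 2 * 100 GB = 200 GB
 *	zone 1: nb_dev = 1, dev_offset = 100 GB,  size = 1 *  50 GB =  50 GB
 *
 * zone_offset covers 0..200 GB and 200..250 GB of the array's address
 * space; the hash table built in raid0_run() below maps an array address
 * to its zone with one lookup plus at most a short forward scan
 * ("while (block >= zone_offset + size) zone++").
 */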
+	 */ +	conf->hash_spacing = curr_zone_offset; +	min_spacing = curr_zone_offset; +	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); +	for (i=0; i < conf->nr_strip_zones-1; i++) { +		sector_t sz = 0; +		for (j=i; j<conf->nr_strip_zones-1 && +			     sz < min_spacing ; j++) +			sz += conf->strip_zone[j].size; +		if (sz >= min_spacing && sz < conf->hash_spacing) +			conf->hash_spacing = sz; +	} + +	mddev->queue->unplug_fn = raid0_unplug; + +	mddev->queue->issue_flush_fn = raid0_issue_flush; + +	printk("raid0: done.\n"); +	return 0; + abort: +	return 1; +} + +/** + *	raid0_mergeable_bvec -- tell bio layer if a two requests can be merged + *	@q: request queue + *	@bio: the buffer head that's been built up so far + *	@biovec: the request that could be merged to it. + * + *	Return amount of bytes we can accept at this offset + */ +static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ +	mddev_t *mddev = q->queuedata; +	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); +	int max; +	unsigned int chunk_sectors = mddev->chunk_size >> 9; +	unsigned int bio_sectors = bio->bi_size >> 9; + +	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; +	if (max < 0) max = 0; /* bio_add cannot handle a negative return */ +	if (max <= biovec->bv_len && bio_sectors == 0) +		return biovec->bv_len; +	else  +		return max; +} + +static int raid0_run (mddev_t *mddev) +{ +	unsigned  cur=0, i=0, nb_zone; +	s64 size; +	raid0_conf_t *conf; +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	printk("%s: setting max_sectors to %d, segment boundary to %d\n", +	       mdname(mddev), +	       mddev->chunk_size >> 9, +	       (mddev->chunk_size>>1)-1); +	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); +	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); + +	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); +	if (!conf) +		goto out; +	mddev->private = (void *)conf; +  +	conf->strip_zone = NULL; +	conf->devlist = NULL; +	if (create_strip_zones (mddev))  +		goto out_free_conf; + +	/* calculate array device size */ +	mddev->array_size = 0; +	ITERATE_RDEV(mddev,rdev,tmp) +		mddev->array_size += rdev->size; + +	printk("raid0 : md_size is %llu blocks.\n",  +		(unsigned long long)mddev->array_size); +	printk("raid0 : conf->hash_spacing is %llu blocks.\n", +		(unsigned long long)conf->hash_spacing); +	{ +#if __GNUC__ < 3 +		volatile +#endif +		sector_t s = mddev->array_size; +		sector_t space = conf->hash_spacing; +		int round; +		conf->preshift = 0; +		if (sizeof(sector_t) > sizeof(unsigned long)) { +			/*shift down space and s so that sector_div will work */ +			while (space > (sector_t) (~(unsigned long)0)) { +				s >>= 1; +				space >>= 1; +				s += 1; /* force round-up */ +				conf->preshift++; +			} +		} +		round = sector_div(s, (unsigned long)space) ? 
1 : 0; +		nb_zone = s + round; +	} +	printk("raid0 : nb_zone is %d.\n", nb_zone); + +	printk("raid0 : Allocating %Zd bytes for hash.\n", +				nb_zone*sizeof(struct strip_zone*)); +	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); +	if (!conf->hash_table) +		goto out_free_conf; +	size = conf->strip_zone[cur].size; + +	for (i=0; i< nb_zone; i++) { +		conf->hash_table[i] = conf->strip_zone + cur; +		while (size <= conf->hash_spacing) { +			cur++; +			size += conf->strip_zone[cur].size; +		} +		size -= conf->hash_spacing; +	} +	if (conf->preshift) { +		conf->hash_spacing >>= conf->preshift; +		/* round hash_spacing up so when we divide by it, we +		 * err on the side of too-low, which is safest +		 */ +		conf->hash_spacing++; +	} + +	/* calculate the max read-ahead size. +	 * For read-ahead of large files to be effective, we need to +	 * readahead at least twice a whole stripe. i.e. number of devices +	 * multiplied by chunk size times 2. +	 * If an individual device has an ra_pages greater than the +	 * chunk size, then we will not drive that device as hard as it +	 * wants.  We consider this a configuration error: a larger +	 * chunksize should be used in that case. +	 */ +	{ +		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; +		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) +			mddev->queue->backing_dev_info.ra_pages = 2* stripe; +	} + + +	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); +	return 0; + +out_free_conf: +	if (conf->strip_zone) +		kfree(conf->strip_zone); +	if (conf->devlist) +		kfree (conf->devlist); +	kfree(conf); +	mddev->private = NULL; +out: +	return 1; +} + +static int raid0_stop (mddev_t *mddev) +{ +	raid0_conf_t *conf = mddev_to_conf(mddev); + +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	kfree (conf->hash_table); +	conf->hash_table = NULL; +	kfree (conf->strip_zone); +	conf->strip_zone = NULL; +	kfree (conf); +	mddev->private = NULL; + +	return 0; +} + +static int raid0_make_request (request_queue_t *q, struct bio *bio) +{ +	mddev_t *mddev = q->queuedata; +	unsigned int sect_in_chunk, chunksize_bits,  chunk_size, chunk_sects; +	raid0_conf_t *conf = mddev_to_conf(mddev); +	struct strip_zone *zone; +	mdk_rdev_t *tmp_dev; +	unsigned long chunk; +	sector_t block, rsect; + +	if (bio_data_dir(bio)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); +	} + +	chunk_size = mddev->chunk_size >> 10; +	chunk_sects = mddev->chunk_size >> 9; +	chunksize_bits = ffz(~chunk_size); +	block = bio->bi_sector >> 1; +	 + +	if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { +		struct bio_pair *bp; +		/* Sanity check -- queue functions should prevent this happening */ +		if (bio->bi_vcnt != 1 || +		    bio->bi_idx != 0) +			goto bad_map; +		/* This is a one page bio that upper layers +		 * refuse to split for us, so we need to split it. 
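A worked instance of the chunk-boundary test a few lines above, assuming 64 KB chunks (so chunk_sects = 128):

		/* an 8-sector (4 KB) bio starting at sector 124:
		 *	offset in chunk = 124 & 127 = 124
		 *	124 + 8 = 132 > 128  -> straddles two chunks
		 * bio_split() below cuts it at 128 - 124 = 4 sectors, and each
		 * half is remapped by the recursive raid0_make_request() call.
		 */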
+		 */ +		bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); +		if (raid0_make_request(q, &bp->bio1)) +			generic_make_request(&bp->bio1); +		if (raid0_make_request(q, &bp->bio2)) +			generic_make_request(&bp->bio2); + +		bio_pair_release(bp); +		return 0; +	} +  + +	{ +#if __GNUC__ < 3 +		volatile +#endif +		sector_t x = block >> conf->preshift; +		sector_div(x, (unsigned long)conf->hash_spacing); +		zone = conf->hash_table[x]; +	} +  +	while (block >= (zone->zone_offset + zone->size))  +		zone++; +     +	sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); + + +	{ +		sector_t x =  (block - zone->zone_offset) >> chunksize_bits; + +		sector_div(x, zone->nb_dev); +		chunk = x; +		BUG_ON(x != (sector_t)chunk); + +		x = block >> chunksize_bits; +		tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; +	} +	rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) +		+ sect_in_chunk; +  +	bio->bi_bdev = tmp_dev->bdev; +	bio->bi_sector = rsect + tmp_dev->data_offset; + +	/* +	 * Let the main block layer submit the IO and resolve recursion: +	 */ +	return 1; + +bad_map: +	printk("raid0_make_request bug: can't convert block across chunks" +		" or bigger than %dk %llu %d\n", chunk_size,  +		(unsigned long long)bio->bi_sector, bio->bi_size >> 10); + +	bio_io_error(bio, bio->bi_size); +	return 0; +} +			    +static void raid0_status (struct seq_file *seq, mddev_t *mddev) +{ +#undef MD_DEBUG +#ifdef MD_DEBUG +	int j, k, h; +	char b[BDEVNAME_SIZE]; +	raid0_conf_t *conf = mddev_to_conf(mddev); +   +	h = 0; +	for (j = 0; j < conf->nr_strip_zones; j++) { +		seq_printf(seq, "      z%d", j); +		if (conf->hash_table[h] == conf->strip_zone+j) +			seq_printf("(h%d)", h++); +		seq_printf(seq, "=["); +		for (k = 0; k < conf->strip_zone[j].nb_dev; k++) +			seq_printf (seq, "%s/", bdevname( +				conf->strip_zone[j].dev[k]->bdev,b)); + +		seq_printf (seq, "] zo=%d do=%d s=%d\n", +				conf->strip_zone[j].zone_offset, +				conf->strip_zone[j].dev_offset, +				conf->strip_zone[j].size); +	} +#endif +	seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); +	return; +} + +static mdk_personality_t raid0_personality= +{ +	.name		= "raid0", +	.owner		= THIS_MODULE, +	.make_request	= raid0_make_request, +	.run		= raid0_run, +	.stop		= raid0_stop, +	.status		= raid0_status, +}; + +static int __init raid0_init (void) +{ +	return register_md_personality (RAID0, &raid0_personality); +} + +static void raid0_exit (void) +{ +	unregister_md_personality (RAID0); +} + +module_init(raid0_init); +module_exit(raid0_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-2"); /* RAID0 */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 00000000000..a389394b52f --- /dev/null +++ b/drivers/md/raid1.c @@ -0,0 +1,1449 @@ +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 + * + * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> + * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/raid/raid1.h> + +/* + * Number of guaranteed r1bios in case of extreme VM load: + */ +#define	NR_RAID1_BIOS 256 + +static mdk_personality_t raid1_personality; + +static void unplug_slaves(mddev_t *mddev); + + +static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ +	struct pool_info *pi = data; +	r1bio_t *r1_bio; +	int size = offsetof(r1bio_t, bios[pi->raid_disks]); + +	/* allocate a r1bio with room for raid_disks entries in the bios array */ +	r1_bio = kmalloc(size, gfp_flags); +	if (r1_bio) +		memset(r1_bio, 0, size); +	else +		unplug_slaves(pi->mddev); + +	return r1_bio; +} + +static void r1bio_pool_free(void *r1_bio, void *data) +{ +	kfree(r1_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ +	struct pool_info *pi = data; +	struct page *page; +	r1bio_t *r1_bio; +	struct bio *bio; +	int i, j; + +	r1_bio = r1bio_pool_alloc(gfp_flags, pi); +	if (!r1_bio) { +		unplug_slaves(pi->mddev); +		return NULL; +	} + +	/* +	 * Allocate bios : 1 for reading, n-1 for writing +	 */ +	for (j = pi->raid_disks ; j-- ; ) { +		bio = bio_alloc(gfp_flags, RESYNC_PAGES); +		if (!bio) +			goto out_free_bio; +		r1_bio->bios[j] = bio; +	} +	/* +	 * Allocate RESYNC_PAGES data pages and attach them to +	 * the first bio; +	 */ +	bio = r1_bio->bios[0]; +	for (i = 0; i < RESYNC_PAGES; i++) { +		page = alloc_page(gfp_flags); +		if (unlikely(!page)) +			goto out_free_pages; + +		bio->bi_io_vec[i].bv_page = page; +	} + +	r1_bio->master_bio = NULL; + +	return r1_bio; + +out_free_pages: +	for ( ; i > 0 ; i--) +		__free_page(bio->bi_io_vec[i-1].bv_page); +out_free_bio: +	while ( ++j < pi->raid_disks ) +		bio_put(r1_bio->bios[j]); +	r1bio_pool_free(r1_bio, data); +	return NULL; +} + +static void r1buf_pool_free(void *__r1_bio, void *data) +{ +	struct pool_info *pi = data; +	int i; +	r1bio_t *r1bio = __r1_bio; +	struct bio *bio = r1bio->bios[0]; + +	for (i = 0; i < RESYNC_PAGES; i++) { +		__free_page(bio->bi_io_vec[i].bv_page); +		bio->bi_io_vec[i].bv_page = NULL; +	} +	for (i=0 ; i < pi->raid_disks; i++) +		bio_put(r1bio->bios[i]); + +	r1bio_pool_free(r1bio, data); +} + +static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) +{ +	int i; + +	for (i = 0; i < conf->raid_disks; i++) { +		struct bio **bio = r1_bio->bios + i; +		if (*bio) +			bio_put(*bio); +		*bio = NULL; +	} +} + +static inline void free_r1bio(r1bio_t *r1_bio) +{ +	unsigned long flags; + +	conf_t *conf = mddev_to_conf(r1_bio->mddev); + +	/* +	 * Wake up any possible resync thread that waits for the device +	 * to go idle. 
+	 */ +	spin_lock_irqsave(&conf->resync_lock, flags); +	if (!--conf->nr_pending) { +		wake_up(&conf->wait_idle); +		wake_up(&conf->wait_resume); +	} +	spin_unlock_irqrestore(&conf->resync_lock, flags); + +	put_all_bios(conf, r1_bio); +	mempool_free(r1_bio, conf->r1bio_pool); +} + +static inline void put_buf(r1bio_t *r1_bio) +{ +	conf_t *conf = mddev_to_conf(r1_bio->mddev); +	unsigned long flags; + +	mempool_free(r1_bio, conf->r1buf_pool); + +	spin_lock_irqsave(&conf->resync_lock, flags); +	if (!conf->barrier) +		BUG(); +	--conf->barrier; +	wake_up(&conf->wait_resume); +	wake_up(&conf->wait_idle); + +	if (!--conf->nr_pending) { +		wake_up(&conf->wait_idle); +		wake_up(&conf->wait_resume); +	} +	spin_unlock_irqrestore(&conf->resync_lock, flags); +} + +static void reschedule_retry(r1bio_t *r1_bio) +{ +	unsigned long flags; +	mddev_t *mddev = r1_bio->mddev; +	conf_t *conf = mddev_to_conf(mddev); + +	spin_lock_irqsave(&conf->device_lock, flags); +	list_add(&r1_bio->retry_list, &conf->retry_list); +	spin_unlock_irqrestore(&conf->device_lock, flags); + +	md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(r1bio_t *r1_bio) +{ +	struct bio *bio = r1_bio->master_bio; + +	bio_endio(bio, bio->bi_size, +		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); +	free_r1bio(r1_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int disk, r1bio_t *r1_bio) +{ +	conf_t *conf = mddev_to_conf(r1_bio->mddev); + +	conf->mirrors[disk].head_position = +		r1_bio->sector + (r1_bio->sectors); +} + +static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	int mirror; +	conf_t *conf = mddev_to_conf(r1_bio->mddev); + +	if (bio->bi_size) +		return 1; +	 +	mirror = r1_bio->read_disk; +	/* +	 * this branch is our 'one mirror IO has finished' event handler: +	 */ +	if (!uptodate) +		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); +	else +		/* +		 * Set R1BIO_Uptodate in our master bio, so that +		 * we will return a good error code for to the higher +		 * levels even if IO on some other mirrored buffer fails. +		 * +		 * The 'master' represents the composite IO operation to +		 * user-side. So if something waits for IO, then it will +		 * wait for the 'master' bio. 
+		 */ +		set_bit(R1BIO_Uptodate, &r1_bio->state); + +	update_head_pos(mirror, r1_bio); + +	/* +	 * we have only one bio on the read side +	 */ +	if (uptodate) +		raid_end_bio_io(r1_bio); +	else { +		/* +		 * oops, read error: +		 */ +		char b[BDEVNAME_SIZE]; +		if (printk_ratelimit()) +			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", +			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); +		reschedule_retry(r1_bio); +	} + +	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); +	return 0; +} + +static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	int mirror; +	conf_t *conf = mddev_to_conf(r1_bio->mddev); + +	if (bio->bi_size) +		return 1; + +	for (mirror = 0; mirror < conf->raid_disks; mirror++) +		if (r1_bio->bios[mirror] == bio) +			break; + +	/* +	 * this branch is our 'one mirror IO has finished' event handler: +	 */ +	if (!uptodate) +		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); +	else +		/* +		 * Set R1BIO_Uptodate in our master bio, so that +		 * we will return a good error code for to the higher +		 * levels even if IO on some other mirrored buffer fails. +		 * +		 * The 'master' represents the composite IO operation to +		 * user-side. So if something waits for IO, then it will +		 * wait for the 'master' bio. +		 */ +		set_bit(R1BIO_Uptodate, &r1_bio->state); + +	update_head_pos(mirror, r1_bio); + +	/* +	 * +	 * Let's see if all mirrored write operations have finished +	 * already. +	 */ +	if (atomic_dec_and_test(&r1_bio->remaining)) { +		md_write_end(r1_bio->mddev); +		raid_end_bio_io(r1_bio); +	} + +	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); +	return 0; +} + + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(conf_t *conf, r1bio_t *r1_bio) +{ +	const unsigned long this_sector = r1_bio->sector; +	int new_disk = conf->last_used, disk = new_disk; +	const int sectors = r1_bio->sectors; +	sector_t new_distance, current_distance; +	mdk_rdev_t *new_rdev, *rdev; + +	rcu_read_lock(); +	/* +	 * Check if it if we can balance. We can balance on the whole +	 * device if no resync is going on, or below the resync window. +	 * We take the first readable disk when above the resync window. 
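	 * For illustration, with head positions of (say) 1000 and 9000,
+	 * a read at sector 1100 gives distances of 100 and 7900, so the
+	 * first mirror wins; a completely idle mirror (nr_pending == 0)
+	 * is taken immediately, without comparing distances.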
+	 */ + retry: +	if (conf->mddev->recovery_cp < MaxSector && +	    (this_sector + sectors >= conf->next_resync)) { +		/* Choose the first operation device, for consistancy */ +		new_disk = 0; + +		while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || +		       !new_rdev->in_sync) { +			new_disk++; +			if (new_disk == conf->raid_disks) { +				new_disk = -1; +				break; +			} +		} +		goto rb_out; +	} + + +	/* make sure the disk is operational */ +	while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || +	       !new_rdev->in_sync) { +		if (new_disk <= 0) +			new_disk = conf->raid_disks; +		new_disk--; +		if (new_disk == disk) { +			new_disk = -1; +			goto rb_out; +		} +	} +	disk = new_disk; +	/* now disk == new_disk == starting point for search */ + +	/* +	 * Don't change to another disk for sequential reads: +	 */ +	if (conf->next_seq_sect == this_sector) +		goto rb_out; +	if (this_sector == conf->mirrors[new_disk].head_position) +		goto rb_out; + +	current_distance = abs(this_sector - conf->mirrors[disk].head_position); + +	/* Find the disk whose head is closest */ + +	do { +		if (disk <= 0) +			disk = conf->raid_disks; +		disk--; + +		if ((rdev=conf->mirrors[disk].rdev) == NULL || +		    !rdev->in_sync) +			continue; + +		if (!atomic_read(&rdev->nr_pending)) { +			new_disk = disk; +			new_rdev = rdev; +			break; +		} +		new_distance = abs(this_sector - conf->mirrors[disk].head_position); +		if (new_distance < current_distance) { +			current_distance = new_distance; +			new_disk = disk; +			new_rdev = rdev; +		} +	} while (disk != conf->last_used); + +rb_out: + + +	if (new_disk >= 0) { +		conf->next_seq_sect = this_sector + sectors; +		conf->last_used = new_disk; +		atomic_inc(&new_rdev->nr_pending); +		if (!new_rdev->in_sync) { +			/* cannot risk returning a device that failed +			 * before we inc'ed nr_pending +			 */ +			atomic_dec(&new_rdev->nr_pending); +			goto retry; +		} +	} +	rcu_read_unlock(); + +	return new_disk; +} + +static void unplug_slaves(mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks; i++) { +		mdk_rdev_t *rdev = conf->mirrors[i].rdev; +		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { +			request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); + +			if (r_queue->unplug_fn) +				r_queue->unplug_fn(r_queue); + +			rdev_dec_pending(rdev, mddev); +			rcu_read_lock(); +		} +	} +	rcu_read_unlock(); +} + +static void raid1_unplug(request_queue_t *q) +{ +	unplug_slaves(q->queuedata); +} + +static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, +			     sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	conf_t *conf = mddev_to_conf(mddev); +	int i, ret = 0; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		mdk_rdev_t *rdev = conf->mirrors[i].rdev; +		if (rdev && !rdev->faulty) { +			struct block_device *bdev = rdev->bdev; +			request_queue_t *r_queue = bdev_get_queue(bdev); + +			if (!r_queue->issue_flush_fn) +				ret = -EOPNOTSUPP; +			else { +				atomic_inc(&rdev->nr_pending); +				rcu_read_unlock(); +				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, +							      error_sector); +				rdev_dec_pending(rdev, mddev); +				rcu_read_lock(); +			} +		} +	} +	rcu_read_unlock(); +	return ret; +} + +/* + * Throttle resync depth, so that we can both get proper overlapping of + * requests, but are still able to handle normal requests quickly. 
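 * With RESYNC_DEPTH of 32 and 64K resync blocks this allows roughly
+ * 2MB of resync IO in flight, matching the RESYNC_WINDOW defined above.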
+ */ +#define RESYNC_DEPTH 32 + +static void device_barrier(conf_t *conf, sector_t sect) +{ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), +			    conf->resync_lock, unplug_slaves(conf->mddev)); +	 +	if (!conf->barrier++) { +		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, +				    conf->resync_lock, unplug_slaves(conf->mddev)); +		if (conf->nr_pending) +			BUG(); +	} +	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, +			    conf->resync_lock, unplug_slaves(conf->mddev)); +	conf->next_resync = sect; +	spin_unlock_irq(&conf->resync_lock); +} + +static int make_request(request_queue_t *q, struct bio * bio) +{ +	mddev_t *mddev = q->queuedata; +	conf_t *conf = mddev_to_conf(mddev); +	mirror_info_t *mirror; +	r1bio_t *r1_bio; +	struct bio *read_bio; +	int i, disks; +	mdk_rdev_t *rdev; + +	/* +	 * Register the new request and wait if the reconstruction +	 * thread has put up a bar for new requests. +	 * Continue immediately if no resync is active currently. +	 */ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); +	conf->nr_pending++; +	spin_unlock_irq(&conf->resync_lock); + +	if (bio_data_dir(bio)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); +	} + +	/* +	 * make_request() can abort the operation when READA is being +	 * used and no empty request is available. +	 * +	 */ +	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + +	r1_bio->master_bio = bio; +	r1_bio->sectors = bio->bi_size >> 9; + +	r1_bio->mddev = mddev; +	r1_bio->sector = bio->bi_sector; + +	r1_bio->state = 0; + +	if (bio_data_dir(bio) == READ) { +		/* +		 * read balancing logic: +		 */ +		int rdisk = read_balance(conf, r1_bio); + +		if (rdisk < 0) { +			/* couldn't find anywhere to read from */ +			raid_end_bio_io(r1_bio); +			return 0; +		} +		mirror = conf->mirrors + rdisk; + +		r1_bio->read_disk = rdisk; + +		read_bio = bio_clone(bio, GFP_NOIO); + +		r1_bio->bios[rdisk] = read_bio; + +		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; +		read_bio->bi_bdev = mirror->rdev->bdev; +		read_bio->bi_end_io = raid1_end_read_request; +		read_bio->bi_rw = READ; +		read_bio->bi_private = r1_bio; + +		generic_make_request(read_bio); +		return 0; +	} + +	/* +	 * WRITE: +	 */ +	/* first select target devices under spinlock and +	 * inc refcount on their rdev.  
Record them by setting +	 * bios[x] to bio +	 */ +	disks = conf->raid_disks; +	rcu_read_lock(); +	for (i = 0;  i < disks; i++) { +		if ((rdev=conf->mirrors[i].rdev) != NULL && +		    !rdev->faulty) { +			atomic_inc(&rdev->nr_pending); +			if (rdev->faulty) { +				atomic_dec(&rdev->nr_pending); +				r1_bio->bios[i] = NULL; +			} else +				r1_bio->bios[i] = bio; +		} else +			r1_bio->bios[i] = NULL; +	} +	rcu_read_unlock(); + +	atomic_set(&r1_bio->remaining, 1); +	md_write_start(mddev); +	for (i = 0; i < disks; i++) { +		struct bio *mbio; +		if (!r1_bio->bios[i]) +			continue; + +		mbio = bio_clone(bio, GFP_NOIO); +		r1_bio->bios[i] = mbio; + +		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset; +		mbio->bi_bdev = conf->mirrors[i].rdev->bdev; +		mbio->bi_end_io	= raid1_end_write_request; +		mbio->bi_rw = WRITE; +		mbio->bi_private = r1_bio; + +		atomic_inc(&r1_bio->remaining); +		generic_make_request(mbio); +	} + +	if (atomic_dec_and_test(&r1_bio->remaining)) { +		md_write_end(mddev); +		raid_end_bio_io(r1_bio); +	} + +	return 0; +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i; + +	seq_printf(seq, " [%d/%d] [", conf->raid_disks, +						conf->working_disks); +	for (i = 0; i < conf->raid_disks; i++) +		seq_printf(seq, "%s", +			      conf->mirrors[i].rdev && +			      conf->mirrors[i].rdev->in_sync ? "U" : "_"); +	seq_printf(seq, "]"); +} + + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	char b[BDEVNAME_SIZE]; +	conf_t *conf = mddev_to_conf(mddev); + +	/* +	 * If it is not operational, then we have already marked it as dead +	 * else if it is the last working disks, ignore the error, let the +	 * next level up know. +	 * else mark the drive as failed +	 */ +	if (rdev->in_sync +	    && conf->working_disks == 1) +		/* +		 * Don't fail the drive, act as though we were just a +		 * normal single drive +		 */ +		return; +	if (rdev->in_sync) { +		mddev->degraded++; +		conf->working_disks--; +		/* +		 * if recovery is running, make sure it aborts. +		 */ +		set_bit(MD_RECOVERY_ERR, &mddev->recovery); +	} +	rdev->in_sync = 0; +	rdev->faulty = 1; +	mddev->sb_dirty = 1; +	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. 
\n" +		"	Operation continuing on %d devices\n", +		bdevname(rdev->bdev,b), conf->working_disks); +} + +static void print_conf(conf_t *conf) +{ +	int i; +	mirror_info_t *tmp; + +	printk("RAID1 conf printout:\n"); +	if (!conf) { +		printk("(!conf)\n"); +		return; +	} +	printk(" --- wd:%d rd:%d\n", conf->working_disks, +		conf->raid_disks); + +	for (i = 0; i < conf->raid_disks; i++) { +		char b[BDEVNAME_SIZE]; +		tmp = conf->mirrors + i; +		if (tmp->rdev) +			printk(" disk %d, wo:%d, o:%d, dev:%s\n", +				i, !tmp->rdev->in_sync, !tmp->rdev->faulty, +				bdevname(tmp->rdev->bdev,b)); +	} +} + +static void close_sync(conf_t *conf) +{ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_resume, !conf->barrier, +			    conf->resync_lock, 	unplug_slaves(conf->mddev)); +	spin_unlock_irq(&conf->resync_lock); + +	if (conf->barrier) BUG(); +	if (waitqueue_active(&conf->wait_idle)) BUG(); + +	mempool_destroy(conf->r1buf_pool); +	conf->r1buf_pool = NULL; +} + +static int raid1_spare_active(mddev_t *mddev) +{ +	int i; +	conf_t *conf = mddev->private; +	mirror_info_t *tmp; + +	/* +	 * Find all failed disks within the RAID1 configuration  +	 * and mark them readable +	 */ +	for (i = 0; i < conf->raid_disks; i++) { +		tmp = conf->mirrors + i; +		if (tmp->rdev  +		    && !tmp->rdev->faulty +		    && !tmp->rdev->in_sync) { +			conf->working_disks++; +			mddev->degraded--; +			tmp->rdev->in_sync = 1; +		} +	} + +	print_conf(conf); +	return 0; +} + + +static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	conf_t *conf = mddev->private; +	int found = 0; +	int mirror; +	mirror_info_t *p; + +	for (mirror=0; mirror < mddev->raid_disks; mirror++) +		if ( !(p=conf->mirrors+mirror)->rdev) { + +			blk_queue_stack_limits(mddev->queue, +					       rdev->bdev->bd_disk->queue); +			/* as we don't honour merge_bvec_fn, we must never risk +			 * violating it, so limit ->max_sector to one PAGE, as +			 * a one page request is never in violation. +			 */ +			if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +			    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +			p->head_position = 0; +			rdev->raid_disk = mirror; +			found = 1; +			p->rdev = rdev; +			break; +		} + +	print_conf(conf); +	return found; +} + +static int raid1_remove_disk(mddev_t *mddev, int number) +{ +	conf_t *conf = mddev->private; +	int err = 0; +	mdk_rdev_t *rdev; +	mirror_info_t *p = conf->mirrors+ number; + +	print_conf(conf); +	rdev = p->rdev; +	if (rdev) { +		if (rdev->in_sync || +		    atomic_read(&rdev->nr_pending)) { +			err = -EBUSY; +			goto abort; +		} +		p->rdev = NULL; +		synchronize_kernel(); +		if (atomic_read(&rdev->nr_pending)) { +			/* lost the race, try later */ +			err = -EBUSY; +			p->rdev = rdev; +		} +	} +abort: + +	print_conf(conf); +	return err; +} + + +static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	conf_t *conf = mddev_to_conf(r1_bio->mddev); + +	if (bio->bi_size) +		return 1; + +	if (r1_bio->bios[r1_bio->read_disk] != bio) +		BUG(); +	update_head_pos(r1_bio->read_disk, r1_bio); +	/* +	 * we have read a block, now it needs to be re-written, +	 * or re-read if the read failed. 
+	 * We don't do much here, just schedule handling by raid1d +	 */ +	if (!uptodate) +		md_error(r1_bio->mddev, +			 conf->mirrors[r1_bio->read_disk].rdev); +	else +		set_bit(R1BIO_Uptodate, &r1_bio->state); +	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); +	reschedule_retry(r1_bio); +	return 0; +} + +static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); +	mddev_t *mddev = r1_bio->mddev; +	conf_t *conf = mddev_to_conf(mddev); +	int i; +	int mirror=0; + +	if (bio->bi_size) +		return 1; + +	for (i = 0; i < conf->raid_disks; i++) +		if (r1_bio->bios[i] == bio) { +			mirror = i; +			break; +		} +	if (!uptodate) +		md_error(mddev, conf->mirrors[mirror].rdev); +	update_head_pos(mirror, r1_bio); + +	if (atomic_dec_and_test(&r1_bio->remaining)) { +		md_done_sync(mddev, r1_bio->sectors, uptodate); +		put_buf(r1_bio); +	} +	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); +	return 0; +} + +static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i; +	int disks = conf->raid_disks; +	struct bio *bio, *wbio; + +	bio = r1_bio->bios[r1_bio->read_disk]; + +	/* +	 * schedule writes +	 */ +	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { +		/* +		 * There is no point trying a read-for-reconstruct as +		 * reconstruct is about to be aborted +		 */ +		char b[BDEVNAME_SIZE]; +		printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" +			" for block %llu\n", +			bdevname(bio->bi_bdev,b),  +			(unsigned long long)r1_bio->sector); +		md_done_sync(mddev, r1_bio->sectors, 0); +		put_buf(r1_bio); +		return; +	} + +	atomic_set(&r1_bio->remaining, 1); +	for (i = 0; i < disks ; i++) { +		wbio = r1_bio->bios[i]; +		if (wbio->bi_end_io != end_sync_write) +			continue; + +		atomic_inc(&conf->mirrors[i].rdev->nr_pending); +		atomic_inc(&r1_bio->remaining); +		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); +		generic_make_request(wbio); +	} + +	if (atomic_dec_and_test(&r1_bio->remaining)) { +		md_done_sync(mddev, r1_bio->sectors, 1); +		put_buf(r1_bio); +	} +} + +/* + * This is a kernel thread which: + * + *	1.	Retries failed read operations on working mirrors. + *	2.	Updates the raid superblock when problems encounter. + *	3.	Performs writes following reads for array syncronising. 
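 *
+ * Work reaches this thread via reschedule_retry(), which queues an
+ * r1bio on conf->retry_list and wakes the thread with
+ * md_wakeup_thread().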
+ */ + +static void raid1d(mddev_t *mddev) +{ +	r1bio_t *r1_bio; +	struct bio *bio; +	unsigned long flags; +	conf_t *conf = mddev_to_conf(mddev); +	struct list_head *head = &conf->retry_list; +	int unplug=0; +	mdk_rdev_t *rdev; + +	md_check_recovery(mddev); +	md_handle_safemode(mddev); +	 +	for (;;) { +		char b[BDEVNAME_SIZE]; +		spin_lock_irqsave(&conf->device_lock, flags); +		if (list_empty(head)) +			break; +		r1_bio = list_entry(head->prev, r1bio_t, retry_list); +		list_del(head->prev); +		spin_unlock_irqrestore(&conf->device_lock, flags); + +		mddev = r1_bio->mddev; +		conf = mddev_to_conf(mddev); +		if (test_bit(R1BIO_IsSync, &r1_bio->state)) { +			sync_request_write(mddev, r1_bio); +			unplug = 1; +		} else { +			int disk; +			bio = r1_bio->bios[r1_bio->read_disk]; +			if ((disk=read_balance(conf, r1_bio)) == -1) { +				printk(KERN_ALERT "raid1: %s: unrecoverable I/O" +				       " read error for block %llu\n", +				       bdevname(bio->bi_bdev,b), +				       (unsigned long long)r1_bio->sector); +				raid_end_bio_io(r1_bio); +			} else { +				r1_bio->bios[r1_bio->read_disk] = NULL; +				r1_bio->read_disk = disk; +				bio_put(bio); +				bio = bio_clone(r1_bio->master_bio, GFP_NOIO); +				r1_bio->bios[r1_bio->read_disk] = bio; +				rdev = conf->mirrors[disk].rdev; +				if (printk_ratelimit()) +					printk(KERN_ERR "raid1: %s: redirecting sector %llu to" +					       " another mirror\n", +					       bdevname(rdev->bdev,b), +					       (unsigned long long)r1_bio->sector); +				bio->bi_sector = r1_bio->sector + rdev->data_offset; +				bio->bi_bdev = rdev->bdev; +				bio->bi_end_io = raid1_end_read_request; +				bio->bi_rw = READ; +				bio->bi_private = r1_bio; +				unplug = 1; +				generic_make_request(bio); +			} +		} +	} +	spin_unlock_irqrestore(&conf->device_lock, flags); +	if (unplug) +		unplug_slaves(mddev); +} + + +static int init_resync(conf_t *conf) +{ +	int buffs; + +	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; +	if (conf->r1buf_pool) +		BUG(); +	conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, +					  conf->poolinfo); +	if (!conf->r1buf_pool) +		return -ENOMEM; +	conf->next_resync = 0; +	return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + */ + +static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +{ +	conf_t *conf = mddev_to_conf(mddev); +	mirror_info_t *mirror; +	r1bio_t *r1_bio; +	struct bio *bio; +	sector_t max_sector, nr_sectors; +	int disk; +	int i; +	int write_targets = 0; + +	if (!conf->r1buf_pool) +		if (init_resync(conf)) +			return -ENOMEM; + +	max_sector = mddev->size << 1; +	if (sector_nr >= max_sector) { +		close_sync(conf); +		return 0; +	} + +	/* +	 * If there is non-resync activity waiting for us then +	 * put in a delay to throttle resync. +	 */ +	if (!go_faster && waitqueue_active(&conf->wait_resume)) +		msleep_interruptible(1000); +	device_barrier(conf, sector_nr + RESYNC_SECTORS); + +	/* +	 * If reconstructing, and >1 working disc, +	 * could dedicate one to rebuild and others to +	 * service read requests .. 
+	 */ +	disk = conf->last_used; +	/* make sure disk is operational */ + +	while (conf->mirrors[disk].rdev == NULL || +	       !conf->mirrors[disk].rdev->in_sync) { +		if (disk <= 0) +			disk = conf->raid_disks; +		disk--; +		if (disk == conf->last_used) +			break; +	} +	conf->last_used = disk; +	atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + + +	mirror = conf->mirrors + disk; + +	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + +	spin_lock_irq(&conf->resync_lock); +	conf->nr_pending++; +	spin_unlock_irq(&conf->resync_lock); + +	r1_bio->mddev = mddev; +	r1_bio->sector = sector_nr; +	set_bit(R1BIO_IsSync, &r1_bio->state); +	r1_bio->read_disk = disk; + +	for (i=0; i < conf->raid_disks; i++) { +		bio = r1_bio->bios[i]; + +		/* take from bio_init */ +		bio->bi_next = NULL; +		bio->bi_flags |= 1 << BIO_UPTODATE; +		bio->bi_rw = 0; +		bio->bi_vcnt = 0; +		bio->bi_idx = 0; +		bio->bi_phys_segments = 0; +		bio->bi_hw_segments = 0; +		bio->bi_size = 0; +		bio->bi_end_io = NULL; +		bio->bi_private = NULL; + +		if (i == disk) { +			bio->bi_rw = READ; +			bio->bi_end_io = end_sync_read; +		} else if (conf->mirrors[i].rdev && +			   !conf->mirrors[i].rdev->faulty && +			   (!conf->mirrors[i].rdev->in_sync || +			    sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) { +			bio->bi_rw = WRITE; +			bio->bi_end_io = end_sync_write; +			write_targets ++; +		} else +			continue; +		bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; +		bio->bi_bdev = conf->mirrors[i].rdev->bdev; +		bio->bi_private = r1_bio; +	} +	if (write_targets == 0) { +		/* There is nowhere to write, so all non-sync +		 * drives must be failed - so we are finished +		 */ +		int rv = max_sector - sector_nr; +		md_done_sync(mddev, rv, 1); +		put_buf(r1_bio); +		rdev_dec_pending(conf->mirrors[disk].rdev, mddev); +		return rv; +	} + +	nr_sectors = 0; +	do { +		struct page *page; +		int len = PAGE_SIZE; +		if (sector_nr + (len>>9) > max_sector) +			len = (max_sector - sector_nr) << 9; +		if (len == 0) +			break; +		for (i=0 ; i < conf->raid_disks; i++) { +			bio = r1_bio->bios[i]; +			if (bio->bi_end_io) { +				page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; +				if (bio_add_page(bio, page, len, 0) == 0) { +					/* stop here */ +					r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; +					while (i > 0) { +						i--; +						bio = r1_bio->bios[i]; +						if (bio->bi_end_io==NULL) continue; +						/* remove last page from this bio */ +						bio->bi_vcnt--; +						bio->bi_size -= len; +						bio->bi_flags &= ~(1<< BIO_SEG_VALID); +					} +					goto bio_full; +				} +			} +		} +		nr_sectors += len>>9; +		sector_nr += len>>9; +	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); + bio_full: +	bio = r1_bio->bios[disk]; +	r1_bio->sectors = nr_sectors; + +	md_sync_acct(mirror->rdev->bdev, nr_sectors); + +	generic_make_request(bio); + +	return nr_sectors; +} + +static int run(mddev_t *mddev) +{ +	conf_t *conf; +	int i, j, disk_idx; +	mirror_info_t *disk; +	mdk_rdev_t *rdev; +	struct list_head *tmp; + +	if (mddev->level != 1) { +		printk("raid1: %s: raid level not set to mirroring (%d)\n", +		       mdname(mddev), mddev->level); +		goto out; +	} +	/* +	 * copy the already verified devices into our private RAID1 +	 * bookkeeping area. 
[whatever we allocate in run(), +	 * should be freed in stop()] +	 */ +	conf = kmalloc(sizeof(conf_t), GFP_KERNEL); +	mddev->private = conf; +	if (!conf) +		goto out_no_mem; + +	memset(conf, 0, sizeof(*conf)); +	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,  +				 GFP_KERNEL); +	if (!conf->mirrors) +		goto out_no_mem; + +	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + +	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); +	if (!conf->poolinfo) +		goto out_no_mem; +	conf->poolinfo->mddev = mddev; +	conf->poolinfo->raid_disks = mddev->raid_disks; +	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, +					  r1bio_pool_free, +					  conf->poolinfo); +	if (!conf->r1bio_pool) +		goto out_no_mem; + +	mddev->queue->unplug_fn = raid1_unplug; + +	mddev->queue->issue_flush_fn = raid1_issue_flush; + +	ITERATE_RDEV(mddev, rdev, tmp) { +		disk_idx = rdev->raid_disk; +		if (disk_idx >= mddev->raid_disks +		    || disk_idx < 0) +			continue; +		disk = conf->mirrors + disk_idx; + +		disk->rdev = rdev; + +		blk_queue_stack_limits(mddev->queue, +				       rdev->bdev->bd_disk->queue); +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, so limit ->max_sector to one PAGE, as +		 * a one page request is never in violation. +		 */ +		if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +		    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + +		disk->head_position = 0; +		if (!rdev->faulty && rdev->in_sync) +			conf->working_disks++; +	} +	conf->raid_disks = mddev->raid_disks; +	conf->mddev = mddev; +	spin_lock_init(&conf->device_lock); +	INIT_LIST_HEAD(&conf->retry_list); +	if (conf->working_disks == 1) +		mddev->recovery_cp = MaxSector; + +	spin_lock_init(&conf->resync_lock); +	init_waitqueue_head(&conf->wait_idle); +	init_waitqueue_head(&conf->wait_resume); + +	if (!conf->working_disks) { +		printk(KERN_ERR "raid1: no operational mirrors for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} + +	mddev->degraded = 0; +	for (i = 0; i < conf->raid_disks; i++) { + +		disk = conf->mirrors + i; + +		if (!disk->rdev) { +			disk->head_position = 0; +			mddev->degraded++; +		} +	} + +	/* +	 * find the first working one and use it as a starting point +	 * to read balancing. 
+	 */ +	for (j = 0; j < conf->raid_disks && +		     (!conf->mirrors[j].rdev || +		      !conf->mirrors[j].rdev->in_sync) ; j++) +		/* nothing */; +	conf->last_used = j; + + + +	{ +		mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); +		if (!mddev->thread) { +			printk(KERN_ERR  +				"raid1: couldn't allocate thread for %s\n",  +				mdname(mddev)); +			goto out_free_conf; +		} +	} +	printk(KERN_INFO  +		"raid1: raid set %s active with %d out of %d mirrors\n", +		mdname(mddev), mddev->raid_disks - mddev->degraded,  +		mddev->raid_disks); +	/* +	 * Ok, everything is just fine now +	 */ +	mddev->array_size = mddev->size; + +	return 0; + +out_no_mem: +	printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", +	       mdname(mddev)); + +out_free_conf: +	if (conf) { +		if (conf->r1bio_pool) +			mempool_destroy(conf->r1bio_pool); +		if (conf->mirrors) +			kfree(conf->mirrors); +		if (conf->poolinfo) +			kfree(conf->poolinfo); +		kfree(conf); +		mddev->private = NULL; +	} +out: +	return -EIO; +} + +static int stop(mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); + +	md_unregister_thread(mddev->thread); +	mddev->thread = NULL; +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	if (conf->r1bio_pool) +		mempool_destroy(conf->r1bio_pool); +	if (conf->mirrors) +		kfree(conf->mirrors); +	if (conf->poolinfo) +		kfree(conf->poolinfo); +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + +static int raid1_resize(mddev_t *mddev, sector_t sectors) +{ +	/* no resync is happening, and there is enough space +	 * on all devices, so we can resize. +	 * We need to make sure resync covers any new space. +	 * If the array is shrinking we should possibly wait until +	 * any io in the removed space completes, but it hardly seems +	 * worth it. +	 */ +	mddev->array_size = sectors>>1; +	set_capacity(mddev->gendisk, mddev->array_size << 1); +	mddev->changed = 1; +	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { +		mddev->recovery_cp = mddev->size << 1; +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	} +	mddev->size = mddev->array_size; +	return 0; +} + +static int raid1_reshape(mddev_t *mddev, int raid_disks) +{ +	/* We need to: +	 * 1/ resize the r1bio_pool +	 * 2/ resize conf->mirrors +	 * +	 * We allocate a new r1bio_pool if we can. +	 * Then raise a device barrier and wait until all IO stops. +	 * Then resize conf->mirrors and swap in the new r1bio pool. 
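	 * Newly added slots start out empty, so mddev->degraded grows by
+	 * the difference in raid_disks and a recovery pass is requested
+	 * once the swap is complete.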
+	 */ +	mempool_t *newpool, *oldpool; +	struct pool_info *newpoolinfo; +	mirror_info_t *newmirrors; +	conf_t *conf = mddev_to_conf(mddev); + +	int d; + +	for (d= raid_disks; d < conf->raid_disks; d++) +		if (conf->mirrors[d].rdev) +			return -EBUSY; + +	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); +	if (!newpoolinfo) +		return -ENOMEM; +	newpoolinfo->mddev = mddev; +	newpoolinfo->raid_disks = raid_disks; + +	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, +				 r1bio_pool_free, newpoolinfo); +	if (!newpool) { +		kfree(newpoolinfo); +		return -ENOMEM; +	} +	newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); +	if (!newmirrors) { +		kfree(newpoolinfo); +		mempool_destroy(newpool); +		return -ENOMEM; +	} +	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); + +	spin_lock_irq(&conf->resync_lock); +	conf->barrier++; +	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, +			    conf->resync_lock, unplug_slaves(mddev)); +	spin_unlock_irq(&conf->resync_lock); + +	/* ok, everything is stopped */ +	oldpool = conf->r1bio_pool; +	conf->r1bio_pool = newpool; +	for (d=0; d < raid_disks && d < conf->raid_disks; d++) +		newmirrors[d] = conf->mirrors[d]; +	kfree(conf->mirrors); +	conf->mirrors = newmirrors; +	kfree(conf->poolinfo); +	conf->poolinfo = newpoolinfo; + +	mddev->degraded += (raid_disks - conf->raid_disks); +	conf->raid_disks = mddev->raid_disks = raid_disks; + +	spin_lock_irq(&conf->resync_lock); +	conf->barrier--; +	spin_unlock_irq(&conf->resync_lock); +	wake_up(&conf->wait_resume); +	wake_up(&conf->wait_idle); + + +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	md_wakeup_thread(mddev->thread); + +	mempool_destroy(oldpool); +	return 0; +} + + +static mdk_personality_t raid1_personality = +{ +	.name		= "raid1", +	.owner		= THIS_MODULE, +	.make_request	= make_request, +	.run		= run, +	.stop		= stop, +	.status		= status, +	.error_handler	= error, +	.hot_add_disk	= raid1_add_disk, +	.hot_remove_disk= raid1_remove_disk, +	.spare_active	= raid1_spare_active, +	.sync_request	= sync_request, +	.resize		= raid1_resize, +	.reshape	= raid1_reshape, +}; + +static int __init raid_init(void) +{ +	return register_md_personality(RAID1, &raid1_personality); +} + +static void raid_exit(void) +{ +	unregister_md_personality(RAID1); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-3"); /* RAID1 */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 00000000000..b100bfe4fdc --- /dev/null +++ b/drivers/md/raid10.c @@ -0,0 +1,1787 @@ +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c.  See raid1.c for futher copyright information. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/raid/raid10.h> + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. 
+ * The layout of data is defined by + *    chunk_size + *    raid_disks + *    near_copies (stored in low byte of layout) + *    far_copies (stored in second byte of layout) + * + * The data to be stored is divided into chunks using chunksize. + * Each device is divided into far_copies sections. + * In each section, chunks are laid out in a style similar to raid0, but + * near_copies copies of each chunk is stored (each on a different drive). + * The starting device for each section is offset near_copies from the starting + * device of the previous section. + * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * drive. + * near_copies and far_copies must be at least one, and their product is at most + * raid_disks. + */ + +/* + * Number of guaranteed r10bios in case of extreme VM load: + */ +#define	NR_RAID10_BIOS 256 + +static void unplug_slaves(mddev_t *mddev); + +static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ +	conf_t *conf = data; +	r10bio_t *r10_bio; +	int size = offsetof(struct r10bio_s, devs[conf->copies]); + +	/* allocate a r10bio with room for raid_disks entries in the bios array */ +	r10_bio = kmalloc(size, gfp_flags); +	if (r10_bio) +		memset(r10_bio, 0, size); +	else +		unplug_slaves(conf->mddev); + +	return r10_bio; +} + +static void r10bio_pool_free(void *r10_bio, void *data) +{ +	kfree(r10_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ +	conf_t *conf = data; +	struct page *page; +	r10bio_t *r10_bio; +	struct bio *bio; +	int i, j; +	int nalloc; + +	r10_bio = r10bio_pool_alloc(gfp_flags, conf); +	if (!r10_bio) { +		unplug_slaves(conf->mddev); +		return NULL; +	} + +	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) +		nalloc = conf->copies; /* resync */ +	else +		nalloc = 2; /* recovery */ + +	/* +	 * Allocate bios. +	 */ +	for (j = nalloc ; j-- ; ) { +		bio = bio_alloc(gfp_flags, RESYNC_PAGES); +		if (!bio) +			goto out_free_bio; +		r10_bio->devs[j].bio = bio; +	} +	/* +	 * Allocate RESYNC_PAGES data pages and attach them +	 * where needed. 
+	 */ +	for (j = 0 ; j < nalloc; j++) { +		bio = r10_bio->devs[j].bio; +		for (i = 0; i < RESYNC_PAGES; i++) { +			page = alloc_page(gfp_flags); +			if (unlikely(!page)) +				goto out_free_pages; + +			bio->bi_io_vec[i].bv_page = page; +		} +	} + +	return r10_bio; + +out_free_pages: +	for ( ; i > 0 ; i--) +		__free_page(bio->bi_io_vec[i-1].bv_page); +	while (j--) +		for (i = 0; i < RESYNC_PAGES ; i++) +			__free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); +	j = -1; +out_free_bio: +	while ( ++j < nalloc ) +		bio_put(r10_bio->devs[j].bio); +	r10bio_pool_free(r10_bio, conf); +	return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ +	int i; +	conf_t *conf = data; +	r10bio_t *r10bio = __r10_bio; +	int j; + +	for (j=0; j < conf->copies; j++) { +		struct bio *bio = r10bio->devs[j].bio; +		if (bio) { +			for (i = 0; i < RESYNC_PAGES; i++) { +				__free_page(bio->bi_io_vec[i].bv_page); +				bio->bi_io_vec[i].bv_page = NULL; +			} +			bio_put(bio); +		} +	} +	r10bio_pool_free(r10bio, conf); +} + +static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) +{ +	int i; + +	for (i = 0; i < conf->copies; i++) { +		struct bio **bio = & r10_bio->devs[i].bio; +		if (*bio) +			bio_put(*bio); +		*bio = NULL; +	} +} + +static inline void free_r10bio(r10bio_t *r10_bio) +{ +	unsigned long flags; + +	conf_t *conf = mddev_to_conf(r10_bio->mddev); + +	/* +	 * Wake up any possible resync thread that waits for the device +	 * to go idle. +	 */ +	spin_lock_irqsave(&conf->resync_lock, flags); +	if (!--conf->nr_pending) { +		wake_up(&conf->wait_idle); +		wake_up(&conf->wait_resume); +	} +	spin_unlock_irqrestore(&conf->resync_lock, flags); + +	put_all_bios(conf, r10_bio); +	mempool_free(r10_bio, conf->r10bio_pool); +} + +static inline void put_buf(r10bio_t *r10_bio) +{ +	conf_t *conf = mddev_to_conf(r10_bio->mddev); +	unsigned long flags; + +	mempool_free(r10_bio, conf->r10buf_pool); + +	spin_lock_irqsave(&conf->resync_lock, flags); +	if (!conf->barrier) +		BUG(); +	--conf->barrier; +	wake_up(&conf->wait_resume); +	wake_up(&conf->wait_idle); + +	if (!--conf->nr_pending) { +		wake_up(&conf->wait_idle); +		wake_up(&conf->wait_resume); +	} +	spin_unlock_irqrestore(&conf->resync_lock, flags); +} + +static void reschedule_retry(r10bio_t *r10_bio) +{ +	unsigned long flags; +	mddev_t *mddev = r10_bio->mddev; +	conf_t *conf = mddev_to_conf(mddev); + +	spin_lock_irqsave(&conf->device_lock, flags); +	list_add(&r10_bio->retry_list, &conf->retry_list); +	spin_unlock_irqrestore(&conf->device_lock, flags); + +	md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(r10bio_t *r10_bio) +{ +	struct bio *bio = r10_bio->master_bio; + +	bio_endio(bio, bio->bi_size, +		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); +	free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. 
+ */ +static inline void update_head_pos(int slot, r10bio_t *r10_bio) +{ +	conf_t *conf = mddev_to_conf(r10_bio->mddev); + +	conf->mirrors[r10_bio->devs[slot].devnum].head_position = +		r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	int slot, dev; +	conf_t *conf = mddev_to_conf(r10_bio->mddev); + +	if (bio->bi_size) +		return 1; + +	slot = r10_bio->read_slot; +	dev = r10_bio->devs[slot].devnum; +	/* +	 * this branch is our 'one mirror IO has finished' event handler: +	 */ +	if (!uptodate) +		md_error(r10_bio->mddev, conf->mirrors[dev].rdev); +	else +		/* +		 * Set R10BIO_Uptodate in our master bio, so that +		 * we will return a good error code to the higher +		 * levels even if IO on some other mirrored buffer fails. +		 * +		 * The 'master' represents the composite IO operation to +		 * user-side. So if something waits for IO, then it will +		 * wait for the 'master' bio. +		 */ +		set_bit(R10BIO_Uptodate, &r10_bio->state); + +	update_head_pos(slot, r10_bio); + +	/* +	 * we have only one bio on the read side +	 */ +	if (uptodate) +		raid_end_bio_io(r10_bio); +	else { +		/* +		 * oops, read error: +		 */ +		char b[BDEVNAME_SIZE]; +		if (printk_ratelimit()) +			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", +			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); +		reschedule_retry(r10_bio); +	} + +	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); +	return 0; +} + +static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	int slot, dev; +	conf_t *conf = mddev_to_conf(r10_bio->mddev); + +	if (bio->bi_size) +		return 1; + +	for (slot = 0; slot < conf->copies; slot++) +		if (r10_bio->devs[slot].bio == bio) +			break; +	dev = r10_bio->devs[slot].devnum; + +	/* +	 * this branch is our 'one mirror IO has finished' event handler: +	 */ +	if (!uptodate) +		md_error(r10_bio->mddev, conf->mirrors[dev].rdev); +	else +		/* +		 * Set R10BIO_Uptodate in our master bio, so that +		 * we will return a good error code for to the higher +		 * levels even if IO on some other mirrored buffer fails. +		 * +		 * The 'master' represents the composite IO operation to +		 * user-side. So if something waits for IO, then it will +		 * wait for the 'master' bio. +		 */ +		set_bit(R10BIO_Uptodate, &r10_bio->state); + +	update_head_pos(slot, r10_bio); + +	/* +	 * +	 * Let's see if all mirrored write operations have finished +	 * already. +	 */ +	if (atomic_dec_and_test(&r10_bio->remaining)) { +		md_write_end(r10_bio->mddev); +		raid_end_bio_io(r10_bio); +	} + +	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); +	return 0; +} + + +/* + * RAID10 layout manager + * Aswell as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. + * + * Chunks are layed out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. 
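 * For illustration, assuming raid_disks=4, near_copies=2 and
+ * far_copies=1, chunks A0, A1, ... are laid out as
+ *	dev0  dev1  dev2  dev3
+ *	 A0    A0    A1    A1
+ *	 A2    A2    A3    A3
+ * so each chunk appears on two adjacent drives, raid0-style.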
+ * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. + * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. If a block isn't on a device, + * that entry in the array is set to MaxSector. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) +{ +	int n,f; +	sector_t sector; +	sector_t chunk; +	sector_t stripe; +	int dev; + +	int slot = 0; + +	/* now calculate first sector/dev */ +	chunk = r10bio->sector >> conf->chunk_shift; +	sector = r10bio->sector & conf->chunk_mask; + +	chunk *= conf->near_copies; +	stripe = chunk; +	dev = sector_div(stripe, conf->raid_disks); + +	sector += stripe << conf->chunk_shift; + +	/* and calculate all the others */ +	for (n=0; n < conf->near_copies; n++) { +		int d = dev; +		sector_t s = sector; +		r10bio->devs[slot].addr = sector; +		r10bio->devs[slot].devnum = d; +		slot++; + +		for (f = 1; f < conf->far_copies; f++) { +			d += conf->near_copies; +			if (d >= conf->raid_disks) +				d -= conf->raid_disks; +			s += conf->stride; +			r10bio->devs[slot].devnum = d; +			r10bio->devs[slot].addr = s; +			slot++; +		} +		dev++; +		if (dev >= conf->raid_disks) { +			dev = 0; +			sector += (conf->chunk_mask + 1); +		} +	} +	BUG_ON(slot != conf->copies); +} + +static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) +{ +	sector_t offset, chunk, vchunk; + +	while (sector > conf->stride) { +		sector -= conf->stride; +		if (dev < conf->near_copies) +			dev += conf->raid_disks - conf->near_copies; +		else +			dev -= conf->near_copies; +	} + +	offset = sector & conf->chunk_mask; +	chunk = sector >> conf->chunk_shift; +	vchunk = chunk * conf->raid_disks + dev; +	sector_div(vchunk, conf->near_copies); +	return (vchunk << conf->chunk_shift) + offset; +} + +/** + *	raid10_mergeable_bvec -- tell bio layer if a two requests can be merged + *	@q: request queue + *	@bio: the buffer head that's been built up so far + *	@biovec: the request that could be merged to it. + * + *	Return amount of bytes we can accept at this offset + *      If near_copies == raid_disk, there are no striping issues, + *      but in that case, the function isn't called at all. + */ +static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, +				struct bio_vec *bio_vec) +{ +	mddev_t *mddev = q->queuedata; +	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); +	int max; +	unsigned int chunk_sectors = mddev->chunk_size >> 9; +	unsigned int bio_sectors = bio->bi_size >> 9; + +	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; +	if (max < 0) max = 0; /* bio_add cannot handle a negative return */ +	if (max <= bio_vec->bv_len && bio_sectors == 0) +		return bio_vec->bv_len; +	else +		return max; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. 
+ * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static int read_balance(conf_t *conf, r10bio_t *r10_bio) +{ +	const unsigned long this_sector = r10_bio->sector; +	int disk, slot, nslot; +	const int sectors = r10_bio->sectors; +	sector_t new_distance, current_distance; + +	raid10_find_phys(conf, r10_bio); +	rcu_read_lock(); +	/* +	 * Check if we can balance. We can balance on the whole +	 * device if no resync is going on, or below the resync window. +	 * We take the first readable disk when above the resync window. +	 */ +	if (conf->mddev->recovery_cp < MaxSector +	    && (this_sector + sectors >= conf->next_resync)) { +		/* make sure that disk is operational */ +		slot = 0; +		disk = r10_bio->devs[slot].devnum; + +		while (!conf->mirrors[disk].rdev || +		       !conf->mirrors[disk].rdev->in_sync) { +			slot++; +			if (slot == conf->copies) { +				slot = 0; +				disk = -1; +				break; +			} +			disk = r10_bio->devs[slot].devnum; +		} +		goto rb_out; +	} + + +	/* make sure the disk is operational */ +	slot = 0; +	disk = r10_bio->devs[slot].devnum; +	while (!conf->mirrors[disk].rdev || +	       !conf->mirrors[disk].rdev->in_sync) { +		slot ++; +		if (slot == conf->copies) { +			disk = -1; +			goto rb_out; +		} +		disk = r10_bio->devs[slot].devnum; +	} + + +	current_distance = abs(this_sector - conf->mirrors[disk].head_position); + +	/* Find the disk whose head is closest */ + +	for (nslot = slot; nslot < conf->copies; nslot++) { +		int ndisk = r10_bio->devs[nslot].devnum; + + +		if (!conf->mirrors[ndisk].rdev || +		    !conf->mirrors[ndisk].rdev->in_sync) +			continue; + +		if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { +			disk = ndisk; +			slot = nslot; +			break; +		} +		new_distance = abs(r10_bio->devs[nslot].addr - +				   conf->mirrors[ndisk].head_position); +		if (new_distance < current_distance) { +			current_distance = new_distance; +			disk = ndisk; +			slot = nslot; +		} +	} + +rb_out: +	r10_bio->read_slot = slot; +/*	conf->next_seq_sect = this_sector + sectors;*/ + +	if (disk >= 0 && conf->mirrors[disk].rdev) +		atomic_inc(&conf->mirrors[disk].rdev->nr_pending); +	rcu_read_unlock(); + +	return disk; +} + +static void unplug_slaves(mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks; i++) { +		mdk_rdev_t *rdev = conf->mirrors[i].rdev; +		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { +			request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); + +			if (r_queue->unplug_fn) +				r_queue->unplug_fn(r_queue); + +			rdev_dec_pending(rdev, mddev); +			rcu_read_lock(); +		} +	} +	rcu_read_unlock(); +} + +static void raid10_unplug(request_queue_t *q) +{ +	unplug_slaves(q->queuedata); +} + +static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, +			     sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	conf_t *conf = mddev_to_conf(mddev); +	int 
i, ret = 0; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		mdk_rdev_t *rdev = conf->mirrors[i].rdev; +		if (rdev && !rdev->faulty) { +			struct block_device *bdev = rdev->bdev; +			request_queue_t *r_queue = bdev_get_queue(bdev); + +			if (!r_queue->issue_flush_fn) +				ret = -EOPNOTSUPP; +			else { +				atomic_inc(&rdev->nr_pending); +				rcu_read_unlock(); +				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, +							      error_sector); +				rdev_dec_pending(rdev, mddev); +				rcu_read_lock(); +			} +		} +	} +	rcu_read_unlock(); +	return ret; +} + +/* + * Throttle resync depth, so that we can both get proper overlapping of + * requests, but are still able to handle normal requests quickly. + */ +#define RESYNC_DEPTH 32 + +static void device_barrier(conf_t *conf, sector_t sect) +{ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), +			    conf->resync_lock, unplug_slaves(conf->mddev)); + +	if (!conf->barrier++) { +		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, +				    conf->resync_lock, unplug_slaves(conf->mddev)); +		if (conf->nr_pending) +			BUG(); +	} +	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, +			    conf->resync_lock, unplug_slaves(conf->mddev)); +	conf->next_resync = sect; +	spin_unlock_irq(&conf->resync_lock); +} + +static int make_request(request_queue_t *q, struct bio * bio) +{ +	mddev_t *mddev = q->queuedata; +	conf_t *conf = mddev_to_conf(mddev); +	mirror_info_t *mirror; +	r10bio_t *r10_bio; +	struct bio *read_bio; +	int i; +	int chunk_sects = conf->chunk_mask + 1; + +	/* If this request crosses a chunk boundary, we need to +	 * split it.  This will only happen for 1 PAGE (or less) requests. +	 */ +	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) +		      > chunk_sects && +		    conf->near_copies < conf->raid_disks)) { +		struct bio_pair *bp; +		/* Sanity check -- queue functions should prevent this happening */ +		if (bio->bi_vcnt != 1 || +		    bio->bi_idx != 0) +			goto bad_map; +		/* This is a one page bio that upper layers +		 * refuse to split for us, so we need to split it. +		 */ +		bp = bio_split(bio, bio_split_pool, +			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); +		if (make_request(q, &bp->bio1)) +			generic_make_request(&bp->bio1); +		if (make_request(q, &bp->bio2)) +			generic_make_request(&bp->bio2); + +		bio_pair_release(bp); +		return 0; +	bad_map: +		printk("raid10_make_request bug: can't convert block across chunks" +		       " or bigger than %dk %llu %d\n", chunk_sects/2, +		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + +		bio_io_error(bio, bio->bi_size); +		return 0; +	} + +	/* +	 * Register the new request and wait if the reconstruction +	 * thread has put up a bar for new requests. +	 * Continue immediately if no resync is active currently. 
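+	 * (nr_pending counts normal requests in flight; device_barrier()
+	 * waits for it to drain before resync may proceed.)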
+	 */ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); +	conf->nr_pending++; +	spin_unlock_irq(&conf->resync_lock); + +	if (bio_data_dir(bio)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); +	} + +	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + +	r10_bio->master_bio = bio; +	r10_bio->sectors = bio->bi_size >> 9; + +	r10_bio->mddev = mddev; +	r10_bio->sector = bio->bi_sector; + +	if (bio_data_dir(bio) == READ) { +		/* +		 * read balancing logic: +		 */ +		int disk = read_balance(conf, r10_bio); +		int slot = r10_bio->read_slot; +		if (disk < 0) { +			raid_end_bio_io(r10_bio); +			return 0; +		} +		mirror = conf->mirrors + disk; + +		read_bio = bio_clone(bio, GFP_NOIO); + +		r10_bio->devs[slot].bio = read_bio; + +		read_bio->bi_sector = r10_bio->devs[slot].addr + +			mirror->rdev->data_offset; +		read_bio->bi_bdev = mirror->rdev->bdev; +		read_bio->bi_end_io = raid10_end_read_request; +		read_bio->bi_rw = READ; +		read_bio->bi_private = r10_bio; + +		generic_make_request(read_bio); +		return 0; +	} + +	/* +	 * WRITE: +	 */ +	/* first select target devices under spinlock and +	 * inc refcount on their rdev.  Record them by setting +	 * bios[x] to bio +	 */ +	raid10_find_phys(conf, r10_bio); +	rcu_read_lock(); +	for (i = 0;  i < conf->copies; i++) { +		int d = r10_bio->devs[i].devnum; +		if (conf->mirrors[d].rdev && +		    !conf->mirrors[d].rdev->faulty) { +			atomic_inc(&conf->mirrors[d].rdev->nr_pending); +			r10_bio->devs[i].bio = bio; +		} else +			r10_bio->devs[i].bio = NULL; +	} +	rcu_read_unlock(); + +	atomic_set(&r10_bio->remaining, 1); +	md_write_start(mddev); +	for (i = 0; i < conf->copies; i++) { +		struct bio *mbio; +		int d = r10_bio->devs[i].devnum; +		if (!r10_bio->devs[i].bio) +			continue; + +		mbio = bio_clone(bio, GFP_NOIO); +		r10_bio->devs[i].bio = mbio; + +		mbio->bi_sector	= r10_bio->devs[i].addr+ +			conf->mirrors[d].rdev->data_offset; +		mbio->bi_bdev = conf->mirrors[d].rdev->bdev; +		mbio->bi_end_io	= raid10_end_write_request; +		mbio->bi_rw = WRITE; +		mbio->bi_private = r10_bio; + +		atomic_inc(&r10_bio->remaining); +		generic_make_request(mbio); +	} + +	if (atomic_dec_and_test(&r10_bio->remaining)) { +		md_write_end(mddev); +		raid_end_bio_io(r10_bio); +	} + +	return 0; +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i; + +	if (conf->near_copies < conf->raid_disks) +		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); +	if (conf->near_copies > 1) +		seq_printf(seq, " %d near-copies", conf->near_copies); +	if (conf->far_copies > 1) +		seq_printf(seq, " %d far-copies", conf->far_copies); + +	seq_printf(seq, " [%d/%d] [", conf->raid_disks, +						conf->working_disks); +	for (i = 0; i < conf->raid_disks; i++) +		seq_printf(seq, "%s", +			      conf->mirrors[i].rdev && +			      conf->mirrors[i].rdev->in_sync ? "U" : "_"); +	seq_printf(seq, "]"); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	char b[BDEVNAME_SIZE]; +	conf_t *conf = mddev_to_conf(mddev); + +	/* +	 * If it is not operational, then we have already marked it as dead +	 * else if it is the last working disks, ignore the error, let the +	 * next level up know. 
+	 * else mark the drive as failed +	 */ +	if (rdev->in_sync +	    && conf->working_disks == 1) +		/* +		 * Don't fail the drive, just return an IO error. +		 * The test should really be more sophisticated than +		 * "working_disks == 1", but it isn't critical, and +		 * can wait until we do more sophisticated "is the drive +		 * really dead" tests... +		 */ +		return; +	if (rdev->in_sync) { +		mddev->degraded++; +		conf->working_disks--; +		/* +		 * if recovery is running, make sure it aborts. +		 */ +		set_bit(MD_RECOVERY_ERR, &mddev->recovery); +	} +	rdev->in_sync = 0; +	rdev->faulty = 1; +	mddev->sb_dirty = 1; +	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" +		"	Operation continuing on %d devices\n", +		bdevname(rdev->bdev,b), conf->working_disks); +} + +static void print_conf(conf_t *conf) +{ +	int i; +	mirror_info_t *tmp; + +	printk("RAID10 conf printout:\n"); +	if (!conf) { +		printk("(!conf)\n"); +		return; +	} +	printk(" --- wd:%d rd:%d\n", conf->working_disks, +		conf->raid_disks); + +	for (i = 0; i < conf->raid_disks; i++) { +		char b[BDEVNAME_SIZE]; +		tmp = conf->mirrors + i; +		if (tmp->rdev) +			printk(" disk %d, wo:%d, o:%d, dev:%s\n", +				i, !tmp->rdev->in_sync, !tmp->rdev->faulty, +				bdevname(tmp->rdev->bdev,b)); +	} +} + +static void close_sync(conf_t *conf) +{ +	spin_lock_irq(&conf->resync_lock); +	wait_event_lock_irq(conf->wait_resume, !conf->barrier, +			    conf->resync_lock, 	unplug_slaves(conf->mddev)); +	spin_unlock_irq(&conf->resync_lock); + +	if (conf->barrier) BUG(); +	if (waitqueue_active(&conf->wait_idle)) BUG(); + +	mempool_destroy(conf->r10buf_pool); +	conf->r10buf_pool = NULL; +} + +static int raid10_spare_active(mddev_t *mddev) +{ +	int i; +	conf_t *conf = mddev->private; +	mirror_info_t *tmp; + +	/* +	 * Find all non-in_sync disks within the RAID10 configuration +	 * and mark them in_sync +	 */ +	for (i = 0; i < conf->raid_disks; i++) { +		tmp = conf->mirrors + i; +		if (tmp->rdev +		    && !tmp->rdev->faulty +		    && !tmp->rdev->in_sync) { +			conf->working_disks++; +			mddev->degraded--; +			tmp->rdev->in_sync = 1; +		} +	} + +	print_conf(conf); +	return 0; +} + + +static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	conf_t *conf = mddev->private; +	int found = 0; +	int mirror; +	mirror_info_t *p; + +	if (mddev->recovery_cp < MaxSector) +		/* only hot-add to in-sync arrays, as recovery is +		 * very different from resync +		 */ +		return 0; + +	for (mirror=0; mirror < mddev->raid_disks; mirror++) +		if ( !(p=conf->mirrors+mirror)->rdev) { + +			blk_queue_stack_limits(mddev->queue, +					       rdev->bdev->bd_disk->queue); +			/* as we don't honour merge_bvec_fn, we must never risk +			 * violating it, so limit ->max_sector to one PAGE, as +			 * a one page request is never in violation. 
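error() and raid10_spare_active() earlier in this hunk keep working_disks and mddev->degraded in step with the per-mirror in_sync/faulty flags. The following is a small user-space model of just that bookkeeping, under simplifying assumptions (single array, no locking, no superblock update); the state values are made up.

#include <stdio.h>

struct mirror { int in_sync; int faulty; };

static struct mirror mirrors[4] = { {1,0}, {1,0}, {1,0}, {1,0} };
static int raid_disks = 4, working_disks = 4, degraded = 0;

/* mark one mirror failed, counting the in_sync -> failed transition once */
static void fail_disk(int i)
{
    if (mirrors[i].in_sync) {
        mirrors[i].in_sync = 0;
        working_disks--;
        degraded++;
    }
    mirrors[i].faulty = 1;
}

/* after a rebuild, pull recovered spares back in, as raid10_spare_active() does */
static void spare_active(void)
{
    for (int i = 0; i < raid_disks; i++)
        if (!mirrors[i].faulty && !mirrors[i].in_sync) {
            mirrors[i].in_sync = 1;
            working_disks++;
            degraded--;
        }
}

int main(void)
{
    fail_disk(2);
    printf("after failure:  wd=%d degraded=%d\n", working_disks, degraded);
    mirrors[2].faulty = 0;                /* pretend recovery finished on a replacement */
    spare_active();
    printf("after recovery: wd=%d degraded=%d\n", working_disks, degraded);
    return 0;
}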
+			 */ +			if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +			    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +				mddev->queue->max_sectors = (PAGE_SIZE>>9); + +			p->head_position = 0; +			rdev->raid_disk = mirror; +			found = 1; +			p->rdev = rdev; +			break; +		} + +	print_conf(conf); +	return found; +} + +static int raid10_remove_disk(mddev_t *mddev, int number) +{ +	conf_t *conf = mddev->private; +	int err = 0; +	mdk_rdev_t *rdev; +	mirror_info_t *p = conf->mirrors+ number; + +	print_conf(conf); +	rdev = p->rdev; +	if (rdev) { +		if (rdev->in_sync || +		    atomic_read(&rdev->nr_pending)) { +			err = -EBUSY; +			goto abort; +		} +		p->rdev = NULL; +		synchronize_kernel(); +		if (atomic_read(&rdev->nr_pending)) { +			/* lost the race, try later */ +			err = -EBUSY; +			p->rdev = rdev; +		} +	} +abort: + +	print_conf(conf); +	return err; +} + + +static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	conf_t *conf = mddev_to_conf(r10_bio->mddev); +	int i,d; + +	if (bio->bi_size) +		return 1; + +	for (i=0; i<conf->copies; i++) +		if (r10_bio->devs[i].bio == bio) +			break; +	if (i == conf->copies) +		BUG(); +	update_head_pos(i, r10_bio); +	d = r10_bio->devs[i].devnum; +	if (!uptodate) +		md_error(r10_bio->mddev, +			 conf->mirrors[d].rdev); + +	/* for reconstruct, we always reschedule after a read. +	 * for resync, only after all reads +	 */ +	if (test_bit(R10BIO_IsRecover, &r10_bio->state) || +	    atomic_dec_and_test(&r10_bio->remaining)) { +		/* we have read all the blocks, +		 * do the comparison in process context in raid10d +		 */ +		reschedule_retry(r10_bio); +	} +	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); +	return 0; +} + +static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +{ +	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); +	mddev_t *mddev = r10_bio->mddev; +	conf_t *conf = mddev_to_conf(mddev); +	int i,d; + +	if (bio->bi_size) +		return 1; + +	for (i = 0; i < conf->copies; i++) +		if (r10_bio->devs[i].bio == bio) +			break; +	d = r10_bio->devs[i].devnum; + +	if (!uptodate) +		md_error(mddev, conf->mirrors[d].rdev); +	update_head_pos(i, r10_bio); + +	while (atomic_dec_and_test(&r10_bio->remaining)) { +		if (r10_bio->master_bio == NULL) { +			/* the primary of several recovery bios */ +			md_done_sync(mddev, r10_bio->sectors, 1); +			put_buf(r10_bio); +			break; +		} else { +			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; +			put_buf(r10_bio); +			r10_bio = r10_bio2; +		} +	} +	rdev_dec_pending(conf->mirrors[d].rdev, mddev); +	return 0; +} + +/* + * Note: sync and recover and handled very differently for raid10 + * This code is for resync. + * For resync, we read through virtual addresses and read all blocks. + * If there is any error, we schedule a write.  The lowest numbered + * drive is authoritative. + * However requests come for physical address, so we need to map. + * For every physical address there are raid_disks/copies virtual addresses, + * which is always are least one, but is not necessarly an integer. + * This means that a physical address can span multiple chunks, so we may + * have to submit multiple io requests for a single sync request. 
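The resync scheme described just above, and implemented in sync_request_write() below, compares every copy against the first up-to-date one page by page and rewrites only the copies that differ. This is a user-space sketch of that compare-and-repair decision over in-memory buffers; the buffer contents are made up, and memcpy stands in for scheduling a write bio.

#include <stdio.h>
#include <string.h>

#define COPIES  3
#define PAGESZ  4096

static unsigned char copies[COPIES][PAGESZ];

int main(void)
{
    memset(copies[0], 0xaa, PAGESZ);      /* copy 0 is the authoritative one */
    memset(copies[1], 0xaa, PAGESZ);      /* identical: nothing to do */
    memset(copies[2], 0x55, PAGESZ);      /* diverged: must be rewritten */

    for (int i = 1; i < COPIES; i++) {
        if (memcmp(copies[0], copies[i], PAGESZ) == 0)
            continue;                     /* in sync, skip the write */
        /* the driver would schedule a write here; we just fix the buffer */
        memcpy(copies[i], copies[0], PAGESZ);
        printf("copy %d differed and was rewritten from copy 0\n", i);
    }
    return 0;
}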
+ */ +/* + * We check if all blocks are in-sync and only write to blocks that + * aren't in sync + */ +static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i, first; +	struct bio *tbio, *fbio; + +	atomic_set(&r10_bio->remaining, 1); + +	/* find the first device with a block */ +	for (i=0; i<conf->copies; i++) +		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) +			break; + +	if (i == conf->copies) +		goto done; + +	first = i; +	fbio = r10_bio->devs[i].bio; + +	/* now find blocks with errors */ +	for (i=first+1 ; i < conf->copies ; i++) { +		int vcnt, j, d; + +		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) +			continue; +		/* We know that the bi_io_vec layout is the same for +		 * both 'first' and 'i', so we just compare them. +		 * All vec entries are PAGE_SIZE; +		 */ +		tbio = r10_bio->devs[i].bio; +		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); +		for (j = 0; j < vcnt; j++) +			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), +				   page_address(tbio->bi_io_vec[j].bv_page), +				   PAGE_SIZE)) +				break; +		if (j == vcnt) +			continue; +		/* Ok, we need to write this bio +		 * First we need to fixup bv_offset, bv_len and +		 * bi_vecs, as the read request might have corrupted these +		 */ +		tbio->bi_vcnt = vcnt; +		tbio->bi_size = r10_bio->sectors << 9; +		tbio->bi_idx = 0; +		tbio->bi_phys_segments = 0; +		tbio->bi_hw_segments = 0; +		tbio->bi_hw_front_size = 0; +		tbio->bi_hw_back_size = 0; +		tbio->bi_flags &= ~(BIO_POOL_MASK - 1); +		tbio->bi_flags |= 1 << BIO_UPTODATE; +		tbio->bi_next = NULL; +		tbio->bi_rw = WRITE; +		tbio->bi_private = r10_bio; +		tbio->bi_sector = r10_bio->devs[i].addr; + +		for (j=0; j < vcnt ; j++) { +			tbio->bi_io_vec[j].bv_offset = 0; +			tbio->bi_io_vec[j].bv_len = PAGE_SIZE; + +			memcpy(page_address(tbio->bi_io_vec[j].bv_page), +			       page_address(fbio->bi_io_vec[j].bv_page), +			       PAGE_SIZE); +		} +		tbio->bi_end_io = end_sync_write; + +		d = r10_bio->devs[i].devnum; +		atomic_inc(&conf->mirrors[d].rdev->nr_pending); +		atomic_inc(&r10_bio->remaining); +		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + +		tbio->bi_sector += conf->mirrors[d].rdev->data_offset; +		tbio->bi_bdev = conf->mirrors[d].rdev->bdev; +		generic_make_request(tbio); +	} + +done: +	if (atomic_dec_and_test(&r10_bio->remaining)) { +		md_done_sync(mddev, r10_bio->sectors, 1); +		put_buf(r10_bio); +	} +} + +/* + * Now for the recovery code. + * Recovery happens across physical sectors. + * We recover all non-is_sync drives by finding the virtual address of + * each, and then choose a working drive that also has that virt address. + * There is a separate r10_bio for each non-in_sync drive. + * Only the first two slots are in use. The first for reading, + * The second for writing. 
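recovery_request_write(), just below, hands the freshly read pages to the write bio by swapping bv_page pointers rather than copying data. A tiny user-space illustration of that pointer swap follows; the struct is a simplified stand-in, not the kernel's bio_vec.

#include <stdio.h>

struct vec { char *page; };               /* stand-in for struct bio_vec */

int main(void)
{
    char data[]  = "freshly read block";
    char spare[] = "preallocated write page";
    struct vec rvec = { data };            /* read bio's page: now holds good data */
    struct vec wvec = { spare };           /* write bio's page: about to be submitted */

    /* move the pages across, the same cheap swap the driver does */
    char *tmp = rvec.page;
    rvec.page = wvec.page;
    wvec.page = tmp;

    printf("write bio now points at: \"%s\"\n", wvec.page);
    return 0;
}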
+ * + */ + +static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ +	conf_t *conf = mddev_to_conf(mddev); +	int i, d; +	struct bio *bio, *wbio; + + +	/* move the pages across to the second bio +	 * and submit the write request +	 */ +	bio = r10_bio->devs[0].bio; +	wbio = r10_bio->devs[1].bio; +	for (i=0; i < wbio->bi_vcnt; i++) { +		struct page *p = bio->bi_io_vec[i].bv_page; +		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; +		wbio->bi_io_vec[i].bv_page = p; +	} +	d = r10_bio->devs[1].devnum; + +	atomic_inc(&conf->mirrors[d].rdev->nr_pending); +	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); +	generic_make_request(wbio); +} + + +/* + * This is a kernel thread which: + * + *	1.	Retries failed read operations on working mirrors. + *	2.	Updates the raid superblock when problems encounter. + *	3.	Performs writes following reads for array syncronising. + */ + +static void raid10d(mddev_t *mddev) +{ +	r10bio_t *r10_bio; +	struct bio *bio; +	unsigned long flags; +	conf_t *conf = mddev_to_conf(mddev); +	struct list_head *head = &conf->retry_list; +	int unplug=0; +	mdk_rdev_t *rdev; + +	md_check_recovery(mddev); +	md_handle_safemode(mddev); + +	for (;;) { +		char b[BDEVNAME_SIZE]; +		spin_lock_irqsave(&conf->device_lock, flags); +		if (list_empty(head)) +			break; +		r10_bio = list_entry(head->prev, r10bio_t, retry_list); +		list_del(head->prev); +		spin_unlock_irqrestore(&conf->device_lock, flags); + +		mddev = r10_bio->mddev; +		conf = mddev_to_conf(mddev); +		if (test_bit(R10BIO_IsSync, &r10_bio->state)) { +			sync_request_write(mddev, r10_bio); +			unplug = 1; +		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { +			recovery_request_write(mddev, r10_bio); +			unplug = 1; +		} else { +			int mirror; +			bio = r10_bio->devs[r10_bio->read_slot].bio; +			r10_bio->devs[r10_bio->read_slot].bio = NULL; +			bio_put(bio); +			mirror = read_balance(conf, r10_bio); +			if (mirror == -1) { +				printk(KERN_ALERT "raid10: %s: unrecoverable I/O" +				       " read error for block %llu\n", +				       bdevname(bio->bi_bdev,b), +				       (unsigned long long)r10_bio->sector); +				raid_end_bio_io(r10_bio); +			} else { +				rdev = conf->mirrors[mirror].rdev; +				if (printk_ratelimit()) +					printk(KERN_ERR "raid10: %s: redirecting sector %llu to" +					       " another mirror\n", +					       bdevname(rdev->bdev,b), +					       (unsigned long long)r10_bio->sector); +				bio = bio_clone(r10_bio->master_bio, GFP_NOIO); +				r10_bio->devs[r10_bio->read_slot].bio = bio; +				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr +					+ rdev->data_offset; +				bio->bi_bdev = rdev->bdev; +				bio->bi_rw = READ; +				bio->bi_private = r10_bio; +				bio->bi_end_io = raid10_end_read_request; +				unplug = 1; +				generic_make_request(bio); +			} +		} +	} +	spin_unlock_irqrestore(&conf->device_lock, flags); +	if (unplug) +		unplug_slaves(mddev); +} + + +static int init_resync(conf_t *conf) +{ +	int buffs; + +	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; +	if (conf->r10buf_pool) +		BUG(); +	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); +	if (!conf->r10buf_pool) +		return -ENOMEM; +	conf->next_resync = 0; +	return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. 
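The exclusion described here rests on two counters: normal I/O may only raise nr_pending while barrier is zero, and a resync pass may only raise barrier once nr_pending has drained. Below is a sequential user-space sketch of that invariant; instead of sleeping on the wait queues as device_barrier() and make_request() do, the helpers simply refuse the operation, and all names are illustrative.

#include <stdio.h>

static int nr_pending;   /* normal requests in flight */
static int barrier;      /* resync passes holding the device */

/* make_request(): only enter while no barrier is up */
static int start_io(void)
{
    if (barrier)
        return 0;        /* the driver would sleep on wait_resume here */
    nr_pending++;
    return 1;
}

static void end_io(void) { nr_pending--; }

/* device_barrier(): only raise the barrier once normal I/O has drained */
static int raise_barrier(void)
{
    if (nr_pending)
        return 0;        /* the driver would sleep on wait_idle here */
    barrier++;
    return 1;
}

int main(void)
{
    if (!start_io())
        return 1;
    printf("normal I/O admitted:       barrier=%d nr_pending=%d\n", barrier, nr_pending);
    printf("resync can raise barrier?  %s\n", raise_barrier() ? "yes" : "no, must wait");
    end_io();
    printf("after I/O drains, raised?  %s\n", raise_barrier() ? "yes" : "no");
    printf("new I/O admitted now?      %s\n", start_io() ? "yes" : "no, excluded");
    return 0;
}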
+ * + * Resync and recovery are handled very differently. + * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. + * + * For resync, we iterate over virtual addresses, read all copies, + * and update if there are differences.  If only one copy is live, + * skip it. + * For recovery, we iterate over physical addresses, read a good + * value for each non-in_sync drive, and over-write. + * + * So, for recovery we may have several outstanding complex requests for a + * given address, one for each out-of-sync device.  We model this by allocating + * a number of r10_bio structures, one for each out-of-sync device. + * As we setup these structures, we collect all bio's together into a list + * which we then process collectively to add pages, and then process again + * to pass to generic_make_request. + * + * The r10_bio structures are linked using a borrowed master_bio pointer. + * This link is counted in ->remaining.  When the r10_bio that points to NULL + * has its remaining count decremented to 0, the whole complex operation + * is complete. + * + */ + +static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +{ +	conf_t *conf = mddev_to_conf(mddev); +	r10bio_t *r10_bio; +	struct bio *biolist = NULL, *bio; +	sector_t max_sector, nr_sectors; +	int disk; +	int i; + +	sector_t sectors_skipped = 0; +	int chunks_skipped = 0; + +	if (!conf->r10buf_pool) +		if (init_resync(conf)) +			return -ENOMEM; + + skipped: +	max_sector = mddev->size << 1; +	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) +		max_sector = mddev->resync_max_sectors; +	if (sector_nr >= max_sector) { +		close_sync(conf); +		return sectors_skipped; +	} +	if (chunks_skipped >= conf->raid_disks) { +		/* if there has been nothing to do on any drive, +		 * then there is nothing to do at all.. +		 */ +		sector_t sec = max_sector - sector_nr; +		md_done_sync(mddev, sec, 1); +		return sec + sectors_skipped; +	} + +	/* make sure whole request will fit in a chunk - if chunks +	 * are meaningful +	 */ +	if (conf->near_copies < conf->raid_disks && +	    max_sector > (sector_nr | conf->chunk_mask)) +		max_sector = (sector_nr | conf->chunk_mask) + 1; +	/* +	 * If there is non-resync activity waiting for us then +	 * put in a delay to throttle resync. +	 */ +	if (!go_faster && waitqueue_active(&conf->wait_resume)) +		msleep_interruptible(1000); +	device_barrier(conf, sector_nr + RESYNC_SECTORS); + +	/* Again, very different code for resync and recovery. +	 * Both must result in an r10bio with a list of bios that +	 * have bi_end_io, bi_sector, bi_bdev set, +	 * and bi_private set to the r10bio. +	 * For recovery, we may actually create several r10bios +	 * with 2 bios in each, that correspond to the bios in the main one. +	 * In this case, the subordinate r10bios link back through a +	 * borrowed master_bio pointer, and the counter in the master +	 * includes a ref from each subordinate. +	 */ +	/* First, we decide what to do and set ->bi_end_io +	 * To end_sync_read if we want to read, and +	 * end_sync_write if we will want to write. +	 */ + +	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { +		/* recovery... 
the complicated one */ +		int i, j, k; +		r10_bio = NULL; + +		for (i=0 ; i<conf->raid_disks; i++) +			if (conf->mirrors[i].rdev && +			    !conf->mirrors[i].rdev->in_sync) { +				/* want to reconstruct this device */ +				r10bio_t *rb2 = r10_bio; + +				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); +				spin_lock_irq(&conf->resync_lock); +				conf->nr_pending++; +				if (rb2) conf->barrier++; +				spin_unlock_irq(&conf->resync_lock); +				atomic_set(&r10_bio->remaining, 0); + +				r10_bio->master_bio = (struct bio*)rb2; +				if (rb2) +					atomic_inc(&rb2->remaining); +				r10_bio->mddev = mddev; +				set_bit(R10BIO_IsRecover, &r10_bio->state); +				r10_bio->sector = raid10_find_virt(conf, sector_nr, i); +				raid10_find_phys(conf, r10_bio); +				for (j=0; j<conf->copies;j++) { +					int d = r10_bio->devs[j].devnum; +					if (conf->mirrors[d].rdev && +					    conf->mirrors[d].rdev->in_sync) { +						/* This is where we read from */ +						bio = r10_bio->devs[0].bio; +						bio->bi_next = biolist; +						biolist = bio; +						bio->bi_private = r10_bio; +						bio->bi_end_io = end_sync_read; +						bio->bi_rw = 0; +						bio->bi_sector = r10_bio->devs[j].addr + +							conf->mirrors[d].rdev->data_offset; +						bio->bi_bdev = conf->mirrors[d].rdev->bdev; +						atomic_inc(&conf->mirrors[d].rdev->nr_pending); +						atomic_inc(&r10_bio->remaining); +						/* and we write to 'i' */ + +						for (k=0; k<conf->copies; k++) +							if (r10_bio->devs[k].devnum == i) +								break; +						bio = r10_bio->devs[1].bio; +						bio->bi_next = biolist; +						biolist = bio; +						bio->bi_private = r10_bio; +						bio->bi_end_io = end_sync_write; +						bio->bi_rw = 1; +						bio->bi_sector = r10_bio->devs[k].addr + +							conf->mirrors[i].rdev->data_offset; +						bio->bi_bdev = conf->mirrors[i].rdev->bdev; + +						r10_bio->devs[0].devnum = d; +						r10_bio->devs[1].devnum = i; + +						break; +					} +				} +				if (j == conf->copies) { +					BUG(); +				} +			} +		if (biolist == NULL) { +			while (r10_bio) { +				r10bio_t *rb2 = r10_bio; +				r10_bio = (r10bio_t*) rb2->master_bio; +				rb2->master_bio = NULL; +				put_buf(rb2); +			} +			goto giveup; +		} +	} else { +		/* resync. 
Schedule a read for every block at this virt offset */ +		int count = 0; +		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + +		spin_lock_irq(&conf->resync_lock); +		conf->nr_pending++; +		spin_unlock_irq(&conf->resync_lock); + +		r10_bio->mddev = mddev; +		atomic_set(&r10_bio->remaining, 0); + +		r10_bio->master_bio = NULL; +		r10_bio->sector = sector_nr; +		set_bit(R10BIO_IsSync, &r10_bio->state); +		raid10_find_phys(conf, r10_bio); +		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + +		for (i=0; i<conf->copies; i++) { +			int d = r10_bio->devs[i].devnum; +			bio = r10_bio->devs[i].bio; +			bio->bi_end_io = NULL; +			if (conf->mirrors[d].rdev == NULL || +			    conf->mirrors[d].rdev->faulty) +				continue; +			atomic_inc(&conf->mirrors[d].rdev->nr_pending); +			atomic_inc(&r10_bio->remaining); +			bio->bi_next = biolist; +			biolist = bio; +			bio->bi_private = r10_bio; +			bio->bi_end_io = end_sync_read; +			bio->bi_rw = 0; +			bio->bi_sector = r10_bio->devs[i].addr + +				conf->mirrors[d].rdev->data_offset; +			bio->bi_bdev = conf->mirrors[d].rdev->bdev; +			count++; +		} + +		if (count < 2) { +			for (i=0; i<conf->copies; i++) { +				int d = r10_bio->devs[i].devnum; +				if (r10_bio->devs[i].bio->bi_end_io) +					rdev_dec_pending(conf->mirrors[d].rdev, mddev); +			} +			put_buf(r10_bio); +			biolist = NULL; +			goto giveup; +		} +	} + +	for (bio = biolist; bio ; bio=bio->bi_next) { + +		bio->bi_flags &= ~(BIO_POOL_MASK - 1); +		if (bio->bi_end_io) +			bio->bi_flags |= 1 << BIO_UPTODATE; +		bio->bi_vcnt = 0; +		bio->bi_idx = 0; +		bio->bi_phys_segments = 0; +		bio->bi_hw_segments = 0; +		bio->bi_size = 0; +	} + +	nr_sectors = 0; +	do { +		struct page *page; +		int len = PAGE_SIZE; +		disk = 0; +		if (sector_nr + (len>>9) > max_sector) +			len = (max_sector - sector_nr) << 9; +		if (len == 0) +			break; +		for (bio= biolist ; bio ; bio=bio->bi_next) { +			page = bio->bi_io_vec[bio->bi_vcnt].bv_page; +			if (bio_add_page(bio, page, len, 0) == 0) { +				/* stop here */ +				struct bio *bio2; +				bio->bi_io_vec[bio->bi_vcnt].bv_page = page; +				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { +					/* remove last page from this bio */ +					bio2->bi_vcnt--; +					bio2->bi_size -= len; +					bio2->bi_flags &= ~(1<< BIO_SEG_VALID); +				} +				goto bio_full; +			} +			disk = i; +		} +		nr_sectors += len>>9; +		sector_nr += len>>9; +	} while (biolist->bi_vcnt < RESYNC_PAGES); + bio_full: +	r10_bio->sectors = nr_sectors; + +	while (biolist) { +		bio = biolist; +		biolist = biolist->bi_next; + +		bio->bi_next = NULL; +		r10_bio = bio->bi_private; +		r10_bio->sectors = nr_sectors; + +		if (bio->bi_end_io == end_sync_read) { +			md_sync_acct(bio->bi_bdev, nr_sectors); +			generic_make_request(bio); +		} +	} + +	return sectors_skipped + nr_sectors; + giveup: +	/* There is nowhere to write, so all non-sync +	 * drives must be failed, so try the next chunk... +	 */ +	{ +	int sec = max_sector - sector_nr; +	sectors_skipped += sec; +	chunks_skipped ++; +	sector_nr = max_sector; +	md_done_sync(mddev, sec, 1); +	goto skipped; +	} +} + +static int run(mddev_t *mddev) +{ +	conf_t *conf; +	int i, disk_idx; +	mirror_info_t *disk; +	mdk_rdev_t *rdev; +	struct list_head *tmp; +	int nc, fc; +	sector_t stride, size; + +	if (mddev->level != 10) { +		printk(KERN_ERR "raid10: %s: raid level not set correctly... 
(%d)\n", +		       mdname(mddev), mddev->level); +		goto out; +	} +	nc = mddev->layout & 255; +	fc = (mddev->layout >> 8) & 255; +	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || +	    (mddev->layout >> 16)) { +		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", +		       mdname(mddev), mddev->layout); +		goto out; +	} +	/* +	 * copy the already verified devices into our private RAID10 +	 * bookkeeping area. [whatever we allocate in run(), +	 * should be freed in stop()] +	 */ +	conf = kmalloc(sizeof(conf_t), GFP_KERNEL); +	mddev->private = conf; +	if (!conf) { +		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", +			mdname(mddev)); +		goto out; +	} +	memset(conf, 0, sizeof(*conf)); +	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, +				 GFP_KERNEL); +	if (!conf->mirrors) { +		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", +		       mdname(mddev)); +		goto out_free_conf; +	} +	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + +	conf->near_copies = nc; +	conf->far_copies = fc; +	conf->copies = nc*fc; +	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; +	conf->chunk_shift = ffz(~mddev->chunk_size) - 9; +	stride = mddev->size >> (conf->chunk_shift-1); +	sector_div(stride, fc); +	conf->stride = stride << conf->chunk_shift; + +	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, +						r10bio_pool_free, conf); +	if (!conf->r10bio_pool) { +		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} +	mddev->queue->unplug_fn = raid10_unplug; + +	mddev->queue->issue_flush_fn = raid10_issue_flush; + +	ITERATE_RDEV(mddev, rdev, tmp) { +		disk_idx = rdev->raid_disk; +		if (disk_idx >= mddev->raid_disks +		    || disk_idx < 0) +			continue; +		disk = conf->mirrors + disk_idx; + +		disk->rdev = rdev; + +		blk_queue_stack_limits(mddev->queue, +				       rdev->bdev->bd_disk->queue); +		/* as we don't honour merge_bvec_fn, we must never risk +		 * violating it, so limit ->max_sector to one PAGE, as +		 * a one page request is never in violation. 
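run() above unpacks the raid10 geometry from mddev->layout (near copies in the low byte, far copies in the next byte) and derives the chunk mask and shift in sectors. The following user-space decode shows the same fields for a hypothetical 64 KiB-chunk "n2" array; the layout value 0x102 and the chunk size are assumed examples, and the shift loop plays the role of ffz(~chunk_size) - 9.

#include <stdio.h>

int main(void)
{
    unsigned int layout     = 0x102;        /* assumed: 1 far copy, 2 near copies */
    unsigned int chunk_size = 64 * 1024;    /* bytes, must be a power of two */

    unsigned int nc = layout & 255;         /* near copies */
    unsigned int fc = (layout >> 8) & 255;  /* far copies */
    unsigned int copies = nc * fc;

    unsigned long chunk_sects = chunk_size >> 9;      /* 512-byte sectors per chunk */
    unsigned long chunk_mask  = chunk_sects - 1;
    int chunk_shift = 0;
    while ((1UL << (chunk_shift + 9)) < chunk_size)   /* log2 of the chunk in sectors */
        chunk_shift++;

    printf("near=%u far=%u copies=%u chunk=%lu sectors mask=0x%lx shift=%d\n",
           nc, fc, copies, chunk_sects, chunk_mask, chunk_shift);
    return 0;
}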
+		 */ +		if (rdev->bdev->bd_disk->queue->merge_bvec_fn && +		    mddev->queue->max_sectors > (PAGE_SIZE>>9)) +			mddev->queue->max_sectors = (PAGE_SIZE>>9); + +		disk->head_position = 0; +		if (!rdev->faulty && rdev->in_sync) +			conf->working_disks++; +	} +	conf->raid_disks = mddev->raid_disks; +	conf->mddev = mddev; +	spin_lock_init(&conf->device_lock); +	INIT_LIST_HEAD(&conf->retry_list); + +	spin_lock_init(&conf->resync_lock); +	init_waitqueue_head(&conf->wait_idle); +	init_waitqueue_head(&conf->wait_resume); + +	if (!conf->working_disks) { +		printk(KERN_ERR "raid10: no operational mirrors for %s\n", +			mdname(mddev)); +		goto out_free_conf; +	} + +	mddev->degraded = 0; +	for (i = 0; i < conf->raid_disks; i++) { + +		disk = conf->mirrors + i; + +		if (!disk->rdev) { +			disk->head_position = 0; +			mddev->degraded++; +		} +	} + + +	mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); +	if (!mddev->thread) { +		printk(KERN_ERR +		       "raid10: couldn't allocate thread for %s\n", +		       mdname(mddev)); +		goto out_free_conf; +	} + +	printk(KERN_INFO +		"raid10: raid set %s active with %d out of %d devices\n", +		mdname(mddev), mddev->raid_disks - mddev->degraded, +		mddev->raid_disks); +	/* +	 * Ok, everything is just fine now +	 */ +	size = conf->stride * conf->raid_disks; +	sector_div(size, conf->near_copies); +	mddev->array_size = size/2; +	mddev->resync_max_sectors = size; + +	/* Calculate max read-ahead size. +	 * We need to readahead at least twice a whole stripe.... +	 * maybe... +	 */ +	{ +		int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; +		stripe /= conf->near_copies; +		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) +			mddev->queue->backing_dev_info.ra_pages = 2* stripe; +	} + +	if (conf->near_copies < mddev->raid_disks) +		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); +	return 0; + +out_free_conf: +	if (conf->r10bio_pool) +		mempool_destroy(conf->r10bio_pool); +	if (conf->mirrors) +		kfree(conf->mirrors); +	kfree(conf); +	mddev->private = NULL; +out: +	return -EIO; +} + +static int stop(mddev_t *mddev) +{ +	conf_t *conf = mddev_to_conf(mddev); + +	md_unregister_thread(mddev->thread); +	mddev->thread = NULL; +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	if (conf->r10bio_pool) +		mempool_destroy(conf->r10bio_pool); +	if (conf->mirrors) +		kfree(conf->mirrors); +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + + +static mdk_personality_t raid10_personality = +{ +	.name		= "raid10", +	.owner		= THIS_MODULE, +	.make_request	= make_request, +	.run		= run, +	.stop		= stop, +	.status		= status, +	.error_handler	= error, +	.hot_add_disk	= raid10_add_disk, +	.hot_remove_disk= raid10_remove_disk, +	.spare_active	= raid10_spare_active, +	.sync_request	= sync_request, +}; + +static int __init raid_init(void) +{ +	return register_md_personality(RAID10, &raid10_personality); +} + +static void raid_exit(void) +{ +	unregister_md_personality(RAID10); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 00000000000..52c3a81c4aa --- /dev/null +++ b/drivers/md/raid5.c @@ -0,0 +1,1965 @@ +/* + * raid5.c : Multiple Devices driver for Linux + *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + *	   Copyright (C) 1999, 2000 Ingo Molnar + * + * RAID-5 management functions. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/raid/raid5.h> +#include <linux/highmem.h> +#include <linux/bitops.h> +#include <asm/atomic.h> + +/* + * Stripe cache + */ + +#define NR_STRIPES		256 +#define STRIPE_SIZE		PAGE_SIZE +#define STRIPE_SHIFT		(PAGE_SHIFT - 9) +#define STRIPE_SECTORS		(STRIPE_SIZE>>9) +#define	IO_THRESHOLD		1 +#define HASH_PAGES		1 +#define HASH_PAGES_ORDER	0 +#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define HASH_MASK		(NR_HASH - 1) + +#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap.  There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This macro is used to determine the 'next' bio in the list, given the sector + * of the current stripe+device + */ +#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) +/* + * The following can be used to debug the driver + */ +#define RAID5_DEBUG	0 +#define RAID5_PARANOIA	1 +#if RAID5_PARANOIA && defined(CONFIG_SMP) +# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) +#else +# define CHECK_DEVLOCK() +#endif + +#define PRINTK(x...) 
((void)(RAID5_DEBUG && printk(x))) +#if RAID5_DEBUG +#define inline +#define __inline__ +#endif + +static void print_raid5_conf (raid5_conf_t *conf); + +static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) +{ +	if (atomic_dec_and_test(&sh->count)) { +		if (!list_empty(&sh->lru)) +			BUG(); +		if (atomic_read(&conf->active_stripes)==0) +			BUG(); +		if (test_bit(STRIPE_HANDLE, &sh->state)) { +			if (test_bit(STRIPE_DELAYED, &sh->state)) +				list_add_tail(&sh->lru, &conf->delayed_list); +			else +				list_add_tail(&sh->lru, &conf->handle_list); +			md_wakeup_thread(conf->mddev->thread); +		} else { +			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { +				atomic_dec(&conf->preread_active_stripes); +				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) +					md_wakeup_thread(conf->mddev->thread); +			} +			list_add_tail(&sh->lru, &conf->inactive_list); +			atomic_dec(&conf->active_stripes); +			if (!conf->inactive_blocked || +			    atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) +				wake_up(&conf->wait_for_stripe); +		} +	} +} +static void release_stripe(struct stripe_head *sh) +{ +	raid5_conf_t *conf = sh->raid_conf; +	unsigned long flags; +	 +	spin_lock_irqsave(&conf->device_lock, flags); +	__release_stripe(conf, sh); +	spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static void remove_hash(struct stripe_head *sh) +{ +	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); + +	if (sh->hash_pprev) { +		if (sh->hash_next) +			sh->hash_next->hash_pprev = sh->hash_pprev; +		*sh->hash_pprev = sh->hash_next; +		sh->hash_pprev = NULL; +	} +} + +static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +{ +	struct stripe_head **shp = &stripe_hash(conf, sh->sector); + +	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); + +	CHECK_DEVLOCK(); +	if ((sh->hash_next = *shp) != NULL) +		(*shp)->hash_pprev = &sh->hash_next; +	*shp = sh; +	sh->hash_pprev = shp; +} + + +/* find an idle stripe, make sure it is unhashed, and return it. 
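The stripe hash above chains entries with a forward pointer plus a "pprev" pointer-to-pointer back link, so removal never needs to know which bucket or predecessor it hangs from. Here is a compact user-space version of the same insert/remove technique; struct and function names are illustrative. Storing the address of the pointer that points at us is what lets remove_hash() run in O(1) and also serve as the "is this stripe hashed at all" test.

#include <stdio.h>
#include <stddef.h>

struct node {
    unsigned long key;
    struct node *next;
    struct node **pprev;    /* address of whatever pointer points at us */
};

#define NR_BUCKETS 8
static struct node *table[NR_BUCKETS];

static void hash_insert(struct node *n)
{
    struct node **head = &table[n->key % NR_BUCKETS];
    if ((n->next = *head) != NULL)
        (*head)->pprev = &n->next;
    *head = n;
    n->pprev = head;
}

static void hash_remove(struct node *n)
{
    if (n->pprev) {
        if (n->next)
            n->next->pprev = n->pprev;
        *n->pprev = n->next;
        n->pprev = NULL;    /* mark as unhashed, like remove_hash() */
    }
}

int main(void)
{
    struct node a = { .key = 3 }, b = { .key = 11 };   /* same bucket: 3 and 11 mod 8 */
    hash_insert(&a);
    hash_insert(&b);
    hash_remove(&a);        /* no bucket scan, no predecessor lookup */
    printf("bucket 3 now holds key %lu\n", table[3]->key);
    return 0;
}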
*/ +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) +{ +	struct stripe_head *sh = NULL; +	struct list_head *first; + +	CHECK_DEVLOCK(); +	if (list_empty(&conf->inactive_list)) +		goto out; +	first = conf->inactive_list.next; +	sh = list_entry(first, struct stripe_head, lru); +	list_del_init(first); +	remove_hash(sh); +	atomic_inc(&conf->active_stripes); +out: +	return sh; +} + +static void shrink_buffers(struct stripe_head *sh, int num) +{ +	struct page *p; +	int i; + +	for (i=0; i<num ; i++) { +		p = sh->dev[i].page; +		if (!p) +			continue; +		sh->dev[i].page = NULL; +		page_cache_release(p); +	} +} + +static int grow_buffers(struct stripe_head *sh, int num) +{ +	int i; + +	for (i=0; i<num; i++) { +		struct page *page; + +		if (!(page = alloc_page(GFP_KERNEL))) { +			return 1; +		} +		sh->dev[i].page = page; +	} +	return 0; +} + +static void raid5_build_block (struct stripe_head *sh, int i); + +static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +{ +	raid5_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; + +	if (atomic_read(&sh->count) != 0) +		BUG(); +	if (test_bit(STRIPE_HANDLE, &sh->state)) +		BUG(); +	 +	CHECK_DEVLOCK(); +	PRINTK("init_stripe called, stripe %llu\n",  +		(unsigned long long)sh->sector); + +	remove_hash(sh); +	 +	sh->sector = sector; +	sh->pd_idx = pd_idx; +	sh->state = 0; + +	for (i=disks; i--; ) { +		struct r5dev *dev = &sh->dev[i]; + +		if (dev->toread || dev->towrite || dev->written || +		    test_bit(R5_LOCKED, &dev->flags)) { +			printk("sector=%llx i=%d %p %p %p %d\n", +			       (unsigned long long)sh->sector, i, dev->toread, +			       dev->towrite, dev->written, +			       test_bit(R5_LOCKED, &dev->flags)); +			BUG(); +		} +		dev->flags = 0; +		raid5_build_block(sh, i); +	} +	insert_hash(conf, sh); +} + +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) +{ +	struct stripe_head *sh; + +	CHECK_DEVLOCK(); +	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); +	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) +		if (sh->sector == sector) +			return sh; +	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); +	return NULL; +} + +static void unplug_slaves(mddev_t *mddev); +static void raid5_unplug_device(request_queue_t *q); + +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, +					     int pd_idx, int noblock)  +{ +	struct stripe_head *sh; + +	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); + +	spin_lock_irq(&conf->device_lock); + +	do { +		sh = __find_stripe(conf, sector); +		if (!sh) { +			if (!conf->inactive_blocked) +				sh = get_free_stripe(conf); +			if (noblock && sh == NULL) +				break; +			if (!sh) { +				conf->inactive_blocked = 1; +				wait_event_lock_irq(conf->wait_for_stripe, +						    !list_empty(&conf->inactive_list) && +						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) +						     || !conf->inactive_blocked), +						    conf->device_lock, +						    unplug_slaves(conf->mddev); +					); +				conf->inactive_blocked = 0; +			} else +				init_stripe(sh, sector, pd_idx); +		} else { +			if (atomic_read(&sh->count)) { +				if (!list_empty(&sh->lru)) +					BUG(); +			} else { +				if (!test_bit(STRIPE_HANDLE, &sh->state)) +					atomic_inc(&conf->active_stripes); +				if (list_empty(&sh->lru)) +					BUG(); +				list_del_init(&sh->lru); +			} +		} +	} while (sh == NULL); + +	if (sh) +		atomic_inc(&sh->count); + +	spin_unlock_irq(&conf->device_lock); +	return 
sh; +} + +static int grow_stripes(raid5_conf_t *conf, int num) +{ +	struct stripe_head *sh; +	kmem_cache_t *sc; +	int devs = conf->raid_disks; + +	sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); + +	sc = kmem_cache_create(conf->cache_name,  +			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), +			       0, 0, NULL, NULL); +	if (!sc) +		return 1; +	conf->slab_cache = sc; +	while (num--) { +		sh = kmem_cache_alloc(sc, GFP_KERNEL); +		if (!sh) +			return 1; +		memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); +		sh->raid_conf = conf; +		spin_lock_init(&sh->lock); + +		if (grow_buffers(sh, conf->raid_disks)) { +			shrink_buffers(sh, conf->raid_disks); +			kmem_cache_free(sc, sh); +			return 1; +		} +		/* we just created an active stripe so... */ +		atomic_set(&sh->count, 1); +		atomic_inc(&conf->active_stripes); +		INIT_LIST_HEAD(&sh->lru); +		release_stripe(sh); +	} +	return 0; +} + +static void shrink_stripes(raid5_conf_t *conf) +{ +	struct stripe_head *sh; + +	while (1) { +		spin_lock_irq(&conf->device_lock); +		sh = get_free_stripe(conf); +		spin_unlock_irq(&conf->device_lock); +		if (!sh) +			break; +		if (atomic_read(&sh->count)) +			BUG(); +		shrink_buffers(sh, conf->raid_disks); +		kmem_cache_free(conf->slab_cache, sh); +		atomic_dec(&conf->active_stripes); +	} +	kmem_cache_destroy(conf->slab_cache); +	conf->slab_cache = NULL; +} + +static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, +				   int error) +{ + 	struct stripe_head *sh = bi->bi_private; +	raid5_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; +	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + +	if (bi->bi_size) +		return 1; + +	for (i=0 ; i<disks; i++) +		if (bi == &sh->dev[i].req) +			break; + +	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",  +		(unsigned long long)sh->sector, i, atomic_read(&sh->count),  +		uptodate); +	if (i == disks) { +		BUG(); +		return 0; +	} + +	if (uptodate) { +#if 0 +		struct bio *bio; +		unsigned long flags; +		spin_lock_irqsave(&conf->device_lock, flags); +		/* we can return a buffer if we bypassed the cache or +		 * if the top buffer is not in highmem.  If there are +		 * multiple buffers, leave the extra work to +		 * handle_stripe +		 */ +		buffer = sh->bh_read[i]; +		if (buffer && +		    (!PageHighMem(buffer->b_page) +		     || buffer->b_page == bh->b_page ) +			) { +			sh->bh_read[i] = buffer->b_reqnext; +			buffer->b_reqnext = NULL; +		} else +			buffer = NULL; +		spin_unlock_irqrestore(&conf->device_lock, flags); +		if (sh->bh_page[i]==bh->b_page) +			set_buffer_uptodate(bh); +		if (buffer) { +			if (buffer->b_page != bh->b_page) +				memcpy(buffer->b_data, bh->b_data, bh->b_size); +			buffer->b_end_io(buffer, 1); +		} +#else +		set_bit(R5_UPTODATE, &sh->dev[i].flags); +#endif		 +	} else { +		md_error(conf->mddev, conf->disks[i].rdev); +		clear_bit(R5_UPTODATE, &sh->dev[i].flags); +	} +	rdev_dec_pending(conf->disks[i].rdev, conf->mddev); +#if 0 +	/* must restore b_page before unlocking buffer... 
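grow_stripes() earlier in this hunk sizes its slab as sizeof(struct stripe_head) + (devs-1)*sizeof(struct r5dev), because the structure ends in a one-element dev[] array that is over-allocated to the array width. The sketch below reproduces that allocation trick in user space with malloc; the structs are simplified stand-ins and the array width is an assumed value.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct r5dev { unsigned long sector; int flags; };

struct stripe_head {
    unsigned long long sector;
    int pd_idx;
    struct r5dev dev[1];    /* classic pre-C99 trailing-array idiom: really 'devs' entries */
};

int main(void)
{
    int devs = 6;           /* assumed number of member disks */
    size_t sz = sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev);
    struct stripe_head *sh = malloc(sz);
    if (!sh)
        return 1;
    memset(sh, 0, sz);

    for (int i = 0; i < devs; i++)       /* all 'devs' slots are backed by the allocation */
        sh->dev[i].sector = 100 + i;

    printf("dev[%d].sector = %lu\n", devs - 1, sh->dev[devs - 1].sector);
    free(sh);
    return 0;
}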
*/ +	if (sh->bh_page[i] != bh->b_page) { +		bh->b_page = sh->bh_page[i]; +		bh->b_data = page_address(bh->b_page); +		clear_buffer_uptodate(bh); +	} +#endif +	clear_bit(R5_LOCKED, &sh->dev[i].flags); +	set_bit(STRIPE_HANDLE, &sh->state); +	release_stripe(sh); +	return 0; +} + +static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, +				    int error) +{ + 	struct stripe_head *sh = bi->bi_private; +	raid5_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; +	unsigned long flags; +	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + +	if (bi->bi_size) +		return 1; + +	for (i=0 ; i<disks; i++) +		if (bi == &sh->dev[i].req) +			break; + +	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",  +		(unsigned long long)sh->sector, i, atomic_read(&sh->count), +		uptodate); +	if (i == disks) { +		BUG(); +		return 0; +	} + +	spin_lock_irqsave(&conf->device_lock, flags); +	if (!uptodate) +		md_error(conf->mddev, conf->disks[i].rdev); + +	rdev_dec_pending(conf->disks[i].rdev, conf->mddev); +	 +	clear_bit(R5_LOCKED, &sh->dev[i].flags); +	set_bit(STRIPE_HANDLE, &sh->state); +	__release_stripe(conf, sh); +	spin_unlock_irqrestore(&conf->device_lock, flags); +	return 0; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i); +	 +static void raid5_build_block (struct stripe_head *sh, int i) +{ +	struct r5dev *dev = &sh->dev[i]; + +	bio_init(&dev->req); +	dev->req.bi_io_vec = &dev->vec; +	dev->req.bi_vcnt++; +	dev->req.bi_max_vecs++; +	dev->vec.bv_page = dev->page; +	dev->vec.bv_len = STRIPE_SIZE; +	dev->vec.bv_offset = 0; + +	dev->req.bi_sector = sh->sector; +	dev->req.bi_private = sh; + +	dev->flags = 0; +	if (i != sh->pd_idx) +		dev->sector = compute_blocknr(sh, i); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	char b[BDEVNAME_SIZE]; +	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	PRINTK("raid5: error called\n"); + +	if (!rdev->faulty) { +		mddev->sb_dirty = 1; +		if (rdev->in_sync) { +			conf->working_disks--; +			mddev->degraded++; +			conf->failed_disks++; +			rdev->in_sync = 0; +			/* +			 * if recovery was running, make sure it aborts. +			 */ +			set_bit(MD_RECOVERY_ERR, &mddev->recovery); +		} +		rdev->faulty = 1; +		printk (KERN_ALERT +			"raid5: Disk failure on %s, disabling device." +			" Operation continuing on %d devices\n", +			bdevname(rdev->bdev,b), conf->working_disks); +	} +}	 + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. + */ +static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, +			unsigned int data_disks, unsigned int * dd_idx, +			unsigned int * pd_idx, raid5_conf_t *conf) +{ +	long stripe; +	unsigned long chunk_number; +	unsigned int chunk_offset; +	sector_t new_sector; +	int sectors_per_chunk = conf->chunk_size >> 9; + +	/* First compute the information on this sector */ + +	/* +	 * Compute the chunk number and the sector offset inside the chunk +	 */ +	chunk_offset = sector_div(r_sector, sectors_per_chunk); +	chunk_number = r_sector; +	BUG_ON(r_sector != chunk_number); + +	/* +	 * Compute the stripe number +	 */ +	stripe = chunk_number / data_disks; + +	/* +	 * Compute the data disk and parity disk indexes inside the stripe +	 */ +	*dd_idx = chunk_number % data_disks; + +	/* +	 * Select the parity disk based on the user selected algorithm. 
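The switch that follows places the parity disk for each stripe according to the configured algorithm. As an illustration, this user-space snippet tabulates the left-symmetric placement for a hypothetical 4-disk array using the same formulas that appear below; the array size is an assumed example.

#include <stdio.h>

int main(void)
{
    int raid_disks = 4;                  /* assumed example array */
    int data_disks = raid_disks - 1;

    printf("stripe  pd_idx  data disks in dd_idx order\n");
    for (long stripe = 0; stripe < 4; stripe++) {
        int pd_idx = data_disks - stripe % raid_disks;   /* parity rotates "left" */
        printf("%6ld  %6d ", stripe, pd_idx);
        for (int dd = 0; dd < data_disks; dd++) {
            /* left-symmetric: data disk 0 starts just after the parity disk */
            int disk = (pd_idx + 1 + dd) % raid_disks;
            printf("  %d", disk);
        }
        printf("\n");
    }
    return 0;
}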
+	 */ +	if (conf->level == 4) +		*pd_idx = data_disks; +	else switch (conf->algorithm) { +		case ALGORITHM_LEFT_ASYMMETRIC: +			*pd_idx = data_disks - stripe % raid_disks; +			if (*dd_idx >= *pd_idx) +				(*dd_idx)++; +			break; +		case ALGORITHM_RIGHT_ASYMMETRIC: +			*pd_idx = stripe % raid_disks; +			if (*dd_idx >= *pd_idx) +				(*dd_idx)++; +			break; +		case ALGORITHM_LEFT_SYMMETRIC: +			*pd_idx = data_disks - stripe % raid_disks; +			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; +			break; +		case ALGORITHM_RIGHT_SYMMETRIC: +			*pd_idx = stripe % raid_disks; +			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; +			break; +		default: +			printk("raid5: unsupported algorithm %d\n", +				conf->algorithm); +	} + +	/* +	 * Finally, compute the new sector number +	 */ +	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; +	return new_sector; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i) +{ +	raid5_conf_t *conf = sh->raid_conf; +	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; +	sector_t new_sector = sh->sector, check; +	int sectors_per_chunk = conf->chunk_size >> 9; +	sector_t stripe; +	int chunk_offset; +	int chunk_number, dummy1, dummy2, dd_idx = i; +	sector_t r_sector; + +	chunk_offset = sector_div(new_sector, sectors_per_chunk); +	stripe = new_sector; +	BUG_ON(new_sector != stripe); + +	 +	switch (conf->algorithm) { +		case ALGORITHM_LEFT_ASYMMETRIC: +		case ALGORITHM_RIGHT_ASYMMETRIC: +			if (i > sh->pd_idx) +				i--; +			break; +		case ALGORITHM_LEFT_SYMMETRIC: +		case ALGORITHM_RIGHT_SYMMETRIC: +			if (i < sh->pd_idx) +				i += raid_disks; +			i -= (sh->pd_idx + 1); +			break; +		default: +			printk("raid5: unsupported algorithm %d\n", +				conf->algorithm); +	} + +	chunk_number = stripe * data_disks + i; +	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; + +	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); +	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { +		printk("compute_blocknr: map not correct\n"); +		return 0; +	} +	return r_sector; +} + + + +/* + * Copy data between a page in the stripe cache, and a bio. + * There are no alignment or size guarantees between the page or the + * bio except that there is some overlap. + * All iovecs in the bio must be considered. 
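copy_data(), defined next, aligns an arbitrary bio against one STRIPE_SIZE cache page by computing a signed byte offset and clipping each segment to the page. This user-space sketch works through that offset/clip arithmetic for a single segment; the sector numbers and segment length are made up.

#include <stdio.h>

#define STRIPE_SIZE 4096

int main(void)
{
    unsigned long long stripe_sector = 1000;   /* first sector covered by the cache page */
    unsigned long long bio_sector    = 996;    /* bio starts 4 sectors earlier */
    int seg_len = 6 * 512;                     /* one 3 KiB segment */

    /* signed distance, in bytes, from the page start to the bio start */
    long page_offset = bio_sector >= stripe_sector
        ? (long)(bio_sector - stripe_sector) * 512
        : -(long)(stripe_sector - bio_sector) * 512;

    int b_offset = 0, len = seg_len;
    if (page_offset < 0) {              /* skip the part of the segment before the page */
        b_offset = -page_offset;
        page_offset += b_offset;
        len -= b_offset;
    }
    int clen = (len > 0 && page_offset + len > STRIPE_SIZE)
        ? STRIPE_SIZE - (int)page_offset
        : len;                          /* clip to the end of the page */

    printf("copy %d bytes: segment offset %d -> page offset %ld\n",
           clen, b_offset, page_offset);
    return 0;
}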
+ */ +static void copy_data(int frombio, struct bio *bio, +		     struct page *page, +		     sector_t sector) +{ +	char *pa = page_address(page); +	struct bio_vec *bvl; +	int i; +	int page_offset; + +	if (bio->bi_sector >= sector) +		page_offset = (signed)(bio->bi_sector - sector) * 512; +	else +		page_offset = (signed)(sector - bio->bi_sector) * -512; +	bio_for_each_segment(bvl, bio, i) { +		int len = bio_iovec_idx(bio,i)->bv_len; +		int clen; +		int b_offset = 0; + +		if (page_offset < 0) { +			b_offset = -page_offset; +			page_offset += b_offset; +			len -= b_offset; +		} + +		if (len > 0 && page_offset + len > STRIPE_SIZE) +			clen = STRIPE_SIZE - page_offset; +		else clen = len; +			 +		if (clen > 0) { +			char *ba = __bio_kmap_atomic(bio, i, KM_USER0); +			if (frombio) +				memcpy(pa+page_offset, ba+b_offset, clen); +			else +				memcpy(ba+b_offset, pa+page_offset, clen); +			__bio_kunmap_atomic(ba, KM_USER0); +		} +		if (clen < len) /* hit end of page */ +			break; +		page_offset +=  len; +	} +} + +#define check_xor() 	do { 						\ +			   if (count == MAX_XOR_BLOCKS) {		\ +				xor_block(count, STRIPE_SIZE, ptr);	\ +				count = 1;				\ +			   }						\ +			} while(0) + + +static void compute_block(struct stripe_head *sh, int dd_idx) +{ +	raid5_conf_t *conf = sh->raid_conf; +	int i, count, disks = conf->raid_disks; +	void *ptr[MAX_XOR_BLOCKS], *p; + +	PRINTK("compute_block, stripe %llu, idx %d\n",  +		(unsigned long long)sh->sector, dd_idx); + +	ptr[0] = page_address(sh->dev[dd_idx].page); +	memset(ptr[0], 0, STRIPE_SIZE); +	count = 1; +	for (i = disks ; i--; ) { +		if (i == dd_idx) +			continue; +		p = page_address(sh->dev[i].page); +		if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) +			ptr[count++] = p; +		else +			printk("compute_block() %d, stripe %llu, %d" +				" not present\n", dd_idx, +				(unsigned long long)sh->sector, i); + +		check_xor(); +	} +	if (count != 1) +		xor_block(count, STRIPE_SIZE, ptr); +	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); +} + +static void compute_parity(struct stripe_head *sh, int method) +{ +	raid5_conf_t *conf = sh->raid_conf; +	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; +	void *ptr[MAX_XOR_BLOCKS]; +	struct bio *chosen; + +	PRINTK("compute_parity, stripe %llu, method %d\n", +		(unsigned long long)sh->sector, method); + +	count = 1; +	ptr[0] = page_address(sh->dev[pd_idx].page); +	switch(method) { +	case READ_MODIFY_WRITE: +		if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)) +			BUG(); +		for (i=disks ; i-- ;) { +			if (i==pd_idx) +				continue; +			if (sh->dev[i].towrite && +			    test_bit(R5_UPTODATE, &sh->dev[i].flags)) { +				ptr[count++] = page_address(sh->dev[i].page); +				chosen = sh->dev[i].towrite; +				sh->dev[i].towrite = NULL; + +				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +					wake_up(&conf->wait_for_overlap); + +				if (sh->dev[i].written) BUG(); +				sh->dev[i].written = chosen; +				check_xor(); +			} +		} +		break; +	case RECONSTRUCT_WRITE: +		memset(ptr[0], 0, STRIPE_SIZE); +		for (i= disks; i-- ;) +			if (i!=pd_idx && sh->dev[i].towrite) { +				chosen = sh->dev[i].towrite; +				sh->dev[i].towrite = NULL; + +				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +					wake_up(&conf->wait_for_overlap); + +				if (sh->dev[i].written) BUG(); +				sh->dev[i].written = chosen; +			} +		break; +	case CHECK_PARITY: +		break; +	} +	if (count>1) { +		xor_block(count, STRIPE_SIZE, ptr); +		count = 1; +	} +	 +	for (i = disks; i--;) +		if (sh->dev[i].written) { +			sector_t sector = sh->dev[i].sector; +	
		struct bio *wbi = sh->dev[i].written; +			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +				copy_data(1, wbi, sh->dev[i].page, sector); +				wbi = r5_next_bio(wbi, sector); +			} + +			set_bit(R5_LOCKED, &sh->dev[i].flags); +			set_bit(R5_UPTODATE, &sh->dev[i].flags); +		} + +	switch(method) { +	case RECONSTRUCT_WRITE: +	case CHECK_PARITY: +		for (i=disks; i--;) +			if (i != pd_idx) { +				ptr[count++] = page_address(sh->dev[i].page); +				check_xor(); +			} +		break; +	case READ_MODIFY_WRITE: +		for (i = disks; i--;) +			if (sh->dev[i].written) { +				ptr[count++] = page_address(sh->dev[i].page); +				check_xor(); +			} +	} +	if (count != 1) +		xor_block(count, STRIPE_SIZE, ptr); +	 +	if (method != CHECK_PARITY) { +		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); +		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags); +	} else +		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); +} + +/* + * Each stripe/dev can have one or more bion attached. + * toread/towrite point to the first in a chain.  + * The bi_next chain must be in order. + */ +static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +{ +	struct bio **bip; +	raid5_conf_t *conf = sh->raid_conf; + +	PRINTK("adding bh b#%llu to stripe s#%llu\n", +		(unsigned long long)bi->bi_sector, +		(unsigned long long)sh->sector); + + +	spin_lock(&sh->lock); +	spin_lock_irq(&conf->device_lock); +	if (forwrite) +		bip = &sh->dev[dd_idx].towrite; +	else +		bip = &sh->dev[dd_idx].toread; +	while (*bip && (*bip)->bi_sector < bi->bi_sector) { +		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) +			goto overlap; +		bip = & (*bip)->bi_next; +	} +	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) +		goto overlap; + +	if (*bip && bi->bi_next && (*bip) != bi->bi_next) +		BUG(); +	if (*bip) +		bi->bi_next = *bip; +	*bip = bi; +	bi->bi_phys_segments ++; +	spin_unlock_irq(&conf->device_lock); +	spin_unlock(&sh->lock); + +	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", +		(unsigned long long)bi->bi_sector, +		(unsigned long long)sh->sector, dd_idx); + +	if (forwrite) { +		/* check if page is covered */ +		sector_t sector = sh->dev[dd_idx].sector; +		for (bi=sh->dev[dd_idx].towrite; +		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && +			     bi && bi->bi_sector <= sector; +		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { +			if (bi->bi_sector + (bi->bi_size>>9) >= sector) +				sector = bi->bi_sector + (bi->bi_size>>9); +		} +		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) +			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); +	} +	return 1; + + overlap: +	set_bit(R5_Overlap, &sh->dev[dd_idx].flags); +	spin_unlock_irq(&conf->device_lock); +	spin_unlock(&sh->lock); +	return 0; +} + + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + *    return some read request which now have data + *    return some write requests which are safely on disc + *    schedule a read on some buffers + *    schedule a write of some buffers + *    return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. 
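add_stripe_bio() above keeps each toread/towrite chain sorted by starting sector and refuses any bio that overlaps one already queued. The following user-space model applies the same sorted-insert-with-overlap-check to plain [start, start+len) ranges; the types and the sample requests are illustrative only.

#include <stdio.h>
#include <stddef.h>

struct req {
    unsigned long start, len;    /* sectors */
    struct req *next;
};

/* insert 'n' into the chain at *head, sorted by start, rejecting overlaps */
static int add_req(struct req **head, struct req *n)
{
    struct req **bip = head;
    while (*bip && (*bip)->start < n->start) {
        if ((*bip)->start + (*bip)->len > n->start)
            return 0;            /* overlaps a request that starts earlier */
        bip = &(*bip)->next;
    }
    if (*bip && (*bip)->start < n->start + n->len)
        return 0;                /* overlaps the request that would follow */
    n->next = *bip;
    *bip = n;
    return 1;
}

int main(void)
{
    struct req *head = NULL;
    struct req a = { 0, 8 }, b = { 16, 8 }, c = { 4, 8 };
    printf("add [0,8):   %s\n", add_req(&head, &a) ? "ok" : "overlap");
    printf("add [16,24): %s\n", add_req(&head, &b) ? "ok" : "overlap");
    printf("add [4,12):  %s\n", add_req(&head, &c) ? "ok" : "overlap");
    return 0;
}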
+ * + */ +  +static void handle_stripe(struct stripe_head *sh) +{ +	raid5_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks; +	struct bio *return_bi= NULL; +	struct bio *bi; +	int i; +	int syncing; +	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; +	int non_overwrite = 0; +	int failed_num=0; +	struct r5dev *dev; + +	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", +		(unsigned long long)sh->sector, atomic_read(&sh->count), +		sh->pd_idx); + +	spin_lock(&sh->lock); +	clear_bit(STRIPE_HANDLE, &sh->state); +	clear_bit(STRIPE_DELAYED, &sh->state); + +	syncing = test_bit(STRIPE_SYNCING, &sh->state); +	/* Now to look around and see what can be done */ + +	for (i=disks; i--; ) { +		mdk_rdev_t *rdev; +		dev = &sh->dev[i]; +		clear_bit(R5_Insync, &dev->flags); +		clear_bit(R5_Syncio, &dev->flags); + +		PRINTK("check %d: state 0x%lx read %p write %p written %p\n", +			i, dev->flags, dev->toread, dev->towrite, dev->written); +		/* maybe we can reply to a read */ +		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { +			struct bio *rbi, *rbi2; +			PRINTK("Return read for disc %d\n", i); +			spin_lock_irq(&conf->device_lock); +			rbi = dev->toread; +			dev->toread = NULL; +			if (test_and_clear_bit(R5_Overlap, &dev->flags)) +				wake_up(&conf->wait_for_overlap); +			spin_unlock_irq(&conf->device_lock); +			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { +				copy_data(0, rbi, dev->page, dev->sector); +				rbi2 = r5_next_bio(rbi, dev->sector); +				spin_lock_irq(&conf->device_lock); +				if (--rbi->bi_phys_segments == 0) { +					rbi->bi_next = return_bi; +					return_bi = rbi; +				} +				spin_unlock_irq(&conf->device_lock); +				rbi = rbi2; +			} +		} + +		/* now count some things */ +		if (test_bit(R5_LOCKED, &dev->flags)) locked++; +		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + +		 +		if (dev->toread) to_read++; +		if (dev->towrite) { +			to_write++; +			if (!test_bit(R5_OVERWRITE, &dev->flags)) +				non_overwrite++; +		} +		if (dev->written) written++; +		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ +		if (!rdev || !rdev->in_sync) { +			failed++; +			failed_num = i; +		} else +			set_bit(R5_Insync, &dev->flags); +	} +	PRINTK("locked=%d uptodate=%d to_read=%d" +		" to_write=%d failed=%d failed_num=%d\n", +		locked, uptodate, to_read, to_write, failed, failed_num); +	/* check if the array has lost two devices and, if so, some requests might +	 * need to be failed +	 */ +	if (failed > 1 && to_read+to_write+written) { +		spin_lock_irq(&conf->device_lock); +		for (i=disks; i--; ) { +			/* fail all writes first */ +			bi = sh->dev[i].towrite; +			sh->dev[i].towrite = NULL; +			if (bi) to_write--; + +			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +				wake_up(&conf->wait_for_overlap); + +			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ +				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); +				clear_bit(BIO_UPTODATE, &bi->bi_flags); +				if (--bi->bi_phys_segments == 0) { +					md_write_end(conf->mddev); +					bi->bi_next = return_bi; +					return_bi = bi; +				} +				bi = nextbi; +			} +			/* and fail all 'written' */ +			bi = sh->dev[i].written; +			sh->dev[i].written = NULL; +			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { +				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); +				clear_bit(BIO_UPTODATE, &bi->bi_flags); +				if (--bi->bi_phys_segments == 0) { +					md_write_end(conf->mddev); +					bi->bi_next = return_bi; +					return_bi = bi; +				} +	
			bi = bi2; +			} + +			/* fail any reads if this device is non-operational */ +			if (!test_bit(R5_Insync, &sh->dev[i].flags)) { +				bi = sh->dev[i].toread; +				sh->dev[i].toread = NULL; +				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +					wake_up(&conf->wait_for_overlap); +				if (bi) to_read--; +				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ +					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); +					clear_bit(BIO_UPTODATE, &bi->bi_flags); +					if (--bi->bi_phys_segments == 0) { +						bi->bi_next = return_bi; +						return_bi = bi; +					} +					bi = nextbi; +				} +			} +		} +		spin_unlock_irq(&conf->device_lock); +	} +	if (failed > 1 && syncing) { +		md_done_sync(conf->mddev, STRIPE_SECTORS,0); +		clear_bit(STRIPE_SYNCING, &sh->state); +		syncing = 0; +	} + +	/* might be able to return some write requests if the parity block +	 * is safe, or on a failed drive +	 */ +	dev = &sh->dev[sh->pd_idx]; +	if ( written && +	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && +		test_bit(R5_UPTODATE, &dev->flags)) +	       || (failed == 1 && failed_num == sh->pd_idx)) +	    ) { +	    /* any written block on an uptodate or failed drive can be returned. +	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but  +	     * never LOCKED, so we don't need to test 'failed' directly. +	     */ +	    for (i=disks; i--; ) +		if (sh->dev[i].written) { +		    dev = &sh->dev[i]; +		    if (!test_bit(R5_LOCKED, &dev->flags) && +			 test_bit(R5_UPTODATE, &dev->flags) ) { +			/* We can return any write requests */ +			    struct bio *wbi, *wbi2; +			    PRINTK("Return write for disc %d\n", i); +			    spin_lock_irq(&conf->device_lock); +			    wbi = dev->written; +			    dev->written = NULL; +			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { +				    wbi2 = r5_next_bio(wbi, dev->sector); +				    if (--wbi->bi_phys_segments == 0) { +					    md_write_end(conf->mddev); +					    wbi->bi_next = return_bi; +					    return_bi = wbi; +				    } +				    wbi = wbi2; +			    } +			    spin_unlock_irq(&conf->device_lock); +		    } +		} +	} + +	/* Now we might consider reading some blocks, either to check/generate +	 * parity, or to satisfy requests +	 * or to load a block that is being partially written. +	 */ +	if (to_read || non_overwrite || (syncing && (uptodate < disks))) { +		for (i=disks; i--;) { +			dev = &sh->dev[i]; +			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && +			    (dev->toread || +			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || +			     syncing || +			     (failed && (sh->dev[failed_num].toread || +					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) +				    ) +				) { +				/* we would like to get this block, possibly +				 * by computing it, but we might not be able to +				 */ +				if (uptodate == disks-1) { +					PRINTK("Computing block %d\n", i); +					compute_block(sh, i); +					uptodate++; +				} else if (test_bit(R5_Insync, &dev->flags)) { +					set_bit(R5_LOCKED, &dev->flags); +					set_bit(R5_Wantread, &dev->flags); +#if 0 +					/* if I am just reading this block and we don't have +					   a failed drive, or any pending writes then sidestep the cache */ +					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && +					    ! 
syncing && !failed && !to_write) { +						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page; +						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data; +					} +#endif +					locked++; +					PRINTK("Reading block %d (sync=%d)\n",  +						i, syncing); +					if (syncing) +						md_sync_acct(conf->disks[i].rdev->bdev, +							     STRIPE_SECTORS); +				} +			} +		} +		set_bit(STRIPE_HANDLE, &sh->state); +	} + +	/* now to consider writing and what else, if anything should be read */ +	if (to_write) { +		int rmw=0, rcw=0; +		for (i=disks ; i--;) { +			/* would I have to read this buffer for read_modify_write */ +			dev = &sh->dev[i]; +			if ((dev->towrite || i == sh->pd_idx) && +			    (!test_bit(R5_LOCKED, &dev->flags)  +#if 0 +|| sh->bh_page[i]!=bh->b_page +#endif +				    ) && +			    !test_bit(R5_UPTODATE, &dev->flags)) { +				if (test_bit(R5_Insync, &dev->flags) +/*				    && !(!mddev->insync && i == sh->pd_idx) */ +					) +					rmw++; +				else rmw += 2*disks;  /* cannot read it */ +			} +			/* Would I have to read this buffer for reconstruct_write */ +			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && +			    (!test_bit(R5_LOCKED, &dev->flags)  +#if 0 +|| sh->bh_page[i] != bh->b_page +#endif +				    ) && +			    !test_bit(R5_UPTODATE, &dev->flags)) { +				if (test_bit(R5_Insync, &dev->flags)) rcw++; +				else rcw += 2*disks; +			} +		} +		PRINTK("for sector %llu, rmw=%d rcw=%d\n",  +			(unsigned long long)sh->sector, rmw, rcw); +		set_bit(STRIPE_HANDLE, &sh->state); +		if (rmw < rcw && rmw > 0) +			/* prefer read-modify-write, but need to get some data */ +			for (i=disks; i--;) { +				dev = &sh->dev[i]; +				if ((dev->towrite || i == sh->pd_idx) && +				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && +				    test_bit(R5_Insync, &dev->flags)) { +					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +					{ +						PRINTK("Read_old block %d for r-m-w\n", i); +						set_bit(R5_LOCKED, &dev->flags); +						set_bit(R5_Wantread, &dev->flags); +						locked++; +					} else { +						set_bit(STRIPE_DELAYED, &sh->state); +						set_bit(STRIPE_HANDLE, &sh->state); +					} +				} +			} +		if (rcw <= rmw && rcw > 0) +			/* want reconstruct write, but need to get some data */ +			for (i=disks; i--;) { +				dev = &sh->dev[i]; +				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && +				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && +				    test_bit(R5_Insync, &dev->flags)) { +					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +					{ +						PRINTK("Read_old block %d for Reconstruct\n", i); +						set_bit(R5_LOCKED, &dev->flags); +						set_bit(R5_Wantread, &dev->flags); +						locked++; +					} else { +						set_bit(STRIPE_DELAYED, &sh->state); +						set_bit(STRIPE_HANDLE, &sh->state); +					} +				} +			} +		/* now if nothing is locked, and if we have enough data, we can start a write request */ +		if (locked == 0 && (rcw == 0 ||rmw == 0)) { +			PRINTK("Computing parity...\n"); +			compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); +			/* now every locked buffer is ready to be written */ +			for (i=disks; i--;) +				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { +					PRINTK("Writing block %d\n", i); +					locked++; +					set_bit(R5_Wantwrite, &sh->dev[i].flags); +					if (!test_bit(R5_Insync, &sh->dev[i].flags) +					    || (i==sh->pd_idx && failed == 0)) +						set_bit(STRIPE_INSYNC, &sh->state); +				} +			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { +				atomic_dec(&conf->preread_active_stripes); +				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) +					md_wakeup_thread(conf->mddev->thread); +			} +		} +	} + +	/* maybe we need to check and possibly fix the parity for this stripe +	 * Any reads will already have been scheduled, so we just see if enough data +	 * is available +	 */ +	if (syncing && locked == 0 && +	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { +		set_bit(STRIPE_HANDLE, &sh->state); +		if (failed == 0) { +			char *pagea; +			if (uptodate != disks) +				BUG(); +			compute_parity(sh, CHECK_PARITY); +			uptodate--; +			pagea = page_address(sh->dev[sh->pd_idx].page); +			if ((*(u32*)pagea) == 0 && +			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { +				/* parity is correct (on disc, not in buffer any more) */ +				set_bit(STRIPE_INSYNC, &sh->state); +			} +		} +		if (!test_bit(STRIPE_INSYNC, &sh->state)) { +			if (failed==0) +				failed_num = sh->pd_idx; +			/* should be able to compute the missing block and write it to spare */ +			if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) { +				if (uptodate+1 != disks) +					BUG(); +				compute_block(sh, failed_num); +				uptodate++; +			} +			if (uptodate != disks) +				BUG(); +			dev = &sh->dev[failed_num]; +			set_bit(R5_LOCKED, &dev->flags); +			set_bit(R5_Wantwrite, &dev->flags); +			locked++; +			set_bit(STRIPE_INSYNC, &sh->state); +			set_bit(R5_Syncio, &dev->flags); +		} +	} +	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { +		md_done_sync(conf->mddev, STRIPE_SECTORS,1); +		clear_bit(STRIPE_SYNCING, &sh->state); +	} +	 +	spin_unlock(&sh->lock); + +	while ((bi=return_bi)) { +		int bytes = bi->bi_size; + +		return_bi = bi->bi_next; +		bi->bi_next = NULL; +		bi->bi_size = 0; +		bi->bi_end_io(bi, bytes, 0); +	} +	for (i=disks; i-- ;) { +		int rw; +		struct bio *bi; +		mdk_rdev_t *rdev; +		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) +			rw = 1; +		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) +			rw = 0; +		else +			continue; +  +		bi = &sh->dev[i].req; +  +		bi->bi_rw = rw; +		if (rw) +			bi->bi_end_io = raid5_end_write_request; +		else +			bi->bi_end_io = raid5_end_read_request; +  +		rcu_read_lock(); +		rdev = conf->disks[i].rdev; +		if (rdev && rdev->faulty) +			rdev = NULL; +		if (rdev) +			atomic_inc(&rdev->nr_pending); +		rcu_read_unlock(); +  +		if (rdev) { +			if (test_bit(R5_Syncio, &sh->dev[i].flags)) +				md_sync_acct(rdev->bdev, STRIPE_SECTORS); + +			bi->bi_bdev = rdev->bdev; +			PRINTK("for %llu schedule op %ld on disc %d\n", +				(unsigned long long)sh->sector, bi->bi_rw, i); +			atomic_inc(&sh->count); +			bi->bi_sector = sh->sector + rdev->data_offset; +			bi->bi_flags = 1 << BIO_UPTODATE; +			bi->bi_vcnt = 1;	 +			bi->bi_max_vecs = 1; +			bi->bi_idx = 0; +			bi->bi_io_vec = &sh->dev[i].vec; +			bi->bi_io_vec[0].bv_len = STRIPE_SIZE; +			bi->bi_io_vec[0].bv_offset = 0; +			bi->bi_size = STRIPE_SIZE; +			bi->bi_next = NULL; +			generic_make_request(bi); +		} else { +			PRINTK("skip op %ld on disc %d for 
sector %llu\n", +				bi->bi_rw, i, (unsigned long long)sh->sector); +			clear_bit(R5_LOCKED, &sh->dev[i].flags); +			set_bit(STRIPE_HANDLE, &sh->state); +		} +	} +} + +static inline void raid5_activate_delayed(raid5_conf_t *conf) +{ +	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { +		while (!list_empty(&conf->delayed_list)) { +			struct list_head *l = conf->delayed_list.next; +			struct stripe_head *sh; +			sh = list_entry(l, struct stripe_head, lru); +			list_del_init(l); +			clear_bit(STRIPE_DELAYED, &sh->state); +			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +				atomic_inc(&conf->preread_active_stripes); +			list_add_tail(&sh->lru, &conf->handle_list); +		} +	} +} + +static void unplug_slaves(mddev_t *mddev) +{ +	raid5_conf_t *conf = mddev_to_conf(mddev); +	int i; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks; i++) { +		mdk_rdev_t *rdev = conf->disks[i].rdev; +		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { +			request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); + +			if (r_queue->unplug_fn) +				r_queue->unplug_fn(r_queue); + +			rdev_dec_pending(rdev, mddev); +			rcu_read_lock(); +		} +	} +	rcu_read_unlock(); +} + +static void raid5_unplug_device(request_queue_t *q) +{ +	mddev_t *mddev = q->queuedata; +	raid5_conf_t *conf = mddev_to_conf(mddev); +	unsigned long flags; + +	spin_lock_irqsave(&conf->device_lock, flags); + +	if (blk_remove_plug(q)) +		raid5_activate_delayed(conf); +	md_wakeup_thread(mddev->thread); + +	spin_unlock_irqrestore(&conf->device_lock, flags); + +	unplug_slaves(mddev); +} + +static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, +			     sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	raid5_conf_t *conf = mddev_to_conf(mddev); +	int i, ret = 0; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		mdk_rdev_t *rdev = conf->disks[i].rdev; +		if (rdev && !rdev->faulty) { +			struct block_device *bdev = rdev->bdev; +			request_queue_t *r_queue = bdev_get_queue(bdev); + +			if (!r_queue->issue_flush_fn) +				ret = -EOPNOTSUPP; +			else { +				atomic_inc(&rdev->nr_pending); +				rcu_read_unlock(); +				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, +							      error_sector); +				rdev_dec_pending(rdev, mddev); +				rcu_read_lock(); +			} +		} +	} +	rcu_read_unlock(); +	return ret; +} + +static inline void raid5_plug_device(raid5_conf_t *conf) +{ +	spin_lock_irq(&conf->device_lock); +	blk_plug_device(conf->mddev->queue); +	spin_unlock_irq(&conf->device_lock); +} + +static int make_request (request_queue_t *q, struct bio * bi) +{ +	mddev_t *mddev = q->queuedata; +	raid5_conf_t *conf = mddev_to_conf(mddev); +	const unsigned int raid_disks = conf->raid_disks; +	const unsigned int data_disks = raid_disks - 1; +	unsigned int dd_idx, pd_idx; +	sector_t new_sector; +	sector_t logical_sector, last_sector; +	struct stripe_head *sh; + +	if (bio_data_dir(bi)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); +	} + +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); +	last_sector = bi->bi_sector + (bi->bi_size>>9); +	bi->bi_next = NULL; +	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */ +	if ( bio_data_dir(bi) == WRITE ) +		md_write_start(mddev); +	for (;logical_sector < last_sector; 
logical_sector += STRIPE_SECTORS) { +		DEFINE_WAIT(w); +		 +		new_sector = raid5_compute_sector(logical_sector, +						  raid_disks, data_disks, &dd_idx, &pd_idx, conf); + +		PRINTK("raid5: make_request, sector %llu logical %llu\n", +			(unsigned long long)new_sector,  +			(unsigned long long)logical_sector); + +	retry: +		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); +		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); +		if (sh) { +			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +				/* Add failed due to overlap.  Flush everything +				 * and wait a while +				 */ +				raid5_unplug_device(mddev->queue); +				release_stripe(sh); +				schedule(); +				goto retry; +			} +			finish_wait(&conf->wait_for_overlap, &w); +			raid5_plug_device(conf); +			handle_stripe(sh); +			release_stripe(sh); + +		} else { +			/* cannot get stripe for read-ahead, just give-up */ +			clear_bit(BIO_UPTODATE, &bi->bi_flags); +			finish_wait(&conf->wait_for_overlap, &w); +			break; +		} +			 +	} +	spin_lock_irq(&conf->device_lock); +	if (--bi->bi_phys_segments == 0) { +		int bytes = bi->bi_size; + +		if ( bio_data_dir(bi) == WRITE ) +			md_write_end(mddev); +		bi->bi_size = 0; +		bi->bi_end_io(bi, bytes, 0); +	} +	spin_unlock_irq(&conf->device_lock); +	return 0; +} + +/* FIXME go_faster isn't used */ +static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) +{ +	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	struct stripe_head *sh; +	int sectors_per_chunk = conf->chunk_size >> 9; +	sector_t x; +	unsigned long stripe; +	int chunk_offset; +	int dd_idx, pd_idx; +	sector_t first_sector; +	int raid_disks = conf->raid_disks; +	int data_disks = raid_disks-1; + +	if (sector_nr >= mddev->size <<1) { +		/* just being told to finish up .. nothing much to do */ +		unplug_slaves(mddev); +		return 0; +	} +	/* if there is 1 or more failed drives and we are trying +	 * to resync, then assert that we are finished, because there is +	 * nothing we can do. +	 */ +	if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { +		int rv = (mddev->size << 1) - sector_nr; +		md_done_sync(mddev, rv, 1); +		return rv; +	} + +	x = sector_nr; +	chunk_offset = sector_div(x, sectors_per_chunk); +	stripe = x; +	BUG_ON(x != stripe); + +	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk +		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); +	sh = get_active_stripe(conf, sector_nr, pd_idx, 1); +	if (sh == NULL) { +		sh = get_active_stripe(conf, sector_nr, pd_idx, 0); +		/* make sure we don't swamp the stripe cache if someone else +		 * is trying to get access  +		 */ +		set_current_state(TASK_UNINTERRUPTIBLE); +		schedule_timeout(1); +	} +	spin_lock(&sh->lock);	 +	set_bit(STRIPE_SYNCING, &sh->state); +	clear_bit(STRIPE_INSYNC, &sh->state); +	spin_unlock(&sh->lock); + +	handle_stripe(sh); +	release_stripe(sh); + +	return STRIPE_SECTORS; +} + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. 
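+ * (Summary of the logic below, for clarity: stripes marked STRIPE_DELAYED are
+ * parked on conf->delayed_list and are put back onto the handle list by
+ * raid5_activate_delayed() -- on queue unplug, or when this thread has no other
+ * stripes to handle -- once the pre-read count is below IO_THRESHOLD.)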
+ */ +static void raid5d (mddev_t *mddev) +{ +	struct stripe_head *sh; +	raid5_conf_t *conf = mddev_to_conf(mddev); +	int handled; + +	PRINTK("+++ raid5d active\n"); + +	md_check_recovery(mddev); +	md_handle_safemode(mddev); + +	handled = 0; +	spin_lock_irq(&conf->device_lock); +	while (1) { +		struct list_head *first; + +		if (list_empty(&conf->handle_list) && +		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && +		    !blk_queue_plugged(mddev->queue) && +		    !list_empty(&conf->delayed_list)) +			raid5_activate_delayed(conf); + +		if (list_empty(&conf->handle_list)) +			break; + +		first = conf->handle_list.next; +		sh = list_entry(first, struct stripe_head, lru); + +		list_del_init(first); +		atomic_inc(&sh->count); +		if (atomic_read(&sh->count)!= 1) +			BUG(); +		spin_unlock_irq(&conf->device_lock); +		 +		handled++; +		handle_stripe(sh); +		release_stripe(sh); + +		spin_lock_irq(&conf->device_lock); +	} +	PRINTK("%d stripes handled\n", handled); + +	spin_unlock_irq(&conf->device_lock); + +	unplug_slaves(mddev); + +	PRINTK("--- raid5d inactive\n"); +} + +static int run (mddev_t *mddev) +{ +	raid5_conf_t *conf; +	int raid_disk, memory; +	mdk_rdev_t *rdev; +	struct disk_info *disk; +	struct list_head *tmp; + +	if (mddev->level != 5 && mddev->level != 4) { +		printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); +		return -EIO; +	} + +	mddev->private = kmalloc (sizeof (raid5_conf_t) +				  + mddev->raid_disks * sizeof(struct disk_info), +				  GFP_KERNEL); +	if ((conf = mddev->private) == NULL) +		goto abort; +	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); +	conf->mddev = mddev; + +	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) +		goto abort; +	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + +	spin_lock_init(&conf->device_lock); +	init_waitqueue_head(&conf->wait_for_stripe); +	init_waitqueue_head(&conf->wait_for_overlap); +	INIT_LIST_HEAD(&conf->handle_list); +	INIT_LIST_HEAD(&conf->delayed_list); +	INIT_LIST_HEAD(&conf->inactive_list); +	atomic_set(&conf->active_stripes, 0); +	atomic_set(&conf->preread_active_stripes, 0); + +	mddev->queue->unplug_fn = raid5_unplug_device; +	mddev->queue->issue_flush_fn = raid5_issue_flush; + +	PRINTK("raid5: run(%s) called.\n", mdname(mddev)); + +	ITERATE_RDEV(mddev,rdev,tmp) { +		raid_disk = rdev->raid_disk; +		if (raid_disk >= mddev->raid_disks +		    || raid_disk < 0) +			continue; +		disk = conf->disks + raid_disk; + +		disk->rdev = rdev; + +		if (rdev->in_sync) { +			char b[BDEVNAME_SIZE]; +			printk(KERN_INFO "raid5: device %s operational as raid" +				" disk %d\n", bdevname(rdev->bdev,b), +				raid_disk); +			conf->working_disks++; +		} +	} + +	conf->raid_disks = mddev->raid_disks; +	/* +	 * 0 for a fully functional array, 1 for a degraded array. 
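+	 * (A raid4/5 array tolerates at most one failed device: run() aborts
+	 * below when mddev->degraded exceeds 1.)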
+	 */ +	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; +	conf->mddev = mddev; +	conf->chunk_size = mddev->chunk_size; +	conf->level = mddev->level; +	conf->algorithm = mddev->layout; +	conf->max_nr_stripes = NR_STRIPES; + +	/* device size must be a multiple of chunk size */ +	mddev->size &= ~(mddev->chunk_size/1024 -1); + +	if (!conf->chunk_size || conf->chunk_size % 4) { +		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", +			conf->chunk_size, mdname(mddev)); +		goto abort; +	} +	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { +		printk(KERN_ERR  +			"raid5: unsupported parity algorithm %d for %s\n", +			conf->algorithm, mdname(mddev)); +		goto abort; +	} +	if (mddev->degraded > 1) { +		printk(KERN_ERR "raid5: not enough operational devices for %s" +			" (%d/%d failed)\n", +			mdname(mddev), conf->failed_disks, conf->raid_disks); +		goto abort; +	} + +	if (mddev->degraded == 1 && +	    mddev->recovery_cp != MaxSector) { +		printk(KERN_ERR  +			"raid5: cannot start dirty degraded array for %s\n", +			mdname(mddev)); +		goto abort; +	} + +	{ +		mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); +		if (!mddev->thread) { +			printk(KERN_ERR  +				"raid5: couldn't allocate thread for %s\n", +				mdname(mddev)); +			goto abort; +		} +	} +memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + +		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; +	if (grow_stripes(conf, conf->max_nr_stripes)) { +		printk(KERN_ERR  +			"raid5: couldn't allocate %dkB for buffers\n", memory); +		shrink_stripes(conf); +		md_unregister_thread(mddev->thread); +		goto abort; +	} else +		printk(KERN_INFO "raid5: allocated %dkB for %s\n", +			memory, mdname(mddev)); + +	if (mddev->degraded == 0) +		printk("raid5: raid level %d set %s active with %d out of %d" +			" devices, algorithm %d\n", conf->level, mdname(mddev),  +			mddev->raid_disks-mddev->degraded, mddev->raid_disks, +			conf->algorithm); +	else +		printk(KERN_ALERT "raid5: raid level %d set %s active with %d" +			" out of %d devices, algorithm %d\n", conf->level, +			mdname(mddev), mddev->raid_disks - mddev->degraded, +			mddev->raid_disks, conf->algorithm); + +	print_raid5_conf(conf); + +	/* read-ahead size must cover two whole stripes, which is +	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices +	 */ +	{ +		int stripe = (mddev->raid_disks-1) * mddev->chunk_size +			/ PAGE_CACHE_SIZE; +		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) +			mddev->queue->backing_dev_info.ra_pages = 2 * stripe; +	} + +	/* Ok, everything is just fine now */ +	mddev->array_size =  mddev->size * (mddev->raid_disks - 1); +	return 0; +abort: +	if (conf) { +		print_raid5_conf(conf); +		if (conf->stripe_hashtbl) +			free_pages((unsigned long) conf->stripe_hashtbl, +							HASH_PAGES_ORDER); +		kfree(conf); +	} +	mddev->private = NULL; +	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); +	return -EIO; +} + + + +static int stop (mddev_t *mddev) +{ +	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + +	md_unregister_thread(mddev->thread); +	mddev->thread = NULL; +	shrink_stripes(conf); +	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); +	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + +#if RAID5_DEBUG +static void print_sh (struct stripe_head *sh) +{ +	int i; + +	printk("sh %llu, pd_idx %d, state %ld.\n", +		(unsigned long long)sh->sector, sh->pd_idx, sh->state); +	
printk("sh %llu,  count %d.\n", +		(unsigned long long)sh->sector, atomic_read(&sh->count)); +	printk("sh %llu, ", (unsigned long long)sh->sector); +	for (i = 0; i < sh->raid_conf->raid_disks; i++) { +		printk("(cache%d: %p %ld) ",  +			i, sh->dev[i].page, sh->dev[i].flags); +	} +	printk("\n"); +} + +static void printall (raid5_conf_t *conf) +{ +	struct stripe_head *sh; +	int i; + +	spin_lock_irq(&conf->device_lock); +	for (i = 0; i < NR_HASH; i++) { +		sh = conf->stripe_hashtbl[i]; +		for (; sh; sh = sh->hash_next) { +			if (sh->raid_conf != conf) +				continue; +			print_sh(sh); +		} +	} +	spin_unlock_irq(&conf->device_lock); +} +#endif + +static void status (struct seq_file *seq, mddev_t *mddev) +{ +	raid5_conf_t *conf = (raid5_conf_t *) mddev->private; +	int i; + +	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); +	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); +	for (i = 0; i < conf->raid_disks; i++) +		seq_printf (seq, "%s", +			       conf->disks[i].rdev && +			       conf->disks[i].rdev->in_sync ? "U" : "_"); +	seq_printf (seq, "]"); +#if RAID5_DEBUG +#define D(x) \ +	seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) +	printall(conf); +#endif +} + +static void print_raid5_conf (raid5_conf_t *conf) +{ +	int i; +	struct disk_info *tmp; + +	printk("RAID5 conf printout:\n"); +	if (!conf) { +		printk("(conf==NULL)\n"); +		return; +	} +	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, +		 conf->working_disks, conf->failed_disks); + +	for (i = 0; i < conf->raid_disks; i++) { +		char b[BDEVNAME_SIZE]; +		tmp = conf->disks + i; +		if (tmp->rdev) +		printk(" disk %d, o:%d, dev:%s\n", +			i, !tmp->rdev->faulty, +			bdevname(tmp->rdev->bdev,b)); +	} +} + +static int raid5_spare_active(mddev_t *mddev) +{ +	int i; +	raid5_conf_t *conf = mddev->private; +	struct disk_info *tmp; + +	for (i = 0; i < conf->raid_disks; i++) { +		tmp = conf->disks + i; +		if (tmp->rdev +		    && !tmp->rdev->faulty +		    && !tmp->rdev->in_sync) { +			mddev->degraded--; +			conf->failed_disks--; +			conf->working_disks++; +			tmp->rdev->in_sync = 1; +		} +	} +	print_raid5_conf(conf); +	return 0; +} + +static int raid5_remove_disk(mddev_t *mddev, int number) +{ +	raid5_conf_t *conf = mddev->private; +	int err = 0; +	mdk_rdev_t *rdev; +	struct disk_info *p = conf->disks + number; + +	print_raid5_conf(conf); +	rdev = p->rdev; +	if (rdev) { +		if (rdev->in_sync || +		    atomic_read(&rdev->nr_pending)) { +			err = -EBUSY; +			goto abort; +		} +		p->rdev = NULL; +		synchronize_kernel(); +		if (atomic_read(&rdev->nr_pending)) { +			/* lost the race, try later */ +			err = -EBUSY; +			p->rdev = rdev; +		} +	} +abort: + +	print_raid5_conf(conf); +	return err; +} + +static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	raid5_conf_t *conf = mddev->private; +	int found = 0; +	int disk; +	struct disk_info *p; + +	if (mddev->degraded > 1) +		/* no point adding a device */ +		return 0; + +	/* +	 * find the disk ... +	 */ +	for (disk=0; disk < mddev->raid_disks; disk++) +		if ((p=conf->disks + disk)->rdev == NULL) { +			rdev->in_sync = 0; +			rdev->raid_disk = disk; +			found = 1; +			p->rdev = rdev; +			break; +		} +	print_raid5_conf(conf); +	return found; +} + +static int raid5_resize(mddev_t *mddev, sector_t sectors) +{ +	/* no resync is happening, and there is enough space +	 * on all devices, so we can resize. +	 * We need to make sure resync covers any new space. 
+	 * If the array is shrinking we should possibly wait until +	 * any io in the removed space completes, but it hardly seems +	 * worth it. +	 */ +	sectors &= ~((sector_t)mddev->chunk_size/512 - 1); +	mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; +	set_capacity(mddev->gendisk, mddev->array_size << 1); +	mddev->changed = 1; +	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) { +		mddev->recovery_cp = mddev->size << 1; +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	} +	mddev->size = sectors /2; +	return 0; +} + +static mdk_personality_t raid5_personality= +{ +	.name		= "raid5", +	.owner		= THIS_MODULE, +	.make_request	= make_request, +	.run		= run, +	.stop		= stop, +	.status		= status, +	.error_handler	= error, +	.hot_add_disk	= raid5_add_disk, +	.hot_remove_disk= raid5_remove_disk, +	.spare_active	= raid5_spare_active, +	.sync_request	= sync_request, +	.resize		= raid5_resize, +}; + +static int __init raid5_init (void) +{ +	return register_md_personality (RAID5, &raid5_personality); +} + +static void raid5_exit (void) +{ +	unregister_md_personality (RAID5); +} + +module_init(raid5_init); +module_exit(raid5_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-4"); /* RAID5 */ diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h new file mode 100644 index 00000000000..f80ee6350ed --- /dev/null +++ b/drivers/md/raid6.h @@ -0,0 +1,135 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2003 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef LINUX_RAID_RAID6_H +#define LINUX_RAID_RAID6_H + +#ifdef __KERNEL__ + +/* Set to 1 to use kernel-wide empty_zero_page */ +#define RAID6_USE_EMPTY_ZERO_PAGE 0 + +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mempool.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/raid/md.h> +#include <linux/raid/raid5.h> + +typedef raid5_conf_t raid6_conf_t; /* Same configuration */ + +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY	4 + +/* We need a pre-zeroed page... if we don't want to use the kernel-provided +   one define it here */ +#if RAID6_USE_EMPTY_ZERO_PAGE +# define raid6_empty_zero_page empty_zero_page +#else +extern const char raid6_empty_zero_page[PAGE_SIZE]; +#endif + +#else /* ! 
__KERNEL__ */ +/* Used for testing in user space */ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/types.h> + +/* Not standard, but glibc defines it */ +#define BITS_PER_LONG __WORDSIZE + +typedef uint8_t  u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +extern const char raid6_empty_zero_page[PAGE_SIZE]; + +#define __init +#define __exit +#define __attribute_const__ __attribute__((const)) + +#define preempt_enable() +#define preempt_disable() + +#endif /* __KERNEL__ */ + +/* Routine choices */ +struct raid6_calls { +	void (*gen_syndrome)(int, size_t, void **); +	int  (*valid)(void);	/* Returns 1 if this routine set is usable */ +	const char *name;	/* Name of this routine set */ +	int prefer;		/* Has special performance attribute */ +}; + +/* Selected algorithm */ +extern struct raid6_calls raid6_call; + +/* Algorithm list */ +extern const struct raid6_calls * const raid6_algos[]; +int raid6_select_algo(void); + +/* Return values from chk_syndrome */ +#define RAID6_OK	0 +#define RAID6_P_BAD	1 +#define RAID6_Q_BAD	2 +#define RAID6_PQ_BAD	3 + +/* Galois field tables */ +extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_gfexp[256]      __attribute__((aligned(256))); +extern const u8 raid6_gfinv[256]      __attribute__((aligned(256))); +extern const u8 raid6_gfexi[256]      __attribute__((aligned(256))); + +/* Recovery routines */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); + +/* Some definitions to allow code to be compiled for testing in userspace */ +#ifndef __KERNEL__ + +# define jiffies	raid6_jiffies() +# define printk 	printf +# define GFP_KERNEL	0 +# define __get_free_pages(x,y)	((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) +# define free_pages(x,y)	munmap((void *)(x), (y)*PAGE_SIZE) + +static inline void cpu_relax(void) +{ +	/* Nothing */ +} + +#undef  HZ +#define HZ 1000 +static inline uint32_t raid6_jiffies(void) +{ +	struct timeval tv; +	gettimeofday(&tv, NULL); +	return tv.tv_sec*1000 + tv.tv_usec/1000; +} + +#endif /* ! __KERNEL__ */ + +#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c new file mode 100644 index 00000000000..acf386fc4b4 --- /dev/null +++ b/drivers/md/raid6algos.c @@ -0,0 +1,153 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6algos.c + * + * Algorithm list and algorithm selection for RAID-6 + */ + +#include "raid6.h" +#ifndef __KERNEL__ +#include <sys/mman.h> +#endif + +struct raid6_calls raid6_call; + +/* Various routine sets */ +extern const struct raid6_calls raid6_intx1; +extern const struct raid6_calls raid6_intx2; +extern const struct raid6_calls raid6_intx4; +extern const struct raid6_calls raid6_intx8; +extern const struct raid6_calls raid6_intx16; +extern const struct raid6_calls raid6_intx32; +extern const struct raid6_calls raid6_mmxx1; +extern const struct raid6_calls raid6_mmxx2; +extern const struct raid6_calls raid6_sse1x1; +extern const struct raid6_calls raid6_sse1x2; +extern const struct raid6_calls raid6_sse2x1; +extern const struct raid6_calls raid6_sse2x2; +extern const struct raid6_calls raid6_sse2x4; +extern const struct raid6_calls raid6_altivec1; +extern const struct raid6_calls raid6_altivec2; +extern const struct raid6_calls raid6_altivec4; +extern const struct raid6_calls raid6_altivec8; + +const struct raid6_calls * const raid6_algos[] = { +	&raid6_intx1, +	&raid6_intx2, +	&raid6_intx4, +	&raid6_intx8, +#if defined(__ia64__) +	&raid6_intx16, +	&raid6_intx32, +#endif +#if defined(__i386__) +	&raid6_mmxx1, +	&raid6_mmxx2, +	&raid6_sse1x1, +	&raid6_sse1x2, +	&raid6_sse2x1, +	&raid6_sse2x2, +#endif +#if defined(__x86_64__) +	&raid6_sse2x1, +	&raid6_sse2x2, +	&raid6_sse2x4, +#endif +#ifdef CONFIG_ALTIVEC +	&raid6_altivec1, +	&raid6_altivec2, +	&raid6_altivec4, +	&raid6_altivec8, +#endif +	NULL +}; + +#ifdef __KERNEL__ +#define RAID6_TIME_JIFFIES_LG2	4 +#else +/* Need more time to be stable in userspace */ +#define RAID6_TIME_JIFFIES_LG2	9 +#endif + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ +	const struct raid6_calls * const * algo; +	const struct raid6_calls * best; +	char *syndromes; +	void *dptrs[(65536/PAGE_SIZE)+2]; +	int i, disks; +	unsigned long perf, bestperf; +	int bestprefer; +	unsigned long j0, j1; + +	disks = (65536/PAGE_SIZE)+2; +	for ( i = 0 ; i < disks-2 ; i++ ) { +		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; +	} + +	/* Normal code - use a 2-page allocation to avoid D$ conflict */ +	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + +	if ( !syndromes ) { +		printk("raid6: Yikes!  No memory available.\n"); +		return -ENOMEM; +	} + +	dptrs[disks-2] = syndromes; +	dptrs[disks-1] = syndromes + PAGE_SIZE; + +	bestperf = 0;  bestprefer = 0;  best = NULL; + +	for ( algo = raid6_algos ; *algo ; algo++ ) { +		if ( !(*algo)->valid || (*algo)->valid() ) { +			perf = 0; + +			preempt_disable(); +			j0 = jiffies; +			while ( (j1 = jiffies) == j0 ) +				cpu_relax(); +			while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) { +				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); +				perf++; +			} +			preempt_enable(); + +			if ( (*algo)->prefer > bestprefer || +			     ((*algo)->prefer == bestprefer && +			      perf > bestperf) ) { +				best = *algo; +				bestprefer = best->prefer; +				bestperf = perf; +			} +			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, +			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); +		} +	} + +	if ( best ) +		printk("raid6: using algorithm %s (%ld MB/s)\n", +		       best->name, +		       (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); +	else +		printk("raid6: Yikes!  
No algorithm found!\n"); + +	raid6_call = *best; + +	free_pages((unsigned long)syndromes, 1); + +	return best ? 0 : -EINVAL; +} diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc new file mode 100644 index 00000000000..1de8f030eee --- /dev/null +++ b/drivers/md/raid6altivec.uc @@ -0,0 +1,122 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6altivec$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.pl + * + * <benh> hpa: in process, + * you can just "steal" the vec unit with enable_kernel_altivec() (but + * bracked this with preempt_disable/enable or in a lock) + */ + +#include "raid6.h" + +#ifdef CONFIG_ALTIVEC + +#include <altivec.h> +#include <asm/system.h> +#include <asm/cputable.h> + +/* + * This is the C data type to use + */ + +typedef vector unsigned char unative_t; + +#define NBYTES(x) ((vector unsigned char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE	sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ +	return vec_add(v,v); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. 
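+ * Together with SHLBYTE() and the x1d constant below this gives a bytewise
+ * multiply-by-2 in GF(2^8); as an illustrative value, a byte of 0x9c maps to
+ * ((0x9c << 1) & 0xfe) ^ 0x1d = 0x25.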
+ */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ +	unative_t zv = NBYTES(0); + +	/* vec_cmpgt returns a vector bool char; thus the need for the cast */ +	return (unative_t)vec_cmpgt(zv, v); +} + + +/* This is noinline to make damned sure that gcc doesn't move any of the +   Altivec code around the enable/disable code */ +static void noinline +raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; + +	unative_t wd$$, wq$$, wp$$, w1$$, w2$$; +	unative_t x1d = NBYTES(0x1d); + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { +		wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; +			wp$$ = vec_xor(wp$$, wd$$); +			w2$$ = MASK(wq$$); +			w1$$ = SHLBYTE(wq$$); +			w2$$ = vec_and(w2$$, x1d); +			w1$$ = vec_xor(w1$$, w2$$); +			wq$$ = vec_xor(w1$$, wd$$); +		} +		*(unative_t *)&p[d+NSIZE*$$] = wp$$; +		*(unative_t *)&q[d+NSIZE*$$] = wq$$; +	} +} + +static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	preempt_disable(); +	enable_kernel_altivec(); + +	raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + +	preempt_enable(); +} + +int raid6_have_altivec(void); +#if $# == 1 +int raid6_have_altivec(void) +{ +	/* This assumes either all CPUs have Altivec or none does */ +	return cpu_has_feature(CPU_FTR_ALTIVEC); +} +#endif + +const struct raid6_calls raid6_altivec$# = { +	raid6_altivec$#_gen_syndrome, +	raid6_have_altivec, +	"altivecx$#", +	0 +}; + +#endif /* CONFIG_ALTIVEC */ diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc new file mode 100644 index 00000000000..ad004cee0e2 --- /dev/null +++ b/drivers/md/raid6int.uc @@ -0,0 +1,117 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6int$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.pl + */ + +#include "raid6.h" + +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better... */ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE  8 +# define NSHIFT 3 +# define NSTRING "64" +typedef u64 unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE  4 +# define NSHIFT 2 +# define NSTRING "32" +typedef u32 unative_t; +#endif + + + +/* + * IA-64 wants insane amounts of unrolling.  On other architectures that + * is just a waste of space. + */ +#if ($# <= 8) || defined(__ia64__) + + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. 
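+ * (For example, MASK() below builds the 0xff/0x00 per-byte mask without a
+ * branch: (v & 0x80) << 1 minus (v & 0x80) >> 7 yields 0xff in a byte whose
+ * high bit was set, the transient carry into the next byte lane cancelling
+ * out in the subtraction.)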
+ */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ +	unative_t vv; + +	vv = (v << 1) & NBYTES(0xfe); +	return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ +	unative_t vv; + +	vv = v & NBYTES(0x80); +	vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ +	return vv; +} + + +static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; + +	unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { +		wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; +			wp$$ ^= wd$$; +			w2$$ = MASK(wq$$); +			w1$$ = SHLBYTE(wq$$); +			w2$$ &= NBYTES(0x1d); +			w1$$ ^= w2$$; +			wq$$ = w1$$ ^ wd$$; +		} +		*(unative_t *)&p[d+NSIZE*$$] = wp$$; +		*(unative_t *)&q[d+NSIZE*$$] = wq$$; +	} +} + +const struct raid6_calls raid6_intx$# = { +	raid6_int$#_gen_syndrome, +	NULL,		/* always valid */ +	"int" NSTRING "x$#", +	0 +}; + +#endif diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c new file mode 100644 index 00000000000..7e30ab29691 --- /dev/null +++ b/drivers/md/raid6main.c @@ -0,0 +1,2136 @@ +/* + * raid6main.c : Multiple Devices driver for Linux + *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + *	   Copyright (C) 1999, 2000 Ingo Molnar + *	   Copyright (C) 2002, 2003 H. Peter Anvin + * + * RAID-6 management functions.  This code is derived from raid5.c. + * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1). + * + * Thanks to Penguin Computing for making the RAID-6 development possible + * by donating a test server! + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> +#include <asm/atomic.h> +#include "raid6.h" + +/* + * Stripe cache + */ + +#define NR_STRIPES		256 +#define STRIPE_SIZE		PAGE_SIZE +#define STRIPE_SHIFT		(PAGE_SHIFT - 9) +#define STRIPE_SECTORS		(STRIPE_SIZE>>9) +#define	IO_THRESHOLD		1 +#define HASH_PAGES		1 +#define HASH_PAGES_ORDER	0 +#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define HASH_MASK		(NR_HASH - 1) + +#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap.  There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. 
+ * This macro is used to determine the 'next' bio in the list, given the sector + * of the current stripe+device + */ +#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) +/* + * The following can be used to debug the driver + */ +#define RAID6_DEBUG	0	/* Extremely verbose printk */ +#define RAID6_PARANOIA	1	/* Check spinlocks */ +#define RAID6_DUMPSTATE 0	/* Include stripe cache state in /proc/mdstat */ +#if RAID6_PARANOIA && defined(CONFIG_SMP) +# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) +#else +# define CHECK_DEVLOCK() +#endif + +#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x))) +#if RAID6_DEBUG +#undef inline +#undef __inline__ +#define inline +#define __inline__ +#endif + +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +#endif + +static inline int raid6_next_disk(int disk, int raid_disks) +{ +	disk++; +	return (disk < raid_disks) ? disk : 0; +} + +static void print_raid6_conf (raid6_conf_t *conf); + +static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) +{ +	if (atomic_dec_and_test(&sh->count)) { +		if (!list_empty(&sh->lru)) +			BUG(); +		if (atomic_read(&conf->active_stripes)==0) +			BUG(); +		if (test_bit(STRIPE_HANDLE, &sh->state)) { +			if (test_bit(STRIPE_DELAYED, &sh->state)) +				list_add_tail(&sh->lru, &conf->delayed_list); +			else +				list_add_tail(&sh->lru, &conf->handle_list); +			md_wakeup_thread(conf->mddev->thread); +		} else { +			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { +				atomic_dec(&conf->preread_active_stripes); +				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) +					md_wakeup_thread(conf->mddev->thread); +			} +			list_add_tail(&sh->lru, &conf->inactive_list); +			atomic_dec(&conf->active_stripes); +			if (!conf->inactive_blocked || +			    atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) +				wake_up(&conf->wait_for_stripe); +		} +	} +} +static void release_stripe(struct stripe_head *sh) +{ +	raid6_conf_t *conf = sh->raid_conf; +	unsigned long flags; + +	spin_lock_irqsave(&conf->device_lock, flags); +	__release_stripe(conf, sh); +	spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static void remove_hash(struct stripe_head *sh) +{ +	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); + +	if (sh->hash_pprev) { +		if (sh->hash_next) +			sh->hash_next->hash_pprev = sh->hash_pprev; +		*sh->hash_pprev = sh->hash_next; +		sh->hash_pprev = NULL; +	} +} + +static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) +{ +	struct stripe_head **shp = &stripe_hash(conf, sh->sector); + +	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); + +	CHECK_DEVLOCK(); +	if ((sh->hash_next = *shp) != NULL) +		(*shp)->hash_pprev = &sh->hash_next; +	*shp = sh; +	sh->hash_pprev = shp; +} + + +/* find an idle stripe, make sure it is unhashed, and return it. 
*/ +static struct stripe_head *get_free_stripe(raid6_conf_t *conf) +{ +	struct stripe_head *sh = NULL; +	struct list_head *first; + +	CHECK_DEVLOCK(); +	if (list_empty(&conf->inactive_list)) +		goto out; +	first = conf->inactive_list.next; +	sh = list_entry(first, struct stripe_head, lru); +	list_del_init(first); +	remove_hash(sh); +	atomic_inc(&conf->active_stripes); +out: +	return sh; +} + +static void shrink_buffers(struct stripe_head *sh, int num) +{ +	struct page *p; +	int i; + +	for (i=0; i<num ; i++) { +		p = sh->dev[i].page; +		if (!p) +			continue; +		sh->dev[i].page = NULL; +		page_cache_release(p); +	} +} + +static int grow_buffers(struct stripe_head *sh, int num) +{ +	int i; + +	for (i=0; i<num; i++) { +		struct page *page; + +		if (!(page = alloc_page(GFP_KERNEL))) { +			return 1; +		} +		sh->dev[i].page = page; +	} +	return 0; +} + +static void raid6_build_block (struct stripe_head *sh, int i); + +static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; + +	if (atomic_read(&sh->count) != 0) +		BUG(); +	if (test_bit(STRIPE_HANDLE, &sh->state)) +		BUG(); + +	CHECK_DEVLOCK(); +	PRINTK("init_stripe called, stripe %llu\n", +		(unsigned long long)sh->sector); + +	remove_hash(sh); + +	sh->sector = sector; +	sh->pd_idx = pd_idx; +	sh->state = 0; + +	for (i=disks; i--; ) { +		struct r5dev *dev = &sh->dev[i]; + +		if (dev->toread || dev->towrite || dev->written || +		    test_bit(R5_LOCKED, &dev->flags)) { +			PRINTK("sector=%llx i=%d %p %p %p %d\n", +			       (unsigned long long)sh->sector, i, dev->toread, +			       dev->towrite, dev->written, +			       test_bit(R5_LOCKED, &dev->flags)); +			BUG(); +		} +		dev->flags = 0; +		raid6_build_block(sh, i); +	} +	insert_hash(conf, sh); +} + +static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) +{ +	struct stripe_head *sh; + +	CHECK_DEVLOCK(); +	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); +	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) +		if (sh->sector == sector) +			return sh; +	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); +	return NULL; +} + +static void unplug_slaves(mddev_t *mddev); + +static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector, +					     int pd_idx, int noblock) +{ +	struct stripe_head *sh; + +	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); + +	spin_lock_irq(&conf->device_lock); + +	do { +		sh = __find_stripe(conf, sector); +		if (!sh) { +			if (!conf->inactive_blocked) +				sh = get_free_stripe(conf); +			if (noblock && sh == NULL) +				break; +			if (!sh) { +				conf->inactive_blocked = 1; +				wait_event_lock_irq(conf->wait_for_stripe, +						    !list_empty(&conf->inactive_list) && +						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) +						     || !conf->inactive_blocked), +						    conf->device_lock, +						    unplug_slaves(conf->mddev); +					); +				conf->inactive_blocked = 0; +			} else +				init_stripe(sh, sector, pd_idx); +		} else { +			if (atomic_read(&sh->count)) { +				if (!list_empty(&sh->lru)) +					BUG(); +			} else { +				if (!test_bit(STRIPE_HANDLE, &sh->state)) +					atomic_inc(&conf->active_stripes); +				if (list_empty(&sh->lru)) +					BUG(); +				list_del_init(&sh->lru); +			} +		} +	} while (sh == NULL); + +	if (sh) +		atomic_inc(&sh->count); + +	spin_unlock_irq(&conf->device_lock); +	return sh; +} + +static int grow_stripes(raid6_conf_t *conf, int 
num) +{ +	struct stripe_head *sh; +	kmem_cache_t *sc; +	int devs = conf->raid_disks; + +	sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); + +	sc = kmem_cache_create(conf->cache_name, +			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), +			       0, 0, NULL, NULL); +	if (!sc) +		return 1; +	conf->slab_cache = sc; +	while (num--) { +		sh = kmem_cache_alloc(sc, GFP_KERNEL); +		if (!sh) +			return 1; +		memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); +		sh->raid_conf = conf; +		spin_lock_init(&sh->lock); + +		if (grow_buffers(sh, conf->raid_disks)) { +			shrink_buffers(sh, conf->raid_disks); +			kmem_cache_free(sc, sh); +			return 1; +		} +		/* we just created an active stripe so... */ +		atomic_set(&sh->count, 1); +		atomic_inc(&conf->active_stripes); +		INIT_LIST_HEAD(&sh->lru); +		release_stripe(sh); +	} +	return 0; +} + +static void shrink_stripes(raid6_conf_t *conf) +{ +	struct stripe_head *sh; + +	while (1) { +		spin_lock_irq(&conf->device_lock); +		sh = get_free_stripe(conf); +		spin_unlock_irq(&conf->device_lock); +		if (!sh) +			break; +		if (atomic_read(&sh->count)) +			BUG(); +		shrink_buffers(sh, conf->raid_disks); +		kmem_cache_free(conf->slab_cache, sh); +		atomic_dec(&conf->active_stripes); +	} +	kmem_cache_destroy(conf->slab_cache); +	conf->slab_cache = NULL; +} + +static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, +				   int error) +{ + 	struct stripe_head *sh = bi->bi_private; +	raid6_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; +	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + +	if (bi->bi_size) +		return 1; + +	for (i=0 ; i<disks; i++) +		if (bi == &sh->dev[i].req) +			break; + +	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", +		(unsigned long long)sh->sector, i, atomic_read(&sh->count), +		uptodate); +	if (i == disks) { +		BUG(); +		return 0; +	} + +	if (uptodate) { +#if 0 +		struct bio *bio; +		unsigned long flags; +		spin_lock_irqsave(&conf->device_lock, flags); +		/* we can return a buffer if we bypassed the cache or +		 * if the top buffer is not in highmem.  If there are +		 * multiple buffers, leave the extra work to +		 * handle_stripe +		 */ +		buffer = sh->bh_read[i]; +		if (buffer && +		    (!PageHighMem(buffer->b_page) +		     || buffer->b_page == bh->b_page ) +			) { +			sh->bh_read[i] = buffer->b_reqnext; +			buffer->b_reqnext = NULL; +		} else +			buffer = NULL; +		spin_unlock_irqrestore(&conf->device_lock, flags); +		if (sh->bh_page[i]==bh->b_page) +			set_buffer_uptodate(bh); +		if (buffer) { +			if (buffer->b_page != bh->b_page) +				memcpy(buffer->b_data, bh->b_data, bh->b_size); +			buffer->b_end_io(buffer, 1); +		} +#else +		set_bit(R5_UPTODATE, &sh->dev[i].flags); +#endif +	} else { +		md_error(conf->mddev, conf->disks[i].rdev); +		clear_bit(R5_UPTODATE, &sh->dev[i].flags); +	} +	rdev_dec_pending(conf->disks[i].rdev, conf->mddev); +#if 0 +	/* must restore b_page before unlocking buffer... 
*/ +	if (sh->bh_page[i] != bh->b_page) { +		bh->b_page = sh->bh_page[i]; +		bh->b_data = page_address(bh->b_page); +		clear_buffer_uptodate(bh); +	} +#endif +	clear_bit(R5_LOCKED, &sh->dev[i].flags); +	set_bit(STRIPE_HANDLE, &sh->state); +	release_stripe(sh); +	return 0; +} + +static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done, +				    int error) +{ + 	struct stripe_head *sh = bi->bi_private; +	raid6_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks, i; +	unsigned long flags; +	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + +	if (bi->bi_size) +		return 1; + +	for (i=0 ; i<disks; i++) +		if (bi == &sh->dev[i].req) +			break; + +	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", +		(unsigned long long)sh->sector, i, atomic_read(&sh->count), +		uptodate); +	if (i == disks) { +		BUG(); +		return 0; +	} + +	spin_lock_irqsave(&conf->device_lock, flags); +	if (!uptodate) +		md_error(conf->mddev, conf->disks[i].rdev); + +	rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + +	clear_bit(R5_LOCKED, &sh->dev[i].flags); +	set_bit(STRIPE_HANDLE, &sh->state); +	__release_stripe(conf, sh); +	spin_unlock_irqrestore(&conf->device_lock, flags); +	return 0; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i); + +static void raid6_build_block (struct stripe_head *sh, int i) +{ +	struct r5dev *dev = &sh->dev[i]; +	int pd_idx = sh->pd_idx; +	int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks); + +	bio_init(&dev->req); +	dev->req.bi_io_vec = &dev->vec; +	dev->req.bi_vcnt++; +	dev->req.bi_max_vecs++; +	dev->vec.bv_page = dev->page; +	dev->vec.bv_len = STRIPE_SIZE; +	dev->vec.bv_offset = 0; + +	dev->req.bi_sector = sh->sector; +	dev->req.bi_private = sh; + +	dev->flags = 0; +	if (i != pd_idx && i != qd_idx) +		dev->sector = compute_blocknr(sh, i); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	char b[BDEVNAME_SIZE]; +	raid6_conf_t *conf = (raid6_conf_t *) mddev->private; +	PRINTK("raid6: error called\n"); + +	if (!rdev->faulty) { +		mddev->sb_dirty = 1; +		if (rdev->in_sync) { +			conf->working_disks--; +			mddev->degraded++; +			conf->failed_disks++; +			rdev->in_sync = 0; +			/* +			 * if recovery was running, make sure it aborts. +			 */ +			set_bit(MD_RECOVERY_ERR, &mddev->recovery); +		} +		rdev->faulty = 1; +		printk (KERN_ALERT +			"raid6: Disk failure on %s, disabling device." +			" Operation continuing on %d devices\n", +			bdevname(rdev->bdev,b), conf->working_disks); +	} +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. 
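+ * As a worked example (values chosen only for illustration): with 4 devices
+ * (2 data), the left-asymmetric layout and chunk_number 5, stripe = 2, so
+ * pd_idx = 1 (with Q on the next device), and the original dd_idx of 1 is
+ * shifted past P and Q to device 3.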
+ */ +static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks, +			unsigned int data_disks, unsigned int * dd_idx, +			unsigned int * pd_idx, raid6_conf_t *conf) +{ +	long stripe; +	unsigned long chunk_number; +	unsigned int chunk_offset; +	sector_t new_sector; +	int sectors_per_chunk = conf->chunk_size >> 9; + +	/* First compute the information on this sector */ + +	/* +	 * Compute the chunk number and the sector offset inside the chunk +	 */ +	chunk_offset = sector_div(r_sector, sectors_per_chunk); +	chunk_number = r_sector; +	if ( r_sector != chunk_number ) { +		printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n", +		       (unsigned long long)r_sector, (unsigned long)chunk_number); +		BUG(); +	} + +	/* +	 * Compute the stripe number +	 */ +	stripe = chunk_number / data_disks; + +	/* +	 * Compute the data disk and parity disk indexes inside the stripe +	 */ +	*dd_idx = chunk_number % data_disks; + +	/* +	 * Select the parity disk based on the user selected algorithm. +	 */ + +	/**** FIX THIS ****/ +	switch (conf->algorithm) { +	case ALGORITHM_LEFT_ASYMMETRIC: +		*pd_idx = raid_disks - 1 - (stripe % raid_disks); +		if (*pd_idx == raid_disks-1) +		  	(*dd_idx)++; 	/* Q D D D P */ +		else if (*dd_idx >= *pd_idx) +		  	(*dd_idx) += 2; /* D D P Q D */ +		break; +	case ALGORITHM_RIGHT_ASYMMETRIC: +		*pd_idx = stripe % raid_disks; +		if (*pd_idx == raid_disks-1) +		  	(*dd_idx)++; 	/* Q D D D P */ +		else if (*dd_idx >= *pd_idx) +		  	(*dd_idx) += 2; /* D D P Q D */ +		break; +	case ALGORITHM_LEFT_SYMMETRIC: +		*pd_idx = raid_disks - 1 - (stripe % raid_disks); +		*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; +		break; +	case ALGORITHM_RIGHT_SYMMETRIC: +		*pd_idx = stripe % raid_disks; +		*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; +		break; +	default: +		printk (KERN_CRIT "raid6: unsupported algorithm %d\n", +			conf->algorithm); +	} + +	PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n", +	       chunk_number, *pd_idx, *dd_idx); + +	/* +	 * Finally, compute the new sector number +	 */ +	new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset; +	return new_sector; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int raid_disks = conf->raid_disks, data_disks = raid_disks - 2; +	sector_t new_sector = sh->sector, check; +	int sectors_per_chunk = conf->chunk_size >> 9; +	sector_t stripe; +	int chunk_offset; +	int chunk_number, dummy1, dummy2, dd_idx = i; +	sector_t r_sector; +	int i0 = i; + +	chunk_offset = sector_div(new_sector, sectors_per_chunk); +	stripe = new_sector; +	if ( new_sector != stripe ) { +		printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n", +		       (unsigned long long)new_sector, (unsigned long)stripe); +		BUG(); +	} + +	switch (conf->algorithm) { +		case ALGORITHM_LEFT_ASYMMETRIC: +		case ALGORITHM_RIGHT_ASYMMETRIC: +		  	if (sh->pd_idx == raid_disks-1) +				i--; 	/* Q D D D P */ +			else if (i > sh->pd_idx) +				i -= 2; /* D D P Q D */ +			break; +		case ALGORITHM_LEFT_SYMMETRIC: +		case ALGORITHM_RIGHT_SYMMETRIC: +			if (sh->pd_idx == raid_disks-1) +				i--; /* Q D D D P */ +			else { +				/* D D P Q D */ +				if (i < sh->pd_idx) +					i += raid_disks; +				i -= (sh->pd_idx + 2); +			} +			break; +		default: +			printk (KERN_CRIT "raid6: unsupported algorithm %d\n", +				conf->algorithm); +	} + +	PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i); + +	chunk_number = stripe * data_disks + i; +	
r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; + +	check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); +	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { +		printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n"); +		return 0; +	} +	return r_sector; +} + + + +/* + * Copy data between a page in the stripe cache, and one or more bion + * The page could align with the middle of the bio, or there could be + * several bion, each with several bio_vecs, which cover part of the page + * Multiple bion are linked together on bi_next.  There may be extras + * at the end of this list.  We ignore them. + */ +static void copy_data(int frombio, struct bio *bio, +		     struct page *page, +		     sector_t sector) +{ +	char *pa = page_address(page); +	struct bio_vec *bvl; +	int i; +	int page_offset; + +	if (bio->bi_sector >= sector) +		page_offset = (signed)(bio->bi_sector - sector) * 512; +	else +		page_offset = (signed)(sector - bio->bi_sector) * -512; +	bio_for_each_segment(bvl, bio, i) { +		int len = bio_iovec_idx(bio,i)->bv_len; +		int clen; +		int b_offset = 0; + +		if (page_offset < 0) { +			b_offset = -page_offset; +			page_offset += b_offset; +			len -= b_offset; +		} + +		if (len > 0 && page_offset + len > STRIPE_SIZE) +			clen = STRIPE_SIZE - page_offset; +		else clen = len; + +		if (clen > 0) { +			char *ba = __bio_kmap_atomic(bio, i, KM_USER0); +			if (frombio) +				memcpy(pa+page_offset, ba+b_offset, clen); +			else +				memcpy(ba+b_offset, pa+page_offset, clen); +			__bio_kunmap_atomic(ba, KM_USER0); +		} +		if (clen < len) /* hit end of page */ +			break; +		page_offset +=  len; +	} +} + +#define check_xor() 	do { 						\ +			   if (count == MAX_XOR_BLOCKS) {		\ +				xor_block(count, STRIPE_SIZE, ptr);	\ +				count = 1;				\ +			   }						\ +			} while(0) + +/* Compute P and Q syndromes */ +static void compute_parity(struct stripe_head *sh, int method) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; +	struct bio *chosen; +	/**** FIX THIS: This could be very bad if disks is close to 256 ****/ +	void *ptrs[disks]; + +	qd_idx = raid6_next_disk(pd_idx, disks); +	d0_idx = raid6_next_disk(qd_idx, disks); + +	PRINTK("compute_parity, stripe %llu, method %d\n", +		(unsigned long long)sh->sector, method); + +	switch(method) { +	case READ_MODIFY_WRITE: +		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */ +	case RECONSTRUCT_WRITE: +		for (i= disks; i-- ;) +			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { +				chosen = sh->dev[i].towrite; +				sh->dev[i].towrite = NULL; + +				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +					wake_up(&conf->wait_for_overlap); + +				if (sh->dev[i].written) BUG(); +				sh->dev[i].written = chosen; +			} +		break; +	case CHECK_PARITY: +		BUG();		/* Not implemented yet */ +	} + +	for (i = disks; i--;) +		if (sh->dev[i].written) { +			sector_t sector = sh->dev[i].sector; +			struct bio *wbi = sh->dev[i].written; +			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +				copy_data(1, wbi, sh->dev[i].page, sector); +				wbi = r5_next_bio(wbi, sector); +			} + +			set_bit(R5_LOCKED, &sh->dev[i].flags); +			set_bit(R5_UPTODATE, &sh->dev[i].flags); +		} + +//	switch(method) { +//	case RECONSTRUCT_WRITE: +//	case CHECK_PARITY: +//	case UPDATE_PARITY: +		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */ +		/* FIX: Is this ordering of drives even remotely optimal? 
*/ +		count = 0; +		i = d0_idx; +		do { +			ptrs[count++] = page_address(sh->dev[i].page); +			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) +				printk("block %d/%d not uptodate on parity calc\n", i,count); +			i = raid6_next_disk(i, disks); +		} while ( i != d0_idx ); +//		break; +//	} + +	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + +	switch(method) { +	case RECONSTRUCT_WRITE: +		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); +		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); +		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags); +		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags); +		break; +	case UPDATE_PARITY: +		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); +		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); +		break; +	} +} + +/* Compute one missing block */ +static void compute_block_1(struct stripe_head *sh, int dd_idx) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int i, count, disks = conf->raid_disks; +	void *ptr[MAX_XOR_BLOCKS], *p; +	int pd_idx = sh->pd_idx; +	int qd_idx = raid6_next_disk(pd_idx, disks); + +	PRINTK("compute_block_1, stripe %llu, idx %d\n", +		(unsigned long long)sh->sector, dd_idx); + +	if ( dd_idx == qd_idx ) { +		/* We're actually computing the Q drive */ +		compute_parity(sh, UPDATE_PARITY); +	} else { +		ptr[0] = page_address(sh->dev[dd_idx].page); +		memset(ptr[0], 0, STRIPE_SIZE); +		count = 1; +		for (i = disks ; i--; ) { +			if (i == dd_idx || i == qd_idx) +				continue; +			p = page_address(sh->dev[i].page); +			if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) +				ptr[count++] = p; +			else +				printk("compute_block() %d, stripe %llu, %d" +				       " not present\n", dd_idx, +				       (unsigned long long)sh->sector, i); + +			check_xor(); +		} +		if (count != 1) +			xor_block(count, STRIPE_SIZE, ptr); +		set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); +	} +} + +/* Compute two missing blocks */ +static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int i, count, disks = conf->raid_disks; +	int pd_idx = sh->pd_idx; +	int qd_idx = raid6_next_disk(pd_idx, disks); +	int d0_idx = raid6_next_disk(qd_idx, disks); +	int faila, failb; + +	/* faila and failb are disk numbers relative to d0_idx */ +	/* pd_idx become disks-2 and qd_idx become disks-1 */ +	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; +	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + +	BUG_ON(faila == failb); +	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } + +	PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", +	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + +	if ( failb == disks-1 ) { +		/* Q disk is one of the missing disks */ +		if ( faila == disks-2 ) { +			/* Missing P+Q, just recompute */ +			compute_parity(sh, UPDATE_PARITY); +			return; +		} else { +			/* We're missing D+Q; recompute D from P */ +			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); +			compute_parity(sh, UPDATE_PARITY); /* Is this necessary? 
*/ +			return; +		} +	} + +	/* We're missing D+P or D+D; build pointer table */ +	{ +		/**** FIX THIS: This could be very bad if disks is close to 256 ****/ +		void *ptrs[disks]; + +		count = 0; +		i = d0_idx; +		do { +			ptrs[count++] = page_address(sh->dev[i].page); +			i = raid6_next_disk(i, disks); +			if (i != dd_idx1 && i != dd_idx2 && +			    !test_bit(R5_UPTODATE, &sh->dev[i].flags)) +				printk("compute_2 with missing block %d/%d\n", count, i); +		} while ( i != d0_idx ); + +		if ( failb == disks-2 ) { +			/* We're missing D+P. */ +			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); +		} else { +			/* We're missing D+D. */ +			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); +		} + +		/* Both the above update both missing blocks */ +		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); +		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); +	} +} + + +/* + * Each stripe/dev can have one or more bion attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. + */ +static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +{ +	struct bio **bip; +	raid6_conf_t *conf = sh->raid_conf; + +	PRINTK("adding bh b#%llu to stripe s#%llu\n", +		(unsigned long long)bi->bi_sector, +		(unsigned long long)sh->sector); + + +	spin_lock(&sh->lock); +	spin_lock_irq(&conf->device_lock); +	if (forwrite) +		bip = &sh->dev[dd_idx].towrite; +	else +		bip = &sh->dev[dd_idx].toread; +	while (*bip && (*bip)->bi_sector < bi->bi_sector) { +		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) +			goto overlap; +		bip = &(*bip)->bi_next; +	} +	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) +		goto overlap; + +	if (*bip && bi->bi_next && (*bip) != bi->bi_next) +		BUG(); +	if (*bip) +		bi->bi_next = *bip; +	*bip = bi; +	bi->bi_phys_segments ++; +	spin_unlock_irq(&conf->device_lock); +	spin_unlock(&sh->lock); + +	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", +		(unsigned long long)bi->bi_sector, +		(unsigned long long)sh->sector, dd_idx); + +	if (forwrite) { +		/* check if page is covered */ +		sector_t sector = sh->dev[dd_idx].sector; +		for (bi=sh->dev[dd_idx].towrite; +		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && +			     bi && bi->bi_sector <= sector; +		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { +			if (bi->bi_sector + (bi->bi_size>>9) >= sector) +				sector = bi->bi_sector + (bi->bi_size>>9); +		} +		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) +			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); +	} +	return 1; + + overlap: +	set_bit(R5_Overlap, &sh->dev[dd_idx].flags); +	spin_unlock_irq(&conf->device_lock); +	spin_unlock(&sh->lock); +	return 0; +} + + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + *    return some read request which now have data + *    return some write requests which are safely on disc + *    schedule a read on some buffers + *    schedule a write of some buffers + *    return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. 
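+ *
+ * RAID-6 note: up to two failed devices are tracked (failed_num[0..1]),
+ * and written blocks are only returned once both the P block (pd_idx) and
+ * the Q block (qd_idx, the disk following P) are safe or on a failed drive.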
+ * + */ + +static void handle_stripe(struct stripe_head *sh) +{ +	raid6_conf_t *conf = sh->raid_conf; +	int disks = conf->raid_disks; +	struct bio *return_bi= NULL; +	struct bio *bi; +	int i; +	int syncing; +	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; +	int non_overwrite = 0; +	int failed_num[2] = {0, 0}; +	struct r5dev *dev, *pdev, *qdev; +	int pd_idx = sh->pd_idx; +	int qd_idx = raid6_next_disk(pd_idx, disks); +	int p_failed, q_failed; + +	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", +	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), +	       pd_idx, qd_idx); + +	spin_lock(&sh->lock); +	clear_bit(STRIPE_HANDLE, &sh->state); +	clear_bit(STRIPE_DELAYED, &sh->state); + +	syncing = test_bit(STRIPE_SYNCING, &sh->state); +	/* Now to look around and see what can be done */ + +	for (i=disks; i--; ) { +		mdk_rdev_t *rdev; +		dev = &sh->dev[i]; +		clear_bit(R5_Insync, &dev->flags); +		clear_bit(R5_Syncio, &dev->flags); + +		PRINTK("check %d: state 0x%lx read %p write %p written %p\n", +			i, dev->flags, dev->toread, dev->towrite, dev->written); +		/* maybe we can reply to a read */ +		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { +			struct bio *rbi, *rbi2; +			PRINTK("Return read for disc %d\n", i); +			spin_lock_irq(&conf->device_lock); +			rbi = dev->toread; +			dev->toread = NULL; +			if (test_and_clear_bit(R5_Overlap, &dev->flags)) +				wake_up(&conf->wait_for_overlap); +			spin_unlock_irq(&conf->device_lock); +			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { +				copy_data(0, rbi, dev->page, dev->sector); +				rbi2 = r5_next_bio(rbi, dev->sector); +				spin_lock_irq(&conf->device_lock); +				if (--rbi->bi_phys_segments == 0) { +					rbi->bi_next = return_bi; +					return_bi = rbi; +				} +				spin_unlock_irq(&conf->device_lock); +				rbi = rbi2; +			} +		} + +		/* now count some things */ +		if (test_bit(R5_LOCKED, &dev->flags)) locked++; +		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + + +		if (dev->toread) to_read++; +		if (dev->towrite) { +			to_write++; +			if (!test_bit(R5_OVERWRITE, &dev->flags)) +				non_overwrite++; +		} +		if (dev->written) written++; +		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ +		if (!rdev || !rdev->in_sync) { +			if ( failed < 2 ) +				failed_num[failed] = i; +			failed++; +		} else +			set_bit(R5_Insync, &dev->flags); +	} +	PRINTK("locked=%d uptodate=%d to_read=%d" +	       " to_write=%d failed=%d failed_num=%d,%d\n", +	       locked, uptodate, to_read, to_write, failed, +	       failed_num[0], failed_num[1]); +	/* check if the array has lost >2 devices and, if so, some requests might +	 * need to be failed +	 */ +	if (failed > 2 && to_read+to_write+written) { +		spin_lock_irq(&conf->device_lock); +		for (i=disks; i--; ) { +			/* fail all writes first */ +			bi = sh->dev[i].towrite; +			sh->dev[i].towrite = NULL; +			if (bi) to_write--; + +			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +				wake_up(&conf->wait_for_overlap); + +			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ +				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); +				clear_bit(BIO_UPTODATE, &bi->bi_flags); +				if (--bi->bi_phys_segments == 0) { +					md_write_end(conf->mddev); +					bi->bi_next = return_bi; +					return_bi = bi; +				} +				bi = nextbi; +			} +			/* and fail all 'written' */ +			bi = sh->dev[i].written; +			sh->dev[i].written = NULL; +			while (bi && bi->bi_sector < sh->dev[i].sector + 
STRIPE_SECTORS) { +				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); +				clear_bit(BIO_UPTODATE, &bi->bi_flags); +				if (--bi->bi_phys_segments == 0) { +					md_write_end(conf->mddev); +					bi->bi_next = return_bi; +					return_bi = bi; +				} +				bi = bi2; +			} + +			/* fail any reads if this device is non-operational */ +			if (!test_bit(R5_Insync, &sh->dev[i].flags)) { +				bi = sh->dev[i].toread; +				sh->dev[i].toread = NULL; +				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +					wake_up(&conf->wait_for_overlap); +				if (bi) to_read--; +				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ +					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); +					clear_bit(BIO_UPTODATE, &bi->bi_flags); +					if (--bi->bi_phys_segments == 0) { +						bi->bi_next = return_bi; +						return_bi = bi; +					} +					bi = nextbi; +				} +			} +		} +		spin_unlock_irq(&conf->device_lock); +	} +	if (failed > 2 && syncing) { +		md_done_sync(conf->mddev, STRIPE_SECTORS,0); +		clear_bit(STRIPE_SYNCING, &sh->state); +		syncing = 0; +	} + +	/* +	 * might be able to return some write requests if the parity blocks +	 * are safe, or on a failed drive +	 */ +	pdev = &sh->dev[pd_idx]; +	p_failed = (failed >= 1 && failed_num[0] == pd_idx) +		|| (failed >= 2 && failed_num[1] == pd_idx); +	qdev = &sh->dev[qd_idx]; +	q_failed = (failed >= 1 && failed_num[0] == qd_idx) +		|| (failed >= 2 && failed_num[1] == qd_idx); + +	if ( written && +	     ( p_failed || ((test_bit(R5_Insync, &pdev->flags) +			     && !test_bit(R5_LOCKED, &pdev->flags) +			     && test_bit(R5_UPTODATE, &pdev->flags))) ) && +	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags) +			     && !test_bit(R5_LOCKED, &qdev->flags) +			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { +		/* any written block on an uptodate or failed drive can be +		 * returned.  Note that if we 'wrote' to a failed drive, +		 * it will be UPTODATE, but never LOCKED, so we don't need +		 * to test 'failed' directly. +		 */ +		for (i=disks; i--; ) +			if (sh->dev[i].written) { +				dev = &sh->dev[i]; +				if (!test_bit(R5_LOCKED, &dev->flags) && +				    test_bit(R5_UPTODATE, &dev->flags) ) { +					/* We can return any write requests */ +					struct bio *wbi, *wbi2; +					PRINTK("Return write for stripe %llu disc %d\n", +					       (unsigned long long)sh->sector, i); +					spin_lock_irq(&conf->device_lock); +					wbi = dev->written; +					dev->written = NULL; +					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { +						wbi2 = r5_next_bio(wbi, dev->sector); +						if (--wbi->bi_phys_segments == 0) { +							md_write_end(conf->mddev); +							wbi->bi_next = return_bi; +							return_bi = wbi; +						} +						wbi = wbi2; +					} +					spin_unlock_irq(&conf->device_lock); +				} +			} +	} + +	/* Now we might consider reading some blocks, either to check/generate +	 * parity, or to satisfy requests +	 * or to load a block that is being partially written. 
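+	 * Roughly: if exactly one block is missing it is computed directly
+	 * (compute_block_1); if two are missing and two devices have failed,
+	 * the far more expensive compute_block_2 path is taken; otherwise a
+	 * read is scheduled on an in-sync device.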
+	 */ +	if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { +		for (i=disks; i--;) { +			dev = &sh->dev[i]; +			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && +			    (dev->toread || +			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || +			     syncing || +			     (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || +			     (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) +				    ) +				) { +				/* we would like to get this block, possibly +				 * by computing it, but we might not be able to +				 */ +				if (uptodate == disks-1) { +					PRINTK("Computing stripe %llu block %d\n", +					       (unsigned long long)sh->sector, i); +					compute_block_1(sh, i); +					uptodate++; +				} else if ( uptodate == disks-2 && failed >= 2 ) { +					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ +					int other; +					for (other=disks; other--;) { +						if ( other == i ) +							continue; +						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) +							break; +					} +					BUG_ON(other < 0); +					PRINTK("Computing stripe %llu blocks %d,%d\n", +					       (unsigned long long)sh->sector, i, other); +					compute_block_2(sh, i, other); +					uptodate += 2; +				} else if (test_bit(R5_Insync, &dev->flags)) { +					set_bit(R5_LOCKED, &dev->flags); +					set_bit(R5_Wantread, &dev->flags); +#if 0 +					/* if I am just reading this block and we don't have +					   a failed drive, or any pending writes then sidestep the cache */ +					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && +					    ! syncing && !failed && !to_write) { +						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page; +						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data; +					} +#endif +					locked++; +					PRINTK("Reading block %d (sync=%d)\n", +						i, syncing); +					if (syncing) +						md_sync_acct(conf->disks[i].rdev->bdev, +							     STRIPE_SECTORS); +				} +			} +		} +		set_bit(STRIPE_HANDLE, &sh->state); +	} + +	/* now to consider writing and what else, if anything should be read */ +	if (to_write) { +		int rcw=0, must_compute=0; +		for (i=disks ; i--;) { +			dev = &sh->dev[i]; +			/* Would I have to read this buffer for reconstruct_write */ +			if (!test_bit(R5_OVERWRITE, &dev->flags) +			    && i != pd_idx && i != qd_idx +			    && (!test_bit(R5_LOCKED, &dev->flags) +#if 0 +				|| sh->bh_page[i] != bh->b_page +#endif +				    ) && +			    !test_bit(R5_UPTODATE, &dev->flags)) { +				if (test_bit(R5_Insync, &dev->flags)) rcw++; +				else { +					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); +					must_compute++; +				} +			} +		} +		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", +		       (unsigned long long)sh->sector, rcw, must_compute); +		set_bit(STRIPE_HANDLE, &sh->state); + +		if (rcw > 0) +			/* want reconstruct write, but need to get some data */ +			for (i=disks; i--;) { +				dev = &sh->dev[i]; +				if (!test_bit(R5_OVERWRITE, &dev->flags) +				    && !(failed == 0 && (i == pd_idx || i == qd_idx)) +				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && +				    test_bit(R5_Insync, &dev->flags)) { +					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +					{ +						PRINTK("Read_old stripe %llu block %d for Reconstruct\n", +						       (unsigned long long)sh->sector, i); +						set_bit(R5_LOCKED, &dev->flags); +						set_bit(R5_Wantread, &dev->flags); +						locked++; +					} else { +						PRINTK("Request delayed 
stripe %llu block %d for Reconstruct\n", +						       (unsigned long long)sh->sector, i); +						set_bit(STRIPE_DELAYED, &sh->state); +						set_bit(STRIPE_HANDLE, &sh->state); +					} +				} +			} +		/* now if nothing is locked, and if we have enough data, we can start a write request */ +		if (locked == 0 && rcw == 0) { +			if ( must_compute > 0 ) { +				/* We have failed blocks and need to compute them */ +				switch ( failed ) { +				case 0:	BUG(); +				case 1: compute_block_1(sh, failed_num[0]); break; +				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; +				default: BUG();	/* This request should have been failed? */ +				} +			} + +			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); +			compute_parity(sh, RECONSTRUCT_WRITE); +			/* now every locked buffer is ready to be written */ +			for (i=disks; i--;) +				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { +					PRINTK("Writing stripe %llu block %d\n", +					       (unsigned long long)sh->sector, i); +					locked++; +					set_bit(R5_Wantwrite, &sh->dev[i].flags); +#if 0 /**** FIX: I don't understand the logic here... ****/ +					if (!test_bit(R5_Insync, &sh->dev[i].flags) +					    || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ +						set_bit(STRIPE_INSYNC, &sh->state); +#endif +				} +			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { +				atomic_dec(&conf->preread_active_stripes); +				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) +					md_wakeup_thread(conf->mddev->thread); +			} +		} +	} + +	/* maybe we need to check and possibly fix the parity for this stripe +	 * Any reads will already have been scheduled, so we just see if enough data +	 * is available +	 */ +	if (syncing && locked == 0 && +	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { +		set_bit(STRIPE_HANDLE, &sh->state); +#if 0 /* RAID-6: Don't support CHECK PARITY yet */ +		if (failed == 0) { +			char *pagea; +			if (uptodate != disks) +				BUG(); +			compute_parity(sh, CHECK_PARITY); +			uptodate--; +			pagea = page_address(sh->dev[pd_idx].page); +			if ((*(u32*)pagea) == 0 && +			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { +				/* parity is correct (on disc, not in buffer any more) */ +				set_bit(STRIPE_INSYNC, &sh->state); +			} +		} +#endif +		if (!test_bit(STRIPE_INSYNC, &sh->state)) { +			int failed_needupdate[2]; +			struct r5dev *adev, *bdev; + +			if ( failed < 1 ) +				failed_num[0] = pd_idx; +			if ( failed < 2 ) +				failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; + +			failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); +			failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); + +			PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", +			       failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); + +#if 0  /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ +			/* should be able to compute the missing block(s) and write to spare */ +			if ( failed_needupdate[0] ^ failed_needupdate[1] ) { +				if (uptodate+1 != disks) +					BUG(); +				compute_block_1(sh, failed_needupdate[0] ? 
failed_num[0] : failed_num[1]); +				uptodate++; +			} else if ( failed_needupdate[0] & failed_needupdate[1] ) { +				if (uptodate+2 != disks) +					BUG(); +				compute_block_2(sh, failed_num[0], failed_num[1]); +				uptodate += 2; +			} +#else +			compute_block_2(sh, failed_num[0], failed_num[1]); +			uptodate += failed_needupdate[0] + failed_needupdate[1]; +#endif + +			if (uptodate != disks) +				BUG(); + +			PRINTK("Marking for sync stripe %llu blocks %d,%d\n", +			       (unsigned long long)sh->sector, failed_num[0], failed_num[1]); + +			/**** FIX: Should we really do both of these unconditionally? ****/ +			adev = &sh->dev[failed_num[0]]; +			locked += !test_bit(R5_LOCKED, &adev->flags); +			set_bit(R5_LOCKED, &adev->flags); +			set_bit(R5_Wantwrite, &adev->flags); +			bdev = &sh->dev[failed_num[1]]; +			locked += !test_bit(R5_LOCKED, &bdev->flags); +			set_bit(R5_LOCKED, &bdev->flags); +			set_bit(R5_Wantwrite, &bdev->flags); + +			set_bit(STRIPE_INSYNC, &sh->state); +			set_bit(R5_Syncio, &adev->flags); +			set_bit(R5_Syncio, &bdev->flags); +		} +	} +	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { +		md_done_sync(conf->mddev, STRIPE_SECTORS,1); +		clear_bit(STRIPE_SYNCING, &sh->state); +	} + +	spin_unlock(&sh->lock); + +	while ((bi=return_bi)) { +		int bytes = bi->bi_size; + +		return_bi = bi->bi_next; +		bi->bi_next = NULL; +		bi->bi_size = 0; +		bi->bi_end_io(bi, bytes, 0); +	} +	for (i=disks; i-- ;) { +		int rw; +		struct bio *bi; +		mdk_rdev_t *rdev; +		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) +			rw = 1; +		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) +			rw = 0; +		else +			continue; + +		bi = &sh->dev[i].req; + +		bi->bi_rw = rw; +		if (rw) +			bi->bi_end_io = raid6_end_write_request; +		else +			bi->bi_end_io = raid6_end_read_request; + +		rcu_read_lock(); +		rdev = conf->disks[i].rdev; +		if (rdev && rdev->faulty) +			rdev = NULL; +		if (rdev) +			atomic_inc(&rdev->nr_pending); +		rcu_read_unlock(); + +		if (rdev) { +			if (test_bit(R5_Syncio, &sh->dev[i].flags)) +				md_sync_acct(rdev->bdev, STRIPE_SECTORS); + +			bi->bi_bdev = rdev->bdev; +			PRINTK("for %llu schedule op %ld on disc %d\n", +				(unsigned long long)sh->sector, bi->bi_rw, i); +			atomic_inc(&sh->count); +			bi->bi_sector = sh->sector + rdev->data_offset; +			bi->bi_flags = 1 << BIO_UPTODATE; +			bi->bi_vcnt = 1; +			bi->bi_max_vecs = 1; +			bi->bi_idx = 0; +			bi->bi_io_vec = &sh->dev[i].vec; +			bi->bi_io_vec[0].bv_len = STRIPE_SIZE; +			bi->bi_io_vec[0].bv_offset = 0; +			bi->bi_size = STRIPE_SIZE; +			bi->bi_next = NULL; +			generic_make_request(bi); +		} else { +			PRINTK("skip op %ld on disc %d for sector %llu\n", +				bi->bi_rw, i, (unsigned long long)sh->sector); +			clear_bit(R5_LOCKED, &sh->dev[i].flags); +			set_bit(STRIPE_HANDLE, &sh->state); +		} +	} +} + +static inline void raid6_activate_delayed(raid6_conf_t *conf) +{ +	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { +		while (!list_empty(&conf->delayed_list)) { +			struct list_head *l = conf->delayed_list.next; +			struct stripe_head *sh; +			sh = list_entry(l, struct stripe_head, lru); +			list_del_init(l); +			clear_bit(STRIPE_DELAYED, &sh->state); +			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +				atomic_inc(&conf->preread_active_stripes); +			list_add_tail(&sh->lru, &conf->handle_list); +		} +	} +} + +static void unplug_slaves(mddev_t *mddev) +{ +	raid6_conf_t *conf = mddev_to_conf(mddev); +	int i; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks; 
i++) { +		mdk_rdev_t *rdev = conf->disks[i].rdev; +		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { +			request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + +			atomic_inc(&rdev->nr_pending); +			rcu_read_unlock(); + +			if (r_queue->unplug_fn) +				r_queue->unplug_fn(r_queue); + +			rdev_dec_pending(rdev, mddev); +			rcu_read_lock(); +		} +	} +	rcu_read_unlock(); +} + +static void raid6_unplug_device(request_queue_t *q) +{ +	mddev_t *mddev = q->queuedata; +	raid6_conf_t *conf = mddev_to_conf(mddev); +	unsigned long flags; + +	spin_lock_irqsave(&conf->device_lock, flags); + +	if (blk_remove_plug(q)) +		raid6_activate_delayed(conf); +	md_wakeup_thread(mddev->thread); + +	spin_unlock_irqrestore(&conf->device_lock, flags); + +	unplug_slaves(mddev); +} + +static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, +			     sector_t *error_sector) +{ +	mddev_t *mddev = q->queuedata; +	raid6_conf_t *conf = mddev_to_conf(mddev); +	int i, ret = 0; + +	rcu_read_lock(); +	for (i=0; i<mddev->raid_disks && ret == 0; i++) { +		mdk_rdev_t *rdev = conf->disks[i].rdev; +		if (rdev && !rdev->faulty) { +			struct block_device *bdev = rdev->bdev; +			request_queue_t *r_queue = bdev_get_queue(bdev); + +			if (!r_queue->issue_flush_fn) +				ret = -EOPNOTSUPP; +			else { +				atomic_inc(&rdev->nr_pending); +				rcu_read_unlock(); +				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, +							      error_sector); +				rdev_dec_pending(rdev, mddev); +				rcu_read_lock(); +			} +		} +	} +	rcu_read_unlock(); +	return ret; +} + +static inline void raid6_plug_device(raid6_conf_t *conf) +{ +	spin_lock_irq(&conf->device_lock); +	blk_plug_device(conf->mddev->queue); +	spin_unlock_irq(&conf->device_lock); +} + +static int make_request (request_queue_t *q, struct bio * bi) +{ +	mddev_t *mddev = q->queuedata; +	raid6_conf_t *conf = mddev_to_conf(mddev); +	const unsigned int raid_disks = conf->raid_disks; +	const unsigned int data_disks = raid_disks - 2; +	unsigned int dd_idx, pd_idx; +	sector_t new_sector; +	sector_t logical_sector, last_sector; +	struct stripe_head *sh; + +	if (bio_data_dir(bi)==WRITE) { +		disk_stat_inc(mddev->gendisk, writes); +		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); +	} else { +		disk_stat_inc(mddev->gendisk, reads); +		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); +	} + +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); +	last_sector = bi->bi_sector + (bi->bi_size>>9); + +	bi->bi_next = NULL; +	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */ +	if ( bio_data_dir(bi) == WRITE ) +		md_write_start(mddev); +	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { +		DEFINE_WAIT(w); + +		new_sector = raid6_compute_sector(logical_sector, +						  raid_disks, data_disks, &dd_idx, &pd_idx, conf); + +		PRINTK("raid6: make_request, sector %llu logical %llu\n", +		       (unsigned long long)new_sector, +		       (unsigned long long)logical_sector); + +	retry: +		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); +		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); +		if (sh) { +			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +				/* Add failed due to overlap.  
Flush everything +				 * and wait a while +				 */ +				raid6_unplug_device(mddev->queue); +				release_stripe(sh); +				schedule(); +				goto retry; +			} +			finish_wait(&conf->wait_for_overlap, &w); +			raid6_plug_device(conf); +			handle_stripe(sh); +			release_stripe(sh); +		} else { +			/* cannot get stripe for read-ahead, just give-up */ +			clear_bit(BIO_UPTODATE, &bi->bi_flags); +			finish_wait(&conf->wait_for_overlap, &w); +			break; +		} + +	} +	spin_lock_irq(&conf->device_lock); +	if (--bi->bi_phys_segments == 0) { +		int bytes = bi->bi_size; + +		if ( bio_data_dir(bi) == WRITE ) +			md_write_end(mddev); +		bi->bi_size = 0; +		bi->bi_end_io(bi, bytes, 0); +	} +	spin_unlock_irq(&conf->device_lock); +	return 0; +} + +/* FIXME go_faster isn't used */ +static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) +{ +	raid6_conf_t *conf = (raid6_conf_t *) mddev->private; +	struct stripe_head *sh; +	int sectors_per_chunk = conf->chunk_size >> 9; +	sector_t x; +	unsigned long stripe; +	int chunk_offset; +	int dd_idx, pd_idx; +	sector_t first_sector; +	int raid_disks = conf->raid_disks; +	int data_disks = raid_disks - 2; + +	if (sector_nr >= mddev->size <<1) { +		/* just being told to finish up .. nothing much to do */ +		unplug_slaves(mddev); +		return 0; +	} +	/* if there are 2 or more failed drives and we are trying +	 * to resync, then assert that we are finished, because there is +	 * nothing we can do. +	 */ +	if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { +		int rv = (mddev->size << 1) - sector_nr; +		md_done_sync(mddev, rv, 1); +		return rv; +	} + +	x = sector_nr; +	chunk_offset = sector_div(x, sectors_per_chunk); +	stripe = x; +	BUG_ON(x != stripe); + +	first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk +		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); +	sh = get_active_stripe(conf, sector_nr, pd_idx, 1); +	if (sh == NULL) { +		sh = get_active_stripe(conf, sector_nr, pd_idx, 0); +		/* make sure we don't swamp the stripe cache if someone else +		 * is trying to get access +		 */ +		set_current_state(TASK_UNINTERRUPTIBLE); +		schedule_timeout(1); +	} +	spin_lock(&sh->lock); +	set_bit(STRIPE_SYNCING, &sh->state); +	clear_bit(STRIPE_INSYNC, &sh->state); +	spin_unlock(&sh->lock); + +	handle_stripe(sh); +	release_stripe(sh); + +	return STRIPE_SECTORS; +} + +/* + * This is our raid6 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. 
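+ *
+ * Stripes are taken off conf->handle_list one at a time under device_lock;
+ * delayed stripes are promoted back onto that list (raid6_activate_delayed)
+ * when pre-read activity is below IO_THRESHOLD.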
+ */ +static void raid6d (mddev_t *mddev) +{ +	struct stripe_head *sh; +	raid6_conf_t *conf = mddev_to_conf(mddev); +	int handled; + +	PRINTK("+++ raid6d active\n"); + +	md_check_recovery(mddev); +	md_handle_safemode(mddev); + +	handled = 0; +	spin_lock_irq(&conf->device_lock); +	while (1) { +		struct list_head *first; + +		if (list_empty(&conf->handle_list) && +		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && +		    !blk_queue_plugged(mddev->queue) && +		    !list_empty(&conf->delayed_list)) +			raid6_activate_delayed(conf); + +		if (list_empty(&conf->handle_list)) +			break; + +		first = conf->handle_list.next; +		sh = list_entry(first, struct stripe_head, lru); + +		list_del_init(first); +		atomic_inc(&sh->count); +		if (atomic_read(&sh->count)!= 1) +			BUG(); +		spin_unlock_irq(&conf->device_lock); + +		handled++; +		handle_stripe(sh); +		release_stripe(sh); + +		spin_lock_irq(&conf->device_lock); +	} +	PRINTK("%d stripes handled\n", handled); + +	spin_unlock_irq(&conf->device_lock); + +	unplug_slaves(mddev); + +	PRINTK("--- raid6d inactive\n"); +} + +static int run (mddev_t *mddev) +{ +	raid6_conf_t *conf; +	int raid_disk, memory; +	mdk_rdev_t *rdev; +	struct disk_info *disk; +	struct list_head *tmp; + +	if (mddev->level != 6) { +		PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level); +		return -EIO; +	} + +	mddev->private = kmalloc (sizeof (raid6_conf_t) +				  + mddev->raid_disks * sizeof(struct disk_info), +				  GFP_KERNEL); +	if ((conf = mddev->private) == NULL) +		goto abort; +	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); +	conf->mddev = mddev; + +	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) +		goto abort; +	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + +	spin_lock_init(&conf->device_lock); +	init_waitqueue_head(&conf->wait_for_stripe); +	init_waitqueue_head(&conf->wait_for_overlap); +	INIT_LIST_HEAD(&conf->handle_list); +	INIT_LIST_HEAD(&conf->delayed_list); +	INIT_LIST_HEAD(&conf->inactive_list); +	atomic_set(&conf->active_stripes, 0); +	atomic_set(&conf->preread_active_stripes, 0); + +	mddev->queue->unplug_fn = raid6_unplug_device; +	mddev->queue->issue_flush_fn = raid6_issue_flush; + +	PRINTK("raid6: run(%s) called.\n", mdname(mddev)); + +	ITERATE_RDEV(mddev,rdev,tmp) { +		raid_disk = rdev->raid_disk; +		if (raid_disk >= mddev->raid_disks +		    || raid_disk < 0) +			continue; +		disk = conf->disks + raid_disk; + +		disk->rdev = rdev; + +		if (rdev->in_sync) { +			char b[BDEVNAME_SIZE]; +			printk(KERN_INFO "raid6: device %s operational as raid" +			       " disk %d\n", bdevname(rdev->bdev,b), +			       raid_disk); +			conf->working_disks++; +		} +	} + +	conf->raid_disks = mddev->raid_disks; + +	/* +	 * 0 for a fully functional array, 1 or 2 for a degraded array. 
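+	 * (degraded = failed_disks = raid_disks - working_disks; anything
+	 * above 2 is rejected further down.)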
+	 */ +	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; +	conf->mddev = mddev; +	conf->chunk_size = mddev->chunk_size; +	conf->level = mddev->level; +	conf->algorithm = mddev->layout; +	conf->max_nr_stripes = NR_STRIPES; + +	/* device size must be a multiple of chunk size */ +	mddev->size &= ~(mddev->chunk_size/1024 -1); + +	if (conf->raid_disks < 4) { +		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", +		       mdname(mddev), conf->raid_disks); +		goto abort; +	} +	if (!conf->chunk_size || conf->chunk_size % 4) { +		printk(KERN_ERR "raid6: invalid chunk size %d for %s\n", +		       conf->chunk_size, mdname(mddev)); +		goto abort; +	} +	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { +		printk(KERN_ERR +		       "raid6: unsupported parity algorithm %d for %s\n", +		       conf->algorithm, mdname(mddev)); +		goto abort; +	} +	if (mddev->degraded > 2) { +		printk(KERN_ERR "raid6: not enough operational devices for %s" +		       " (%d/%d failed)\n", +		       mdname(mddev), conf->failed_disks, conf->raid_disks); +		goto abort; +	} + +#if 0				/* FIX: For now */ +	if (mddev->degraded > 0 && +	    mddev->recovery_cp != MaxSector) { +		printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); +		goto abort; +	} +#endif + +	{ +		mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); +		if (!mddev->thread) { +			printk(KERN_ERR +			       "raid6: couldn't allocate thread for %s\n", +			       mdname(mddev)); +			goto abort; +		} +	} + +	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + +		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; +	if (grow_stripes(conf, conf->max_nr_stripes)) { +		printk(KERN_ERR +		       "raid6: couldn't allocate %dkB for buffers\n", memory); +		shrink_stripes(conf); +		md_unregister_thread(mddev->thread); +		goto abort; +	} else +		printk(KERN_INFO "raid6: allocated %dkB for %s\n", +		       memory, mdname(mddev)); + +	if (mddev->degraded == 0) +		printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d" +		       " devices, algorithm %d\n", conf->level, mdname(mddev), +		       mddev->raid_disks-mddev->degraded, mddev->raid_disks, +		       conf->algorithm); +	else +		printk(KERN_ALERT "raid6: raid level %d set %s active with %d" +		       " out of %d devices, algorithm %d\n", conf->level, +		       mdname(mddev), mddev->raid_disks - mddev->degraded, +		       mddev->raid_disks, conf->algorithm); + +	print_raid6_conf(conf); + +	/* read-ahead size must cover two whole stripes, which is +	 * 2 * (n-2) * chunksize where 'n' is the number of raid devices +	 */ +	{ +		int stripe = (mddev->raid_disks-2) * mddev->chunk_size +			/ PAGE_CACHE_SIZE; +		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) +			mddev->queue->backing_dev_info.ra_pages = 2 * stripe; +	} + +	/* Ok, everything is just fine now */ +	mddev->array_size =  mddev->size * (mddev->raid_disks - 2); +	return 0; +abort: +	if (conf) { +		print_raid6_conf(conf); +		if (conf->stripe_hashtbl) +			free_pages((unsigned long) conf->stripe_hashtbl, +							HASH_PAGES_ORDER); +		kfree(conf); +	} +	mddev->private = NULL; +	printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev)); +	return -EIO; +} + + + +static int stop (mddev_t *mddev) +{ +	raid6_conf_t *conf = (raid6_conf_t *) mddev->private; + +	md_unregister_thread(mddev->thread); +	mddev->thread = NULL; +	shrink_stripes(conf); +	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); +	
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	kfree(conf); +	mddev->private = NULL; +	return 0; +} + +#if RAID6_DUMPSTATE +static void print_sh (struct seq_file *seq, struct stripe_head *sh) +{ +	int i; + +	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", +		   (unsigned long long)sh->sector, sh->pd_idx, sh->state); +	seq_printf(seq, "sh %llu,  count %d.\n", +		   (unsigned long long)sh->sector, atomic_read(&sh->count)); +	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); +	for (i = 0; i < sh->raid_conf->raid_disks; i++) { +		seq_printf(seq, "(cache%d: %p %ld) ", +			   i, sh->dev[i].page, sh->dev[i].flags); +	} +	seq_printf(seq, "\n"); +} + +static void printall (struct seq_file *seq, raid6_conf_t *conf) +{ +	struct stripe_head *sh; +	int i; + +	spin_lock_irq(&conf->device_lock); +	for (i = 0; i < NR_HASH; i++) { +		sh = conf->stripe_hashtbl[i]; +		for (; sh; sh = sh->hash_next) { +			if (sh->raid_conf != conf) +				continue; +			print_sh(seq, sh); +		} +	} +	spin_unlock_irq(&conf->device_lock); +} +#endif + +static void status (struct seq_file *seq, mddev_t *mddev) +{ +	raid6_conf_t *conf = (raid6_conf_t *) mddev->private; +	int i; + +	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); +	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); +	for (i = 0; i < conf->raid_disks; i++) + 		seq_printf (seq, "%s", +			    conf->disks[i].rdev && +			    conf->disks[i].rdev->in_sync ? "U" : "_"); +	seq_printf (seq, "]"); +#if RAID6_DUMPSTATE +	seq_printf (seq, "\n"); +	printall(seq, conf); +#endif +} + +static void print_raid6_conf (raid6_conf_t *conf) +{ +	int i; +	struct disk_info *tmp; + +	printk("RAID6 conf printout:\n"); +	if (!conf) { +		printk("(conf==NULL)\n"); +		return; +	} +	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, +		 conf->working_disks, conf->failed_disks); + +	for (i = 0; i < conf->raid_disks; i++) { +		char b[BDEVNAME_SIZE]; +		tmp = conf->disks + i; +		if (tmp->rdev) +		printk(" disk %d, o:%d, dev:%s\n", +			i, !tmp->rdev->faulty, +			bdevname(tmp->rdev->bdev,b)); +	} +} + +static int raid6_spare_active(mddev_t *mddev) +{ +	int i; +	raid6_conf_t *conf = mddev->private; +	struct disk_info *tmp; + +	for (i = 0; i < conf->raid_disks; i++) { +		tmp = conf->disks + i; +		if (tmp->rdev +		    && !tmp->rdev->faulty +		    && !tmp->rdev->in_sync) { +			mddev->degraded--; +			conf->failed_disks--; +			conf->working_disks++; +			tmp->rdev->in_sync = 1; +		} +	} +	print_raid6_conf(conf); +	return 0; +} + +static int raid6_remove_disk(mddev_t *mddev, int number) +{ +	raid6_conf_t *conf = mddev->private; +	int err = 0; +	mdk_rdev_t *rdev; +	struct disk_info *p = conf->disks + number; + +	print_raid6_conf(conf); +	rdev = p->rdev; +	if (rdev) { +		if (rdev->in_sync || +		    atomic_read(&rdev->nr_pending)) { +			err = -EBUSY; +			goto abort; +		} +		p->rdev = NULL; +		synchronize_kernel(); +		if (atomic_read(&rdev->nr_pending)) { +			/* lost the race, try later */ +			err = -EBUSY; +			p->rdev = rdev; +		} +	} + +abort: + +	print_raid6_conf(conf); +	return err; +} + +static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ +	raid6_conf_t *conf = mddev->private; +	int found = 0; +	int disk; +	struct disk_info *p; + +	if (mddev->degraded > 2) +		/* no point adding a device */ +		return 0; +	/* +	 * find the disk ... 
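+	 * (i.e. the first slot whose rdev is NULL; the new device is added
+	 * out of sync so recovery will rebuild it)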
+	 */ +	for (disk=0; disk < mddev->raid_disks; disk++) +		if ((p=conf->disks + disk)->rdev == NULL) { +			rdev->in_sync = 0; +			rdev->raid_disk = disk; +			found = 1; +			p->rdev = rdev; +			break; +		} +	print_raid6_conf(conf); +	return found; +} + +static int raid6_resize(mddev_t *mddev, sector_t sectors) +{ +	/* no resync is happening, and there is enough space +	 * on all devices, so we can resize. +	 * We need to make sure resync covers any new space. +	 * If the array is shrinking we should possibly wait until +	 * any io in the removed space completes, but it hardly seems +	 * worth it. +	 */ +	sectors &= ~((sector_t)mddev->chunk_size/512 - 1); +	mddev->array_size = (sectors * (mddev->raid_disks-2))>>1; +	set_capacity(mddev->gendisk, mddev->array_size << 1); +	mddev->changed = 1; +	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) { +		mddev->recovery_cp = mddev->size << 1; +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	} +	mddev->size = sectors /2; +	return 0; +} + +static mdk_personality_t raid6_personality= +{ +	.name		= "raid6", +	.owner		= THIS_MODULE, +	.make_request	= make_request, +	.run		= run, +	.stop		= stop, +	.status		= status, +	.error_handler	= error, +	.hot_add_disk	= raid6_add_disk, +	.hot_remove_disk= raid6_remove_disk, +	.spare_active	= raid6_spare_active, +	.sync_request	= sync_request, +	.resize		= raid6_resize, +}; + +static int __init raid6_init (void) +{ +	int e; + +	e = raid6_select_algo(); +	if ( e ) +		return e; + +	return register_md_personality (RAID6, &raid6_personality); +} + +static void raid6_exit (void) +{ +	unregister_md_personality (RAID6); +} + +module_init(raid6_init); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-8"); /* RAID6 */ diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c new file mode 100644 index 00000000000..359157aaf9e --- /dev/null +++ b/drivers/md/raid6mmx.c @@ -0,0 +1,150 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6mmx.c + * + * MMX implementation of RAID-6 syndrome functions + */ + +#if defined(__i386__) + +#include "raid6.h" +#include "raid6x86.h" + +/* Shared with raid6sse1.c */ +const struct raid6_mmx_constants { +	u64 x1d; +} raid6_mmx_constants = { +	0x1d1d1d1d1d1d1d1dULL, +}; + +static int raid6_have_mmx(void) +{ +#ifdef __KERNEL__ +	/* Not really "boot_cpu" but "all_cpus" */ +	return boot_cpu_has(X86_FEATURE_MMX); +#else +	/* User space test code */ +	u32 features = cpuid_features(); +	return ( (features & (1<<23)) == (1<<23) ); +#endif +} + +/* + * Plain MMX implementation + */ +static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_mmx_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_mmx(&sa); + +	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); +	asm volatile("pxor %mm5,%mm5");	/* Zero temp */ + +	for ( d = 0 ; d < bytes ; d += 8 ) { +		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ +		asm volatile("movq %mm2,%mm4");	/* Q[0] */ +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); +			asm volatile("pcmpgtb %mm4,%mm5"); +			asm volatile("paddb %mm4,%mm4"); +			asm volatile("pand %mm0,%mm5"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm5,%mm5"); +			asm volatile("pxor %mm6,%mm2"); +			asm volatile("pxor %mm6,%mm4"); +		} +		asm volatile("movq %%mm2,%0" : "=m" (p[d])); +		asm volatile("pxor %mm2,%mm2"); +		asm volatile("movq %%mm4,%0" : "=m" (q[d])); +		asm volatile("pxor %mm4,%mm4"); +	} + +	raid6_after_mmx(&sa); +} + +const struct raid6_calls raid6_mmxx1 = { +	raid6_mmx1_gen_syndrome, +	raid6_have_mmx, +	"mmxx1", +	0 +}; + +/* + * Unrolled-by-2 MMX implementation + */ +static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_mmx_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_mmx(&sa); + +	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); +	asm volatile("pxor %mm5,%mm5");	/* Zero temp */ +	asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + +	for ( d = 0 ; d < bytes ; d += 16 ) { +		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ +		asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); +		asm volatile("movq %mm2,%mm4"); /* Q[0] */ +		asm volatile("movq %mm3,%mm6"); /* Q[1] */ +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			asm volatile("pcmpgtb %mm4,%mm5"); +			asm volatile("pcmpgtb %mm6,%mm7"); +			asm volatile("paddb %mm4,%mm4"); +			asm volatile("paddb %mm6,%mm6"); +			asm volatile("pand %mm0,%mm5"); +			asm volatile("pand %mm0,%mm7"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm7,%mm6"); +			asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); +			asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); +			asm volatile("pxor %mm5,%mm2"); +			asm volatile("pxor %mm7,%mm3"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm7,%mm6"); +			asm volatile("pxor %mm5,%mm5"); +			asm volatile("pxor %mm7,%mm7"); +		} +		asm volatile("movq %%mm2,%0" : "=m" (p[d])); +		asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); +		asm volatile("movq %%mm4,%0" : "=m" (q[d])); +		asm volatile("movq %%mm6,%0" : "=m" 
(q[d+8])); +	} + +	raid6_after_mmx(&sa); +} + +const struct raid6_calls raid6_mmxx2 = { +	raid6_mmx2_gen_syndrome, +	raid6_have_mmx, +	"mmxx2", +	0 +}; + +#endif diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c new file mode 100644 index 00000000000..a8c4d9451bd --- /dev/null +++ b/drivers/md/raid6recov.c @@ -0,0 +1,133 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6recov.c + * + * RAID-6 data recovery in dual failure mode.  In single failure mode, + * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct + * the syndrome.) + */ + +#include "raid6.h" + +/* Recover two failed data blocks. */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, +		       void **ptrs) +{ +	u8 *p, *q, *dp, *dq; +	u8 px, qx, db; +	const u8 *pbmul;	/* P multiplier table for B data */ +	const u8 *qmul;		/* Q multiplier table (for both) */ + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data pages +	   Use the dead data pages as temporary storage for +	   delta p and delta q */ +	dp = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-2] = dp; +	dq = (u8 *)ptrs[failb]; +	ptrs[failb] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dp; +	ptrs[failb]   = dq; +	ptrs[disks-2] = p; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; +	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + +	/* Now do it... */ +	while ( bytes-- ) { +		px    = *p ^ *dp; +		qx    = qmul[*q ^ *dq]; +		*dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ +		*dp++ = db ^ px; /* Reconstructed A */ +		p++; q++; +	} +} + + + + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +{ +	u8 *p, *q, *dq; +	const u8 *qmul;		/* Q multiplier table */ + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data page +	   Use the dead data page as temporary storage for delta q */ +	dq = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dq; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + +	/* Now do it... */ +	while ( bytes-- ) { +		*p++ ^= *dq = qmul[*q ^ *dq]; +		q++; dq++; +	} +} + + +#ifndef __KERNEL__		/* Testing only */ + +/* Recover two failed blocks. */ +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) +{ +	if ( faila > failb ) { +		int tmp = faila; +		faila = failb; +		failb = tmp; +	} + +	if ( failb == disks-1 ) { +		if ( faila == disks-2 ) { +			/* P+Q failure.  Just rebuild the syndrome. 
*/ +			raid6_call.gen_syndrome(disks, bytes, ptrs); +		} else { +			/* data+Q failure.  Reconstruct data from P, +			   then rebuild syndrome. */ +			/* NOT IMPLEMENTED - equivalent to RAID-5 */ +		} +	} else { +		if ( failb == disks-2 ) { +			/* data+P failure. */ +			raid6_datap_recov(disks, bytes, faila, ptrs); +		} else { +			/* data+data failure. */ +			raid6_2data_recov(disks, bytes, faila, failb, ptrs); +		} +	} +} + +#endif diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c new file mode 100644 index 00000000000..f7e7859f71a --- /dev/null +++ b/drivers/md/raid6sse1.c @@ -0,0 +1,171 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6sse1.c + * + * SSE-1/MMXEXT implementation of RAID-6 syndrome functions + * + * This is really an MMX implementation, but it requires SSE-1 or + * AMD MMXEXT for prefetch support and a few other features.  The + * support for nontemporal memory accesses is enough to make this + * worthwhile as a separate implementation. + */ + +#if defined(__i386__) + +#include "raid6.h" +#include "raid6x86.h" + +/* Defined in raid6mmx.c */ +extern const struct raid6_mmx_constants { +	u64 x1d; +} raid6_mmx_constants; + +static int raid6_have_sse1_or_mmxext(void) +{ +#ifdef __KERNEL__ +	/* Not really boot_cpu but "all_cpus" */ +	return boot_cpu_has(X86_FEATURE_MMX) && +		(boot_cpu_has(X86_FEATURE_XMM) || +		 boot_cpu_has(X86_FEATURE_MMXEXT)); +#else +	/* User space test code - this incorrectly breaks on some Athlons */ +	u32 features = cpuid_features(); +	return ( (features & (5<<23)) == (5<<23) ); +#endif +} + +/* + * Plain SSE1 implementation + */ +static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_mmx_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	/* This is really MMX code, not SSE */ +	raid6_before_mmx(&sa); + +	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); +	asm volatile("pxor %mm5,%mm5");	/* Zero temp */ + +	for ( d = 0 ; d < bytes ; d += 8 ) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ +		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); +		asm volatile("movq %mm2,%mm4");	/* Q[0] */ +		asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); +		for ( z = z0-2 ; z >= 0 ; z-- ) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("pcmpgtb %mm4,%mm5"); +			asm volatile("paddb %mm4,%mm4"); +			asm volatile("pand %mm0,%mm5"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm5,%mm5"); +			asm volatile("pxor %mm6,%mm2"); +			asm volatile("pxor %mm6,%mm4"); +			asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); +		} +		asm volatile("pcmpgtb %mm4,%mm5"); +		asm volatile("paddb %mm4,%mm4"); +		asm volatile("pand %mm0,%mm5"); +		asm volatile("pxor %mm5,%mm4"); +		asm volatile("pxor %mm5,%mm5"); +		asm 
volatile("pxor %mm6,%mm2"); +		asm volatile("pxor %mm6,%mm4"); + +		asm volatile("movntq %%mm2,%0" : "=m" (p[d])); +		asm volatile("movntq %%mm4,%0" : "=m" (q[d])); +	} + +	raid6_after_mmx(&sa); +	asm volatile("sfence" : : : "memory"); +} + +const struct raid6_calls raid6_sse1x1 = { +	raid6_sse11_gen_syndrome, +	raid6_have_sse1_or_mmxext, +	"sse1x1", +	1			/* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE1 implementation + */ +static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_mmx_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_mmx(&sa); + +	asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); +	asm volatile("pxor %mm5,%mm5");	/* Zero temp */ +	asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + +	/* We uniformly assume a single prefetch covers at least 16 bytes */ +	for ( d = 0 ; d < bytes ; d += 16 ) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ +		asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ +		asm volatile("movq %mm2,%mm4");	/* Q[0] */ +		asm volatile("movq %mm3,%mm6"); /* Q[1] */ +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("pcmpgtb %mm4,%mm5"); +			asm volatile("pcmpgtb %mm6,%mm7"); +			asm volatile("paddb %mm4,%mm4"); +			asm volatile("paddb %mm6,%mm6"); +			asm volatile("pand %mm0,%mm5"); +			asm volatile("pand %mm0,%mm7"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm7,%mm6"); +			asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); +			asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); +			asm volatile("pxor %mm5,%mm2"); +			asm volatile("pxor %mm7,%mm3"); +			asm volatile("pxor %mm5,%mm4"); +			asm volatile("pxor %mm7,%mm6"); +			asm volatile("pxor %mm5,%mm5"); +			asm volatile("pxor %mm7,%mm7"); +		} +		asm volatile("movntq %%mm2,%0" : "=m" (p[d])); +		asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); +		asm volatile("movntq %%mm4,%0" : "=m" (q[d])); +		asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); +	} + +	raid6_after_mmx(&sa); +	asm volatile("sfence" : :: "memory"); +} + +const struct raid6_calls raid6_sse1x2 = { +	raid6_sse12_gen_syndrome, +	raid6_have_sse1_or_mmxext, +	"sse1x2", +	1			/* Has cache hints */ +}; + +#endif diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c new file mode 100644 index 00000000000..b3aa7fe0877 --- /dev/null +++ b/drivers/md/raid6sse2.c @@ -0,0 +1,270 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6sse2.c + * + * SSE-2 implementation of RAID-6 syndrome functions + * + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include "raid6.h" +#include "raid6x86.h" + +static const struct raid6_sse_constants { +	u64 x1d[2]; +} raid6_sse_constants  __attribute__((aligned(16))) = { +	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, +}; + +static int raid6_have_sse2(void) +{ +#ifdef __KERNEL__ +	/* Not really boot_cpu but "all_cpus" */ +	return boot_cpu_has(X86_FEATURE_MMX) && +		boot_cpu_has(X86_FEATURE_FXSR) && +		boot_cpu_has(X86_FEATURE_XMM) && +		boot_cpu_has(X86_FEATURE_XMM2); +#else +	/* User space test code */ +	u32 features = cpuid_features(); +	return ( (features & (15<<23)) == (15<<23) ); +#endif +} + +/* + * Plain SSE2 implementation + */ +static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_sse_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_sse2(&sa); + +	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); +	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */ + +	for ( d = 0 ; d < bytes ; d += 16 ) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ +		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); +		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ +		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); +		for ( z = z0-2 ; z >= 0 ; z-- ) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("pcmpgtb %xmm4,%xmm5"); +			asm volatile("paddb %xmm4,%xmm4"); +			asm volatile("pand %xmm0,%xmm5"); +			asm volatile("pxor %xmm5,%xmm4"); +			asm volatile("pxor %xmm5,%xmm5"); +			asm volatile("pxor %xmm6,%xmm2"); +			asm volatile("pxor %xmm6,%xmm4"); +			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); +		} +		asm volatile("pcmpgtb %xmm4,%xmm5"); +		asm volatile("paddb %xmm4,%xmm4"); +		asm volatile("pand %xmm0,%xmm5"); +		asm volatile("pxor %xmm5,%xmm4"); +		asm volatile("pxor %xmm5,%xmm5"); +		asm volatile("pxor %xmm6,%xmm2"); +		asm volatile("pxor %xmm6,%xmm4"); + +		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); +		asm volatile("pxor %xmm2,%xmm2"); +		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); +		asm volatile("pxor %xmm4,%xmm4"); +	} + +	raid6_after_sse2(&sa); +	asm volatile("sfence" : : : "memory"); +} + +const struct raid6_calls raid6_sse2x1 = { +	raid6_sse21_gen_syndrome, +	raid6_have_sse2, +	"sse2x1", +	1			/* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE2 implementation + */ +static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_sse_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_sse2(&sa); + +	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); +	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ +	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + +	/* We uniformly assume a single prefetch covers at least 32 bytes */ +	for ( d = 0 ; d < bytes ; d += 32 ) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */ +		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* 
P[1] */ +		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ +		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("pcmpgtb %xmm4,%xmm5"); +			asm volatile("pcmpgtb %xmm6,%xmm7"); +			asm volatile("paddb %xmm4,%xmm4"); +			asm volatile("paddb %xmm6,%xmm6"); +			asm volatile("pand %xmm0,%xmm5"); +			asm volatile("pand %xmm0,%xmm7"); +			asm volatile("pxor %xmm5,%xmm4"); +			asm volatile("pxor %xmm7,%xmm6"); +			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); +			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); +			asm volatile("pxor %xmm5,%xmm2"); +			asm volatile("pxor %xmm7,%xmm3"); +			asm volatile("pxor %xmm5,%xmm4"); +			asm volatile("pxor %xmm7,%xmm6"); +			asm volatile("pxor %xmm5,%xmm5"); +			asm volatile("pxor %xmm7,%xmm7"); +		} +		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); +		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); +		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); +		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); +	} + +	raid6_after_sse2(&sa); +	asm volatile("sfence" : : : "memory"); +} + +const struct raid6_calls raid6_sse2x2 = { +	raid6_sse22_gen_syndrome, +	raid6_have_sse2, +	"sse2x2", +	1			/* Has cache hints */ +}; + +#endif + +#ifdef __x86_64__ + +/* + * Unrolled-by-4 SSE2 implementation + */ +static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; +	raid6_sse16_save_t sa; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	raid6_before_sse16(&sa); + +	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); +	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */ +	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */ +	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */ +	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */ +	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */ +	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */ +	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */ +	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */ +	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */ +	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */ +	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */ +	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */ + +	for ( d = 0 ; d < bytes ; d += 64 ) { +		for ( z = z0 ; z >= 0 ; z-- ) { +			/* The second prefetch seems to improve performance... 
*/ +			asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); +			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); +			asm volatile("pcmpgtb %xmm4,%xmm5"); +			asm volatile("pcmpgtb %xmm6,%xmm7"); +			asm volatile("pcmpgtb %xmm12,%xmm13"); +			asm volatile("pcmpgtb %xmm14,%xmm15"); +			asm volatile("paddb %xmm4,%xmm4"); +			asm volatile("paddb %xmm6,%xmm6"); +			asm volatile("paddb %xmm12,%xmm12"); +			asm volatile("paddb %xmm14,%xmm14"); +			asm volatile("pand %xmm0,%xmm5"); +			asm volatile("pand %xmm0,%xmm7"); +			asm volatile("pand %xmm0,%xmm13"); +			asm volatile("pand %xmm0,%xmm15"); +			asm volatile("pxor %xmm5,%xmm4"); +			asm volatile("pxor %xmm7,%xmm6"); +			asm volatile("pxor %xmm13,%xmm12"); +			asm volatile("pxor %xmm15,%xmm14"); +			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); +			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); +			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); +			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); +			asm volatile("pxor %xmm5,%xmm2"); +			asm volatile("pxor %xmm7,%xmm3"); +			asm volatile("pxor %xmm13,%xmm10"); +			asm volatile("pxor %xmm15,%xmm11"); +			asm volatile("pxor %xmm5,%xmm4"); +			asm volatile("pxor %xmm7,%xmm6"); +			asm volatile("pxor %xmm13,%xmm12"); +			asm volatile("pxor %xmm15,%xmm14"); +			asm volatile("pxor %xmm5,%xmm5"); +			asm volatile("pxor %xmm7,%xmm7"); +			asm volatile("pxor %xmm13,%xmm13"); +			asm volatile("pxor %xmm15,%xmm15"); +		} +		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); +		asm volatile("pxor %xmm2,%xmm2"); +		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); +		asm volatile("pxor %xmm3,%xmm3"); +		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); +		asm volatile("pxor %xmm10,%xmm10"); +		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); +		asm volatile("pxor %xmm11,%xmm11"); +		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); +		asm volatile("pxor %xmm4,%xmm4"); +		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); +		asm volatile("pxor %xmm6,%xmm6"); +		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); +		asm volatile("pxor %xmm12,%xmm12"); +		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); +		asm volatile("pxor %xmm14,%xmm14"); +	} +	asm volatile("sfence" : : : "memory"); +	raid6_after_sse16(&sa); +} + +const struct raid6_calls raid6_sse2x4 = { +	raid6_sse24_gen_syndrome, +	raid6_have_sse2, +	"sse2x4", +	1			/* Has cache hints */ +}; + +#endif diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile new file mode 100644 index 00000000000..55780672860 --- /dev/null +++ b/drivers/md/raid6test/Makefile @@ -0,0 +1,58 @@ +# +# This is a simple Makefile to test some of the RAID-6 code +# from userspace. +# + +CC	 = gcc +OPTFLAGS = -O2			# Adjust as desired +CFLAGS	 = -I.. 
-g $(OPTFLAGS) +LD	 = ld +PERL	 = perl + +.c.o: +	$(CC) $(CFLAGS) -c -o $@ $< + +%.c: ../%.c +	cp -f $< $@ + +%.uc: ../%.uc +	cp -f $< $@ + +all:	raid6.o raid6test + +raid6.o: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ +	 raid6int32.o \ +	 raid6mmx.o raid6sse1.o raid6sse2.o \ +	 raid6recov.o raid6algos.o \ +	 raid6tables.o +	$(LD) -r -o $@ $^ + +raid6test: raid6.o test.c +	$(CC) $(CFLAGS) -o raid6test $^ + +raid6int1.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 1 < raid6int.uc > $@ + +raid6int2.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 2 < raid6int.uc > $@ + +raid6int4.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 4 < raid6int.uc > $@ + +raid6int8.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 8 < raid6int.uc > $@ + +raid6int16.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 16 < raid6int.uc > $@ + +raid6int32.c: raid6int.uc ../unroll.pl +	$(PERL) ../unroll.pl 32 < raid6int.uc > $@ + +raid6tables.c: mktables +	./mktables > raid6tables.c + +clean: +	rm -f *.o mktables mktables.c raid6int.uc raid6*.c raid6test + +spotless: clean +	rm -f *~ diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c new file mode 100644 index 00000000000..0d5cd57accd --- /dev/null +++ b/drivers/md/raid6test/test.c @@ -0,0 +1,103 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6test.c + * + * Test RAID-6 recovery with various algorithms + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "raid6.h" + +#define NDISKS		16	/* Including P and Q */ + +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +struct raid6_calls raid6_call; + +char *dataptrs[NDISKS]; +char data[NDISKS][PAGE_SIZE]; +char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; + +void makedata(void) +{ +	int i, j; + +	for (  i = 0 ; i < NDISKS ; i++ ) { +		for ( j = 0 ; j < PAGE_SIZE ; j++ ) { +			data[i][j] = rand(); +		} +		dataptrs[i] = data[i]; +	} +} + +int main(int argc, char *argv[]) +{ +	const struct raid6_calls * const * algo; +	int i, j; +	int erra, errb; + +	makedata(); + +	for ( algo = raid6_algos ; *algo ; algo++ ) { +		if ( !(*algo)->valid || (*algo)->valid() ) { +			raid6_call = **algo; + +			/* Nuke syndromes */ +			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + +			/* Generate assumed good syndrome */ +			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs); + +			for ( i = 0 ; i < NDISKS-1 ; i++ ) { +				for ( j = i+1 ; j < NDISKS ; j++ ) { +					memset(recovi, 0xf0, PAGE_SIZE); +					memset(recovj, 0xba, PAGE_SIZE); + +					dataptrs[i] = recovi; +					dataptrs[j] = recovj; + +					raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + +					erra = memcmp(data[i], recovi, PAGE_SIZE); +					errb = memcmp(data[j], recovj, PAGE_SIZE); + +					if ( i < NDISKS-2 && j == NDISKS-1 ) { +						/* We don't implement the DQ failure scenario, since it's +						   equivalent to a RAID-5 failure (XOR, then recompute Q) */ +					} else { +						printf("algo=%-8s  faila=%3d(%c)  failb=%3d(%c)  %s\n", +						       raid6_call.name, +						       i, (i==NDISKS-2)?'P':'D', +						       j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D', +						       (!erra && !errb) ? "OK" : +						       !erra ? "ERRB" : +						       !errb ? "ERRA" : +						       "ERRAB"); +					} + +					dataptrs[i] = data[i]; +					dataptrs[j] = data[j]; +				} +			} +		} +		printf("\n"); +	} + +	printf("\n"); +	/* Pick the best algorithm test */ +	raid6_select_algo(); + +	return 0; +} diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h new file mode 100644 index 00000000000..4cf20534fe4 --- /dev/null +++ b/drivers/md/raid6x86.h @@ -0,0 +1,245 @@ +/* ----------------------------------------------------------------------- * + * + *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Bostom MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6x86.h + * + * Definitions common to x86 and x86-64 RAID-6 code only + */ + +#ifndef LINUX_RAID_RAID6X86_H +#define LINUX_RAID_RAID6X86_H + +#if defined(__i386__) || defined(__x86_64__) + +#ifdef __x86_64__ + +typedef struct { +	unsigned int fsave[27]; +	unsigned long cr0; +} raid6_mmx_save_t __attribute__((aligned(16))); + +/* N.B.: For SSE we only save %xmm0-%xmm7 even for x86-64, since +   the code doesn't know about the additional x86-64 registers */ +typedef struct { +	unsigned int sarea[8*4+2]; +	unsigned long cr0; +} raid6_sse_save_t __attribute__((aligned(16))); + +/* This is for x86-64-specific code which uses all 16 XMM registers */ +typedef struct { +	unsigned int sarea[16*4+2]; +	unsigned long cr0; +} raid6_sse16_save_t __attribute__((aligned(16))); + +/* On x86-64 the stack *SHOULD* be 16-byte aligned, but currently this +   is buggy in the kernel and it's only 8-byte aligned in places, so +   we need to do this anyway.  Sigh. */ +#define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) + +#else /* __i386__ */ + +typedef struct { +	unsigned int fsave[27]; +	unsigned long cr0; +} raid6_mmx_save_t; + +/* On i386, the stack is only 8-byte aligned, but SSE requires 16-byte +   alignment.  The +3 is so we have the slack space to manually align +   a properly-sized area correctly.  */ +typedef struct { +	unsigned int sarea[8*4+3]; +	unsigned long cr0; +} raid6_sse_save_t; + +/* Find the 16-byte aligned save area */ +#define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) + +#endif + +#ifdef __KERNEL__ /* Real code */ + +/* Note: %cr0 is 32 bits on i386 and 64 bits on x86-64 */ + +static inline unsigned long raid6_get_fpu(void) +{ +	unsigned long cr0; + +	preempt_disable(); +	asm volatile("mov %%cr0,%0 ; clts" : "=r" (cr0)); +	return cr0; +} + +static inline void raid6_put_fpu(unsigned long cr0) +{ +	asm volatile("mov %0,%%cr0" : : "r" (cr0)); +	preempt_enable(); +} + +#else /* Dummy code for user space testing */ + +static inline unsigned long raid6_get_fpu(void) +{ +	return 0xf00ba6; +} + +static inline void raid6_put_fpu(unsigned long cr0) +{ +	(void)cr0; +} + +#endif + +static inline void raid6_before_mmx(raid6_mmx_save_t *s) +{ +	s->cr0 = raid6_get_fpu(); +	asm volatile("fsave %0 ; fwait" : "=m" (s->fsave[0])); +} + +static inline void raid6_after_mmx(raid6_mmx_save_t *s) +{ +	asm volatile("frstor %0" : : "m" (s->fsave[0])); +	raid6_put_fpu(s->cr0); +} + +static inline void raid6_before_sse(raid6_sse_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	s->cr0 = raid6_get_fpu(); + +	asm volatile("movaps %%xmm0,%0" : "=m" (rsa[0])); +	asm volatile("movaps %%xmm1,%0" : "=m" (rsa[4])); +	asm volatile("movaps %%xmm2,%0" : "=m" (rsa[8])); +	asm volatile("movaps %%xmm3,%0" : "=m" (rsa[12])); +	asm volatile("movaps %%xmm4,%0" : "=m" (rsa[16])); +	asm volatile("movaps %%xmm5,%0" : "=m" (rsa[20])); +	asm volatile("movaps %%xmm6,%0" : "=m" (rsa[24])); +	asm volatile("movaps %%xmm7,%0" : "=m" (rsa[28])); +} + +static inline void raid6_after_sse(raid6_sse_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	asm volatile("movaps %0,%%xmm0" : : "m" (rsa[0])); +	asm volatile("movaps %0,%%xmm1" : : "m" (rsa[4])); +	asm volatile("movaps %0,%%xmm2" : : "m" (rsa[8])); +	asm volatile("movaps %0,%%xmm3" : : "m" (rsa[12])); +	asm volatile("movaps %0,%%xmm4" : : "m" (rsa[16])); +	asm volatile("movaps %0,%%xmm5" : : "m" (rsa[20])); +	asm volatile("movaps %0,%%xmm6" : : "m" 
(rsa[24])); +	asm volatile("movaps %0,%%xmm7" : : "m" (rsa[28])); + +	raid6_put_fpu(s->cr0); +} + +static inline void raid6_before_sse2(raid6_sse_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	s->cr0 = raid6_get_fpu(); + +	asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); +	asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); +	asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); +	asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); +	asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); +	asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); +	asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); +	asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); +} + +static inline void raid6_after_sse2(raid6_sse_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); +	asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); +	asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); +	asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); +	asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); +	asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); +	asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); +	asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); + +	raid6_put_fpu(s->cr0); +} + +#ifdef __x86_64__ + +static inline void raid6_before_sse16(raid6_sse16_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	s->cr0 = raid6_get_fpu(); + +	asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); +	asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); +	asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); +	asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); +	asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); +	asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); +	asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); +	asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); +	asm volatile("movdqa %%xmm8,%0" : "=m" (rsa[32])); +	asm volatile("movdqa %%xmm9,%0" : "=m" (rsa[36])); +	asm volatile("movdqa %%xmm10,%0" : "=m" (rsa[40])); +	asm volatile("movdqa %%xmm11,%0" : "=m" (rsa[44])); +	asm volatile("movdqa %%xmm12,%0" : "=m" (rsa[48])); +	asm volatile("movdqa %%xmm13,%0" : "=m" (rsa[52])); +	asm volatile("movdqa %%xmm14,%0" : "=m" (rsa[56])); +	asm volatile("movdqa %%xmm15,%0" : "=m" (rsa[60])); +} + +static inline void raid6_after_sse16(raid6_sse16_save_t *s) +{ +	unsigned int *rsa = SAREA(s); + +	asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); +	asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); +	asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); +	asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); +	asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); +	asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); +	asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); +	asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); +	asm volatile("movdqa %0,%%xmm8" : : "m" (rsa[32])); +	asm volatile("movdqa %0,%%xmm9" : : "m" (rsa[36])); +	asm volatile("movdqa %0,%%xmm10" : : "m" (rsa[40])); +	asm volatile("movdqa %0,%%xmm11" : : "m" (rsa[44])); +	asm volatile("movdqa %0,%%xmm12" : : "m" (rsa[48])); +	asm volatile("movdqa %0,%%xmm13" : : "m" (rsa[52])); +	asm volatile("movdqa %0,%%xmm14" : : "m" (rsa[56])); +	asm volatile("movdqa %0,%%xmm15" : : "m" (rsa[60])); + +	raid6_put_fpu(s->cr0); +} + +#endif /* __x86_64__ */ + +/* User space test hack */ +#ifndef __KERNEL__ +static inline int cpuid_features(void) +{ +	u32 eax = 1; +	u32 ebx, ecx, edx; + +	asm volatile("cpuid" : +		     "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)); + +	return edx; +} +#endif /* ndef __KERNEL__ */ + +#endif +#endif diff --git a/drivers/md/unroll.pl 
b/drivers/md/unroll.pl new file mode 100644 index 00000000000..3acc710a20e --- /dev/null +++ b/drivers/md/unroll.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl +# +# Take a piece of C code and for each line which contains the sequence $$ +# repeat n times with $ replaced by 0...n-1; the sequence $# is replaced +# by the unrolling factor, and $* with a single $ +# + +($n) = @ARGV; +$n += 0; + +while ( defined($line = <STDIN>) ) { +    if ( $line =~ /\$\$/ ) { +	$rep = $n; +    } else { +	$rep = 1; +    } +    for ( $i = 0 ; $i < $rep ; $i++ ) { +	$tmp = $line; +	$tmp =~ s/\$\$/$i/g; +	$tmp =~ s/\$\#/$n/g; +	$tmp =~ s/\$\*/\$/g; +	print $tmp; +    } +} diff --git a/drivers/md/xor.c b/drivers/md/xor.c new file mode 100644 index 00000000000..324897c4be4 --- /dev/null +++ b/drivers/md/xor.c @@ -0,0 +1,154 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999, 2000, + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. + * + * Dispatch optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define BH_TRACE 0 +#include <linux/module.h> +#include <linux/raid/md.h> +#include <linux/raid/xor.h> +#include <asm/xor.h> + +/* The xor routines to use.  */ +static struct xor_block_template *active_template; + +void +xor_block(unsigned int count, unsigned int bytes, void **ptr) +{ +	unsigned long *p0, *p1, *p2, *p3, *p4; + +	p0 = (unsigned long *) ptr[0]; +	p1 = (unsigned long *) ptr[1]; +	if (count == 2) { +		active_template->do_2(bytes, p0, p1); +		return; +	} + +	p2 = (unsigned long *) ptr[2]; +	if (count == 3) { +		active_template->do_3(bytes, p0, p1, p2); +		return; +	} + +	p3 = (unsigned long *) ptr[3]; +	if (count == 4) { +		active_template->do_4(bytes, p0, p1, p2, p3); +		return; +	} + +	p4 = (unsigned long *) ptr[4]; +	active_template->do_5(bytes, p0, p1, p2, p3, p4); +} + +/* Set of all registered templates.  */ +static struct xor_block_template *template_list; + +#define BENCH_SIZE (PAGE_SIZE) + +static void +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) +{ +	int speed; +	unsigned long now; +	int i, count, max; + +	tmpl->next = template_list; +	template_list = tmpl; + +	/* +	 * Count the number of XORs done during a whole jiffy, and use +	 * this to calculate the speed of checksumming.  We use a 2-page +	 * allocation to have guaranteed color L1-cache layout. +	 */ +	max = 0; +	for (i = 0; i < 5; i++) { +		now = jiffies; +		count = 0; +		while (jiffies == now) { +			mb(); +			tmpl->do_2(BENCH_SIZE, b1, b2); +			mb(); +			count++; +			mb(); +		} +		if (count > max) +			max = count; +	} + +	speed = max * (HZ * BENCH_SIZE / 1024); +	tmpl->speed = speed; + +	printk("   %-10s: %5d.%03d MB/sec\n", tmpl->name, +	       speed / 1000, speed % 1000); +} + +static int +calibrate_xor_block(void) +{ +	void *b1, *b2; +	struct xor_block_template *f, *fastest; + +	b1 = (void *) __get_free_pages(GFP_KERNEL, 2); +	if (! b1) { +		printk("raid5: Yikes!  
No memory available.\n"); +		return -ENOMEM; +	} +	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; + +	/* +	 * If this arch/cpu has a short-circuited selection, don't loop through all +	 * the possible functions, just test the best one +	 */ + +	fastest = NULL; + +#ifdef XOR_SELECT_TEMPLATE +		fastest = XOR_SELECT_TEMPLATE(fastest); +#endif + +#define xor_speed(templ)	do_xor_speed((templ), b1, b2) + +	if (fastest) { +		printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n", +			fastest->name); +		xor_speed(fastest); +	} else { +		printk(KERN_INFO "raid5: measuring checksumming speed\n"); +		XOR_TRY_TEMPLATES; +		fastest = template_list; +		for (f = fastest; f; f = f->next) +			if (f->speed > fastest->speed) +				fastest = f; +	} + +	printk("raid5: using function: %s (%d.%03d MB/sec)\n", +	       fastest->name, fastest->speed / 1000, fastest->speed % 1000); + +#undef xor_speed + +	free_pages((unsigned long)b1, 2); + +	active_template = fastest; +	return 0; +} + +static __exit void xor_exit(void) { } + +EXPORT_SYMBOL(xor_block); +MODULE_LICENSE("GPL"); + +module_init(calibrate_xor_block); +module_exit(xor_exit);  |
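
A note on the SIMD inner loops in raid6sse1.c and raid6sse2.c above: the pcmpgtb/paddb/pand/pxor sequence is a branch-free multiply-by-two in GF(2^8) applied to the running Q accumulator, so the syndrome is evaluated by Horner's rule while P accumulates a plain XOR. The scalar sketch below shows the same recurrence one byte at a time; it is illustrative only, and helper names such as gf2_mul2 and gen_syndrome_ref are invented for this example rather than taken from the patch.

/*
 * Scalar sketch of the RAID-6 syndrome recurrence implemented by the
 * SSE-1/SSE-2 loops above: P is the XOR of all data blocks, Q is the
 * Reed-Solomon syndrome where the accumulator is multiplied by x (0x02)
 * in GF(2^8) mod x^8 + x^4 + x^3 + x^2 + 1 (0x11d) before each XOR.
 * Illustrative only; the kernel versions work 8/16/64 bytes at a time.
 */
#include <stddef.h>
#include <stdint.h>

static inline uint8_t gf2_mul2(uint8_t v)
{
	/* Branch-free: replicate the top bit into a mask (what pcmpgtb
	   does in the asm), shift left, and conditionally XOR in the
	   reduction polynomial 0x1d. */
	uint8_t mask = (uint8_t)-(v >> 7);
	return (uint8_t)((v << 1) ^ (mask & 0x1d));
}

static void gen_syndrome_ref(int disks, size_t bytes, uint8_t **dptr)
{
	int z0 = disks - 3;		/* Highest data disk */
	uint8_t *p = dptr[z0 + 1];	/* XOR parity */
	uint8_t *q = dptr[z0 + 2];	/* RS syndrome */

	for (size_t d = 0; d < bytes; d++) {
		uint8_t wp = dptr[z0][d];
		uint8_t wq = wp;
		for (int z = z0 - 1; z >= 0; z--) {
			wq = gf2_mul2(wq) ^ dptr[z][d];	/* Horner step */
			wp ^= dptr[z][d];
		}
		p[d] = wp;
		q[d] = wq;
	}
}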
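
The user-space feature tests in raid6sse1.c and raid6sse2.c compare CPUID leaf 1 EDX against the magic masks (5<<23) and (15<<23). Those decode as bit 23 = MMX, 24 = FXSR, 25 = SSE, 26 = SSE2, so (15<<23) requires MMX+FXSR+SSE+SSE2 and (5<<23) requires only MMX+SSE, which is why the comment notes it misses MMXEXT-only Athlons. A small sketch spelling the masks out; the macro names are invented here, and the GCC <cpuid.h> helper is used only for brevity.

/*
 * Decode the CPUID leaf 1 EDX feature bits used by raid6_have_sse2()
 * and raid6_have_sse1_or_mmxext() in their user-space fallbacks.
 */
#include <stdio.h>
#include <cpuid.h>

#define F_MMX	(1u << 23)
#define F_FXSR	(1u << 24)
#define F_SSE	(1u << 25)
#define F_SSE2	(1u << 26)

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	/* (5<<23) == F_MMX | F_SSE */
	printf("sse1 path usable: %s\n",
	       (edx & (F_MMX | F_SSE)) == (F_MMX | F_SSE) ? "yes" : "no");
	/* (15<<23) == F_MMX | F_FXSR | F_SSE | F_SSE2 */
	printf("sse2 path usable: %s\n",
	       (edx & (F_MMX | F_FXSR | F_SSE | F_SSE2)) ==
	       (F_MMX | F_FXSR | F_SSE | F_SSE2) ? "yes" : "no");
	return 0;
}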
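
The raid6test Makefile generates raid6int1.c through raid6int32.c by piping raid6int.uc through unroll.pl with different unrolling factors. As a rough illustration of the substitution rules ($$ becomes 0..n-1 on a repeated line, $# becomes the unroll factor, $* becomes a literal $), here is a stand-alone C reimplementation of the same expansion; it is only a sketch for readers who prefer C to Perl and is not used by the build.

/*
 * C sketch of what unroll.pl does: a line containing "$$" is emitted
 * n times with "$$" -> 0..n-1; "$#" -> n and "$*" -> "$" everywhere.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void emit(const char *line, int i, int n)
{
	for (const char *p = line; *p; p++) {
		if (p[0] == '$' && p[1] == '$') {
			printf("%d", i);
			p++;
		} else if (p[0] == '$' && p[1] == '#') {
			printf("%d", n);
			p++;
		} else if (p[0] == '$' && p[1] == '*') {
			putchar('$');
			p++;
		} else {
			putchar(*p);
		}
	}
}

int main(int argc, char *argv[])
{
	char line[4096];
	int n = (argc > 1) ? atoi(argv[1]) : 1;

	while (fgets(line, sizeof(line), stdin)) {
		int rep = strstr(line, "$$") ? n : 1;	/* repeat only $$ lines */
		for (int i = 0; i < rep; i++)
			emit(line, i, n);
	}
	return 0;
}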
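
calibrate_xor_block() in xor.c picks the fastest checksumming routine empirically: it counts how many block XORs complete within one jiffy, keeps the best of five runs, and converts the count into a throughput figure. A user-space sketch of the same measure-and-pick idea follows, using a fixed ~10 ms window instead of a jiffy; the names, buffer sizes, and window length are assumptions made for this example, not part of the kernel code.

/*
 * Userspace analogue of do_xor_speed(): time a fixed window, count the
 * block XORs that fit, and keep the best of five runs.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define BENCH_SIZE 4096

static void xor_2(size_t bytes, unsigned long *p0, const unsigned long *p1)
{
	for (size_t i = 0; i < bytes / sizeof(unsigned long); i++)
		p0[i] ^= p1[i];
}

static long bench_once(unsigned long *b1, unsigned long *b2)
{
	struct timespec start, now;
	long count = 0;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		xor_2(BENCH_SIZE, b1, b2);
		count++;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while ((now.tv_sec - start.tv_sec) * 1000000000L +
		 (now.tv_nsec - start.tv_nsec) < 10 * 1000000L);	/* ~10 ms */

	return count;
}

int main(void)
{
	static unsigned long b1[BENCH_SIZE / sizeof(unsigned long)];
	static unsigned long b2[BENCH_SIZE / sizeof(unsigned long)];
	long best = 0;

	memset(b2, 0x5a, sizeof(b2));
	for (int i = 0; i < 5; i++) {	/* best of five, as in the kernel */
		long c = bench_once(b1, b2);
		if (c > best)
			best = c;
	}
	/* iterations per 10 ms * 100 = per second; each iteration is 4 KiB */
	printf("xor: %ld KB/sec\n", best * 100 * (BENCH_SIZE / 1024));
	return 0;
}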