Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 897
1 file changed, 807 insertions, 90 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index dfc9425db70..5404b229582 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,  }  EXPORT_SYMBOL_GPL(bio_clone_mddev); +void md_trim_bio(struct bio *bio, int offset, int size) +{ +	/* 'bio' is a cloned bio which we need to trim to match +	 * the given offset and size. +	 * This requires adjusting bi_sector, bi_size, and bi_io_vec +	 */ +	int i; +	struct bio_vec *bvec; +	int sofar = 0; + +	size <<= 9; +	if (offset == 0 && size == bio->bi_size) +		return; + +	bio->bi_sector += offset; +	bio->bi_size = size; +	offset <<= 9; +	clear_bit(BIO_SEG_VALID, &bio->bi_flags); + +	while (bio->bi_idx < bio->bi_vcnt && +	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { +		/* remove this whole bio_vec */ +		offset -= bio->bi_io_vec[bio->bi_idx].bv_len; +		bio->bi_idx++; +	} +	if (bio->bi_idx < bio->bi_vcnt) { +		bio->bi_io_vec[bio->bi_idx].bv_offset += offset; +		bio->bi_io_vec[bio->bi_idx].bv_len -= offset; +	} +	/* avoid any complications with bi_idx being non-zero*/ +	if (bio->bi_idx) { +		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, +			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); +		bio->bi_vcnt -= bio->bi_idx; +		bio->bi_idx = 0; +	} +	/* Make sure vcnt and last bv are not too big */ +	bio_for_each_segment(bvec, bio, i) { +		if (sofar + bvec->bv_len > size) +			bvec->bv_len = size - sofar; +		if (bvec->bv_len == 0) { +			bio->bi_vcnt = i; +			break; +		} +		sofar += bvec->bv_len; +	} +} +EXPORT_SYMBOL_GPL(md_trim_bio); +  /*   * We have a system wide 'event count' that is incremented   * on any 'interesting' event, and readers of /proc/mdstat @@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)  		rdev->sb_start = 0;  		rdev->sectors = 0;  	} +	if (rdev->bb_page) { +		put_page(rdev->bb_page); +		rdev->bb_page = NULL; +	}  } @@ -795,7 +848,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,  	bio->bi_end_io = super_written;  	atomic_inc(&mddev->pending_writes); -	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); +	submit_bio(WRITE_FLUSH_FUA, bio);  }  void md_super_wait(mddev_t *mddev) @@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version  	ret = -EINVAL;  	bdevname(rdev->bdev, b); -	sb = (mdp_super_t*)page_address(rdev->sb_page); +	sb = page_address(rdev->sb_page);  	if (sb->md_magic != MD_SB_MAGIC) {  		printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version  	rdev->preferred_minor = sb->md_minor;  	rdev->data_offset = 0;  	rdev->sb_size = MD_SB_BYTES; +	rdev->badblocks.shift = -1;  	if (sb->level == LEVEL_MULTIPATH)  		rdev->desc_nr = -1; @@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version  		ret = 1;  	} else {  		__u64 ev1, ev2; -		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); +		mdp_super_t *refsb = page_address(refdev->sb_page);  		if (!uuid_equal(refsb, sb)) {  			printk(KERN_WARNING "md: %s has different UUID to %s\n",  				b, bdevname(refdev->bdev,b2)); @@ -1084,8 +1138,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version  			ret = 0;  	}  	rdev->sectors = rdev->sb_start; +	/* Limit to 4TB as metadata cannot record more than that */ +	if (rdev->sectors >= (2ULL << 32)) +		rdev->sectors = (2ULL << 32) - 2; -	if (rdev->sectors < sb->size * 2 && 
sb->level > 1) +	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)  		/* "this cannot possibly happen" ... */  		ret = -EINVAL; @@ -1099,7 +1156,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version  static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)  {  	mdp_disk_t *desc; -	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); +	mdp_super_t *sb = page_address(rdev->sb_page);  	__u64 ev1 = md_event(sb);  	rdev->raid_disk = -1; @@ -1119,7 +1176,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)  		mddev->clevel[0] = 0;  		mddev->layout = sb->layout;  		mddev->raid_disks = sb->raid_disks; -		mddev->dev_sectors = sb->size * 2; +		mddev->dev_sectors = ((sector_t)sb->size) * 2;  		mddev->events = ev1;  		mddev->bitmap_info.offset = 0;  		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; @@ -1230,7 +1287,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)  	rdev->sb_size = MD_SB_BYTES; -	sb = (mdp_super_t*)page_address(rdev->sb_page); +	sb = page_address(rdev->sb_page);  	memset(sb, 0, sizeof(*sb)); @@ -1361,6 +1418,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)  	rdev->sb_start = calc_dev_sboffset(rdev);  	if (!num_sectors || num_sectors > rdev->sb_start)  		num_sectors = rdev->sb_start; +	/* Limit to 4TB as metadata cannot record more than that. +	 * 4TB == 2^32 KB, or 2*2^32 sectors. +	 */ +	if (num_sectors >= (2ULL << 32)) +		num_sectors = (2ULL << 32) - 2;  	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,  		       rdev->sb_page);  	md_super_wait(rdev->mddev); @@ -1395,6 +1457,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)  	return cpu_to_le32(csum);  } +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, +			    int acknowledged);  static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)  {  	struct mdp_superblock_1 *sb; @@ -1435,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)  	if (ret) return ret; -	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +	sb = page_address(rdev->sb_page);  	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||  	    sb->major_version != cpu_to_le32(1) || @@ -1473,12 +1537,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)  	else  		rdev->desc_nr = le32_to_cpu(sb->dev_number); +	if (!rdev->bb_page) { +		rdev->bb_page = alloc_page(GFP_KERNEL); +		if (!rdev->bb_page) +			return -ENOMEM; +	} +	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && +	    rdev->badblocks.count == 0) { +		/* need to load the bad block list. +		 * Currently we limit it to one page. 
+		 */ +		s32 offset; +		sector_t bb_sector; +		u64 *bbp; +		int i; +		int sectors = le16_to_cpu(sb->bblog_size); +		if (sectors > (PAGE_SIZE / 512)) +			return -EINVAL; +		offset = le32_to_cpu(sb->bblog_offset); +		if (offset == 0) +			return -EINVAL; +		bb_sector = (long long)offset; +		if (!sync_page_io(rdev, bb_sector, sectors << 9, +				  rdev->bb_page, READ, true)) +			return -EIO; +		bbp = (u64 *)page_address(rdev->bb_page); +		rdev->badblocks.shift = sb->bblog_shift; +		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { +			u64 bb = le64_to_cpu(*bbp); +			int count = bb & (0x3ff); +			u64 sector = bb >> 10; +			sector <<= sb->bblog_shift; +			count <<= sb->bblog_shift; +			if (bb + 1 == 0) +				break; +			if (md_set_badblocks(&rdev->badblocks, +					     sector, count, 1) == 0) +				return -EINVAL; +		} +	} else if (sb->bblog_offset == 0) +		rdev->badblocks.shift = -1; +  	if (!refdev) {  		ret = 1;  	} else {  		__u64 ev1, ev2; -		struct mdp_superblock_1 *refsb =  -			(struct mdp_superblock_1*)page_address(refdev->sb_page); +		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);  		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||  		    sb->level != refsb->level || @@ -1513,7 +1617,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)  static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)  { -	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);  	__u64 ev1 = le64_to_cpu(sb->events);  	rdev->raid_disk = -1; @@ -1619,13 +1723,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)  	int max_dev, i;  	/* make rdev->sb match mddev and rdev data. */ -	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +	sb = page_address(rdev->sb_page);  	sb->feature_map = 0;  	sb->pad0 = 0;  	sb->recovery_offset = cpu_to_le64(0);  	memset(sb->pad1, 0, sizeof(sb->pad1)); -	memset(sb->pad2, 0, sizeof(sb->pad2));  	memset(sb->pad3, 0, sizeof(sb->pad3));  	sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1643,6 +1746,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)  	sb->level = cpu_to_le32(mddev->level);  	sb->layout = cpu_to_le32(mddev->layout); +	if (test_bit(WriteMostly, &rdev->flags)) +		sb->devflags |= WriteMostly1; +	else +		sb->devflags &= ~WriteMostly1; +  	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {  		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);  		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); @@ -1665,6 +1773,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)  		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);  	} +	if (rdev->badblocks.count == 0) +		/* Nothing to do for bad blocks*/ ; +	else if (sb->bblog_offset == 0) +		/* Cannot record bad blocks on this device */ +		md_error(mddev, rdev); +	else { +		struct badblocks *bb = &rdev->badblocks; +		u64 *bbp = (u64 *)page_address(rdev->bb_page); +		u64 *p = bb->page; +		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); +		if (bb->changed) { +			unsigned seq; + +retry: +			seq = read_seqbegin(&bb->lock); + +			memset(bbp, 0xff, PAGE_SIZE); + +			for (i = 0 ; i < bb->count ; i++) { +				u64 internal_bb = *p++; +				u64 store_bb = ((BB_OFFSET(internal_bb) << 10) +						| BB_LEN(internal_bb)); +				*bbp++ = cpu_to_le64(store_bb); +			} +			if (read_seqretry(&bb->lock, seq)) +				goto retry; + +			bb->sector = (rdev->sb_start + +				      (int)le32_to_cpu(sb->bblog_offset)); +			bb->size = 
le16_to_cpu(sb->bblog_size); +			bb->changed = 0; +		} +	} +  	max_dev = 0;  	list_for_each_entry(rdev2, &mddev->disks, same_set)  		if (rdev2->desc_nr+1 > max_dev) @@ -1724,7 +1866,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)  			num_sectors = max_sectors;  		rdev->sb_start = sb_start;  	} -	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); +	sb = page_address(rdev->sb_page);  	sb->data_size = cpu_to_le64(num_sectors);  	sb->super_offset = rdev->sb_start;  	sb->sb_csum = calc_sb_1_csum(sb); @@ -1922,7 +2064,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)  	bd_link_disk_holder(rdev->bdev, mddev->gendisk);  	/* May as well allow recovery to be retried once */ -	mddev->recovery_disabled = 0; +	mddev->recovery_disabled++;  	return 0; @@ -1953,6 +2095,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)  	sysfs_remove_link(&rdev->kobj, "block");  	sysfs_put(rdev->sysfs_state);  	rdev->sysfs_state = NULL; +	kfree(rdev->badblocks.page); +	rdev->badblocks.count = 0; +	rdev->badblocks.page = NULL;  	/* We need to delay this, otherwise we can deadlock when  	 * writing to 'remove' to "dev/state".  We also need  	 * to delay it due to rcu usage. @@ -2127,10 +2272,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)  		printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);  		switch (major_version) {  		case 0: -			print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); +			print_sb_90(page_address(rdev->sb_page));  			break;  		case 1: -			print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); +			print_sb_1(page_address(rdev->sb_page));  			break;  		}  	} else @@ -2194,6 +2339,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)  	mdk_rdev_t *rdev;  	int sync_req;  	int nospares = 0; +	int any_badblocks_changed = 0;  repeat:  	/* First make sure individual recovery_offsets are correct */ @@ -2208,8 +2354,18 @@ repeat:  	if (!mddev->persistent) {  		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);  		clear_bit(MD_CHANGE_DEVS, &mddev->flags); -		if (!mddev->external) +		if (!mddev->external) {  			clear_bit(MD_CHANGE_PENDING, &mddev->flags); +			list_for_each_entry(rdev, &mddev->disks, same_set) { +				if (rdev->badblocks.changed) { +					md_ack_all_badblocks(&rdev->badblocks); +					md_error(mddev, rdev); +				} +				clear_bit(Blocked, &rdev->flags); +				clear_bit(BlockedBadBlocks, &rdev->flags); +				wake_up(&rdev->blocked_wait); +			} +		}  		wake_up(&mddev->sb_wait);  		return;  	} @@ -2265,6 +2421,14 @@ repeat:  		MD_BUG();  		mddev->events --;  	} + +	list_for_each_entry(rdev, &mddev->disks, same_set) { +		if (rdev->badblocks.changed) +			any_badblocks_changed++; +		if (test_bit(Faulty, &rdev->flags)) +			set_bit(FaultRecorded, &rdev->flags); +	} +  	sync_sbs(mddev, nospares);  	spin_unlock_irq(&mddev->write_lock); @@ -2290,6 +2454,13 @@ repeat:  				bdevname(rdev->bdev,b),  				(unsigned long long)rdev->sb_start);  			rdev->sb_events = mddev->events; +			if (rdev->badblocks.size) { +				md_super_write(mddev, rdev, +					       rdev->badblocks.sector, +					       rdev->badblocks.size << 9, +					       rdev->bb_page); +				rdev->badblocks.size = 0; +			}  		} else  			dprintk(")\n"); @@ -2313,6 +2484,15 @@ repeat:  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))  		sysfs_notify(&mddev->kobj, NULL, "sync_completed"); +	list_for_each_entry(rdev, &mddev->disks, same_set) { +		if (test_and_clear_bit(FaultRecorded, &rdev->flags)) +			clear_bit(Blocked, &rdev->flags); + +		if 
(any_badblocks_changed) +			md_ack_all_badblocks(&rdev->badblocks); +		clear_bit(BlockedBadBlocks, &rdev->flags); +		wake_up(&rdev->blocked_wait); +	}  }  /* words written to sysfs files may, or may not, be \n terminated. @@ -2347,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)  	char *sep = "";  	size_t len = 0; -	if (test_bit(Faulty, &rdev->flags)) { +	if (test_bit(Faulty, &rdev->flags) || +	    rdev->badblocks.unacked_exist) {  		len+= sprintf(page+len, "%sfaulty",sep);  		sep = ",";  	} @@ -2359,7 +2540,8 @@ state_show(mdk_rdev_t *rdev, char *page)  		len += sprintf(page+len, "%swrite_mostly",sep);  		sep = ",";  	} -	if (test_bit(Blocked, &rdev->flags)) { +	if (test_bit(Blocked, &rdev->flags) || +	    rdev->badblocks.unacked_exist) {  		len += sprintf(page+len, "%sblocked", sep);  		sep = ",";  	} @@ -2368,6 +2550,10 @@ state_show(mdk_rdev_t *rdev, char *page)  		len += sprintf(page+len, "%sspare", sep);  		sep = ",";  	} +	if (test_bit(WriteErrorSeen, &rdev->flags)) { +		len += sprintf(page+len, "%swrite_error", sep); +		sep = ","; +	}  	return len+sprintf(page+len, "\n");  } @@ -2375,18 +2561,23 @@ static ssize_t  state_store(mdk_rdev_t *rdev, const char *buf, size_t len)  {  	/* can write -	 *  faulty  - simulates and error +	 *  faulty  - simulates an error  	 *  remove  - disconnects the device  	 *  writemostly - sets write_mostly  	 *  -writemostly - clears write_mostly -	 *  blocked - sets the Blocked flag -	 *  -blocked - clears the Blocked flag +	 *  blocked - sets the Blocked flags +	 *  -blocked - clears the Blocked and possibly simulates an error  	 *  insync - sets Insync providing device isn't active +	 *  write_error - sets WriteErrorSeen +	 *  -write_error - clears WriteErrorSeen  	 */  	int err = -EINVAL;  	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {  		md_error(rdev->mddev, rdev); -		err = 0; +		if (test_bit(Faulty, &rdev->flags)) +			err = 0; +		else +			err = -EBUSY;  	} else if (cmd_match(buf, "remove")) {  		if (rdev->raid_disk >= 0)  			err = -EBUSY; @@ -2408,7 +2599,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)  		set_bit(Blocked, &rdev->flags);  		err = 0;  	} else if (cmd_match(buf, "-blocked")) { +		if (!test_bit(Faulty, &rdev->flags) && +		    rdev->badblocks.unacked_exist) { +			/* metadata handler doesn't understand badblocks, +			 * so we need to fail the device +			 */ +			md_error(rdev->mddev, rdev); +		}  		clear_bit(Blocked, &rdev->flags); +		clear_bit(BlockedBadBlocks, &rdev->flags);  		wake_up(&rdev->blocked_wait);  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);  		md_wakeup_thread(rdev->mddev->thread); @@ -2417,6 +2616,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)  	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {  		set_bit(In_sync, &rdev->flags);  		err = 0; +	} else if (cmd_match(buf, "write_error")) { +		set_bit(WriteErrorSeen, &rdev->flags); +		err = 0; +	} else if (cmd_match(buf, "-write_error")) { +		clear_bit(WriteErrorSeen, &rdev->flags); +		err = 0;  	}  	if (!err)  		sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2459,7 +2664,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)  {  	char *e;  	int err; -	char nm[20];  	int slot = simple_strtoul(buf, &e, 10);  	if (strncmp(buf, "none", 4)==0)  		slot = -1; @@ -2482,8 +2686,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)  			hot_remove_disk(rdev->mddev, rdev->raid_disk);  		if (err)  			return err; -		sprintf(nm, "rd%d", rdev->raid_disk); -		sysfs_remove_link(&rdev->mddev->kobj, nm); +		
sysfs_unlink_rdev(rdev->mddev, rdev);  		rdev->raid_disk = -1;  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);  		md_wakeup_thread(rdev->mddev->thread); @@ -2522,8 +2725,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)  			return err;  		} else  			sysfs_notify_dirent_safe(rdev->sysfs_state); -		sprintf(nm, "rd%d", rdev->raid_disk); -		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) +		if (sysfs_link_rdev(rdev->mddev, rdev))  			/* failure here is OK */;  		/* don't wakeup anyone, leave that to userspace. */  	} else { @@ -2712,6 +2914,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le  static struct rdev_sysfs_entry rdev_recovery_start =  __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(mdk_rdev_t *rdev, char *page) +{ +	return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) +{ +	int rv = badblocks_store(&rdev->badblocks, page, len, 0); +	/* Maybe that ack was all we needed */ +	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) +		wake_up(&rdev->blocked_wait); +	return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) +{ +	return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) +{ +	return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); +  static struct attribute *rdev_default_attrs[] = {  	&rdev_state.attr,  	&rdev_errors.attr, @@ -2719,6 +2954,8 @@ static struct attribute *rdev_default_attrs[] = {  	&rdev_offset.attr,  	&rdev_size.attr,  	&rdev_recovery_start.attr, +	&rdev_bad_blocks.attr, +	&rdev_unack_bad_blocks.attr,  	NULL,  };  static ssize_t @@ -2782,7 +3019,7 @@ static struct kobj_type rdev_ktype = {  	.default_attrs	= rdev_default_attrs,  }; -void md_rdev_init(mdk_rdev_t *rdev) +int md_rdev_init(mdk_rdev_t *rdev)  {  	rdev->desc_nr = -1;  	rdev->saved_raid_disk = -1; @@ -2792,12 +3029,27 @@ void md_rdev_init(mdk_rdev_t *rdev)  	rdev->sb_events = 0;  	rdev->last_read_error.tv_sec  = 0;  	rdev->last_read_error.tv_nsec = 0; +	rdev->sb_loaded = 0; +	rdev->bb_page = NULL;  	atomic_set(&rdev->nr_pending, 0);  	atomic_set(&rdev->read_errors, 0);  	atomic_set(&rdev->corrected_errors, 0);  	INIT_LIST_HEAD(&rdev->same_set);  	init_waitqueue_head(&rdev->blocked_wait); + +	/* Add space to store bad block list. 
+	 * This reserves the space even on arrays where it cannot +	 * be used - I wonder if that matters +	 */ +	rdev->badblocks.count = 0; +	rdev->badblocks.shift = 0; +	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); +	seqlock_init(&rdev->badblocks.lock); +	if (rdev->badblocks.page == NULL) +		return -ENOMEM; + +	return 0;  }  EXPORT_SYMBOL_GPL(md_rdev_init);  /* @@ -2823,8 +3075,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi  		return ERR_PTR(-ENOMEM);  	} -	md_rdev_init(rdev); -	if ((err = alloc_disk_sb(rdev))) +	err = md_rdev_init(rdev); +	if (err) +		goto abort_free; +	err = alloc_disk_sb(rdev); +	if (err)  		goto abort_free;  	err = lock_rdev(rdev, newdev, super_format == -2); @@ -2860,15 +3115,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi  			goto abort_free;  		}  	} +	if (super_format == -1) +		/* hot-add for 0.90, or non-persistent: so no badblocks */ +		rdev->badblocks.shift = -1;  	return rdev;  abort_free: -	if (rdev->sb_page) { -		if (rdev->bdev) -			unlock_rdev(rdev); -		free_disk_sb(rdev); -	} +	if (rdev->bdev) +		unlock_rdev(rdev); +	free_disk_sb(rdev); +	kfree(rdev->badblocks.page);  	kfree(rdev);  	return ERR_PTR(err);  } @@ -3149,15 +3406,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  	}  	list_for_each_entry(rdev, &mddev->disks, same_set) { -		char nm[20];  		if (rdev->raid_disk < 0)  			continue;  		if (rdev->new_raid_disk >= mddev->raid_disks)  			rdev->new_raid_disk = -1;  		if (rdev->new_raid_disk == rdev->raid_disk)  			continue; -		sprintf(nm, "rd%d", rdev->raid_disk); -		sysfs_remove_link(&mddev->kobj, nm); +		sysfs_unlink_rdev(mddev, rdev);  	}  	list_for_each_entry(rdev, &mddev->disks, same_set) {  		if (rdev->raid_disk < 0) @@ -3168,11 +3423,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)  		if (rdev->raid_disk < 0)  			clear_bit(In_sync, &rdev->flags);  		else { -			char nm[20]; -			sprintf(nm, "rd%d", rdev->raid_disk); -			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) -				printk("md: cannot register %s for %s after level change\n", -				       nm, mdname(mddev)); +			if (sysfs_link_rdev(mddev, rdev)) +				printk(KERN_WARNING "md: cannot register rd%d" +				       " for %s after level change\n", +				       rdev->raid_disk, mdname(mddev));  		}  	} @@ -4504,7 +4758,8 @@ int md_run(mddev_t *mddev)  	}  	if (mddev->bio_set == NULL) -		mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); +		mddev->bio_set = bioset_create(BIO_POOL_SIZE, +					       sizeof(mddev_t *));  	spin_lock(&pers_lock);  	pers = find_pers(mddev->level, mddev->clevel); @@ -4621,12 +4876,9 @@ int md_run(mddev_t *mddev)  	smp_wmb();  	mddev->ready = 1;  	list_for_each_entry(rdev, &mddev->disks, same_set) -		if (rdev->raid_disk >= 0) { -			char nm[20]; -			sprintf(nm, "rd%d", rdev->raid_disk); -			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) +		if (rdev->raid_disk >= 0) +			if (sysfs_link_rdev(mddev, rdev))  				/* failure here is OK */; -		}  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -4854,11 +5106,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)  		sysfs_notify_dirent_safe(mddev->sysfs_state);  		list_for_each_entry(rdev, &mddev->disks, same_set) -			if (rdev->raid_disk >= 0) { -				char nm[20]; -				sprintf(nm, "rd%d", rdev->raid_disk); -				sysfs_remove_link(&mddev->kobj, nm); -			} +			if (rdev->raid_disk >= 0) +				sysfs_unlink_rdev(mddev, rdev);  		set_capacity(disk, 0);  		mutex_unlock(&mddev->open_mutex); @@ -5750,6 
+5999,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)  		return -ENODEV;  	md_error(mddev, rdev); +	if (!test_bit(Faulty, &rdev->flags)) +		return -EBUSY;  	return 0;  } @@ -6198,18 +6449,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)  	if (!rdev || test_bit(Faulty, &rdev->flags))  		return; -	if (mddev->external) -		set_bit(Blocked, &rdev->flags); -/* -	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", -		mdname(mddev), -		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), -		__builtin_return_address(0),__builtin_return_address(1), -		__builtin_return_address(2),__builtin_return_address(3)); -*/ -	if (!mddev->pers) -		return; -	if (!mddev->pers->error_handler) +	if (!mddev->pers || !mddev->pers->error_handler)  		return;  	mddev->pers->error_handler(mddev,rdev);  	if (mddev->degraded) @@ -6933,11 +7173,14 @@ void md_do_sync(mddev_t *mddev)  			atomic_add(sectors, &mddev->recovery_active);  		} +		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) +			break; +  		j += sectors;  		if (j>1) mddev->curr_resync = j;  		mddev->curr_mark_cnt = io_sectors;  		if (last_check == 0) -			/* this is the earliers that rebuilt will be +			/* this is the earliest that rebuild will be  			 * visible in /proc/mdstat  			 */  			md_new_event(mddev); @@ -6946,10 +7189,6 @@ void md_do_sync(mddev_t *mddev)  			continue;  		last_check = io_sectors; - -		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) -			break; -  	repeat:  		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {  			/* step marks */ @@ -7067,29 +7306,23 @@ static int remove_and_add_spares(mddev_t *mddev)  		    atomic_read(&rdev->nr_pending)==0) {  			if (mddev->pers->hot_remove_disk(  				    mddev, rdev->raid_disk)==0) { -				char nm[20]; -				sprintf(nm,"rd%d", rdev->raid_disk); -				sysfs_remove_link(&mddev->kobj, nm); +				sysfs_unlink_rdev(mddev, rdev);  				rdev->raid_disk = -1;  			}  		} -	if (mddev->degraded && !mddev->recovery_disabled) { +	if (mddev->degraded) {  		list_for_each_entry(rdev, &mddev->disks, same_set) {  			if (rdev->raid_disk >= 0 &&  			    !test_bit(In_sync, &rdev->flags) && -			    !test_bit(Faulty, &rdev->flags) && -			    !test_bit(Blocked, &rdev->flags)) +			    !test_bit(Faulty, &rdev->flags))  				spares++;  			if (rdev->raid_disk < 0  			    && !test_bit(Faulty, &rdev->flags)) {  				rdev->recovery_offset = 0;  				if (mddev->pers->  				    hot_add_disk(mddev, rdev) == 0) { -					char nm[20]; -					sprintf(nm, "rd%d", rdev->raid_disk); -					if (sysfs_create_link(&mddev->kobj, -							      &rdev->kobj, nm)) +					if (sysfs_link_rdev(mddev, rdev))  						/* failure here is OK */;  					spares++;  					md_new_event(mddev); @@ -7138,6 +7371,8 @@ static void reap_sync_thread(mddev_t *mddev)  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);  	sysfs_notify_dirent_safe(mddev->sysfs_action);  	md_new_event(mddev); +	if (mddev->event_work.func) +		queue_work(md_misc_wq, &mddev->event_work);  }  /* @@ -7170,9 +7405,6 @@ void md_check_recovery(mddev_t *mddev)  	if (mddev->bitmap)  		bitmap_daemon_work(mddev); -	if (mddev->ro) -		return; -  	if (signal_pending(current)) {  		if (mddev->pers->sync_request && !mddev->external) {  			printk(KERN_INFO "md: %s in immediate safe mode\n", @@ -7209,9 +7441,7 @@ void md_check_recovery(mddev_t *mddev)  				    atomic_read(&rdev->nr_pending)==0) {  					if (mddev->pers->hot_remove_disk(  						    mddev, rdev->raid_disk)==0) { -						char nm[20]; -						sprintf(nm,"rd%d", rdev->raid_disk); -						sysfs_remove_link(&mddev->kobj, nm); 
+						sysfs_unlink_rdev(mddev, rdev);  						rdev->raid_disk = -1;  					}  				} @@ -7331,12 +7561,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)  {  	sysfs_notify_dirent_safe(rdev->sysfs_state);  	wait_event_timeout(rdev->blocked_wait, -			   !test_bit(Blocked, &rdev->flags), +			   !test_bit(Blocked, &rdev->flags) && +			   !test_bit(BlockedBadBlocks, &rdev->flags),  			   msecs_to_jiffies(5000));  	rdev_dec_pending(rdev, mddev);  }  EXPORT_SYMBOL(md_wait_for_blocked_rdev); + +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide.  This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + *  A 'shift' can be set so that larger blocks are tracked and + *  consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad.  So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + *  0 if there are no known bad blocks in the range + *  1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, +		   sector_t *first_bad, int *bad_sectors) +{ +	int hi; +	int lo = 0; +	u64 *p = bb->page; +	int rv = 0; +	sector_t target = s + sectors; +	unsigned seq; + +	if (bb->shift > 0) { +		/* round the start down, and the end up */ +		s >>= bb->shift; +		target += (1<<bb->shift) - 1; +		target >>= bb->shift; +		sectors = target - s; +	} +	/* 'target' is now the first block after the bad range */ + +retry: +	seq = read_seqbegin(&bb->lock); + +	hi = bb->count; + +	/* Binary search between lo and hi for 'target' +	 * i.e. for the last range that starts before 'target' +	 */ +	/* INVARIANT: ranges before 'lo' and at-or-after 'hi' +	 * are known not to be the last range before target. +	 * VARIANT: hi-lo is the number of possible +	 * ranges, and decreases until it reaches 1 +	 */ +	while (hi - lo > 1) { +		int mid = (lo + hi) / 2; +		sector_t a = BB_OFFSET(p[mid]); +		if (a < target) +			/* This could still be the one, earlier ranges +			 * could not. */ +			lo = mid; +		else +			/* This and later ranges are definitely out. */ +			hi = mid; +	} +	/* 'lo' might be the last that started before target, but 'hi' isn't */ +	if (hi > lo) { +		/* need to check all range that end after 's' to see if +		 * any are unacknowledged. 
+		 */ +		while (lo >= 0 && +		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +			if (BB_OFFSET(p[lo]) < target) { +				/* starts before the end, and finishes after +				 * the start, so they must overlap +				 */ +				if (rv != -1 && BB_ACK(p[lo])) +					rv = 1; +				else +					rv = -1; +				*first_bad = BB_OFFSET(p[lo]); +				*bad_sectors = BB_LEN(p[lo]); +			} +			lo--; +		} +	} + +	if (read_seqretry(&bb->lock, seq)) +		goto retry; + +	return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. + */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, +			    int acknowledged) +{ +	u64 *p; +	int lo, hi; +	int rv = 1; + +	if (bb->shift < 0) +		/* badblocks are disabled */ +		return 0; + +	if (bb->shift) { +		/* round the start down, and the end up */ +		sector_t next = s + sectors; +		s >>= bb->shift; +		next += (1<<bb->shift) - 1; +		next >>= bb->shift; +		sectors = next - s; +	} + +	write_seqlock_irq(&bb->lock); + +	p = bb->page; +	lo = 0; +	hi = bb->count; +	/* Find the last range that starts at-or-before 's' */ +	while (hi - lo > 1) { +		int mid = (lo + hi) / 2; +		sector_t a = BB_OFFSET(p[mid]); +		if (a <= s) +			lo = mid; +		else +			hi = mid; +	} +	if (hi > lo && BB_OFFSET(p[lo]) > s) +		hi = lo; + +	if (hi > lo) { +		/* we found a range that might merge with the start +		 * of our new range +		 */ +		sector_t a = BB_OFFSET(p[lo]); +		sector_t e = a + BB_LEN(p[lo]); +		int ack = BB_ACK(p[lo]); +		if (e >= s) { +			/* Yes, we can merge with a previous range */ +			if (s == a && s + sectors >= e) +				/* new range covers old */ +				ack = acknowledged; +			else +				ack = ack && acknowledged; + +			if (e < s + sectors) +				e = s + sectors; +			if (e - a <= BB_MAX_LEN) { +				p[lo] = BB_MAKE(a, e-a, ack); +				s = e; +			} else { +				/* does not all fit in one range, +				 * make p[lo] maximal +				 */ +				if (BB_LEN(p[lo]) != BB_MAX_LEN) +					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +				s = a + BB_MAX_LEN; +			} +			sectors = e - s; +		} +	} +	if (sectors && hi < bb->count) { +		/* 'hi' points to the first range that starts after 's'. +		 * Maybe we can merge with the start of that range */ +		sector_t a = BB_OFFSET(p[hi]); +		sector_t e = a + BB_LEN(p[hi]); +		int ack = BB_ACK(p[hi]); +		if (a <= s + sectors) { +			/* merging is possible */ +			if (e <= s + sectors) { +				/* full overlap */ +				e = s + sectors; +				ack = acknowledged; +			} else +				ack = ack && acknowledged; + +			a = s; +			if (e - a <= BB_MAX_LEN) { +				p[hi] = BB_MAKE(a, e-a, ack); +				s = e; +			} else { +				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +				s = a + BB_MAX_LEN; +			} +			sectors = e - s; +			lo = hi; +			hi++; +		} +	} +	if (sectors == 0 && hi < bb->count) { +		/* we might be able to combine lo and hi */ +		/* Note: 's' is at the end of 'lo' */ +		sector_t a = BB_OFFSET(p[hi]); +		int lolen = BB_LEN(p[lo]); +		int hilen = BB_LEN(p[hi]); +		int newlen = lolen + hilen - (s - a); +		if (s >= a && newlen < BB_MAX_LEN) { +			/* yes, we can combine them */ +			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +			memmove(p + hi, p + hi + 1, +				(bb->count - hi - 1) * 8); +			bb->count--; +		} +	} +	while (sectors) { +		/* didn't merge (it all). 
+		 * Need to add a range just before 'hi' */ +		if (bb->count >= MD_MAX_BADBLOCKS) { +			/* No room for more */ +			rv = 0; +			break; +		} else { +			int this_sectors = sectors; +			memmove(p + hi + 1, p + hi, +				(bb->count - hi) * 8); +			bb->count++; + +			if (this_sectors > BB_MAX_LEN) +				this_sectors = BB_MAX_LEN; +			p[hi] = BB_MAKE(s, this_sectors, acknowledged); +			sectors -= this_sectors; +			s += this_sectors; +		} +	} + +	bb->changed = 1; +	if (!acknowledged) +		bb->unacked_exist = 1; +	write_sequnlock_irq(&bb->lock); + +	return rv; +} + +int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, +		       int acknowledged) +{ +	int rv = md_set_badblocks(&rdev->badblocks, +				  s + rdev->data_offset, sectors, acknowledged); +	if (rv) { +		/* Make sure they get written out promptly */ +		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); +		md_wakeup_thread(rdev->mddev->thread); +	} +	return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail.  So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ +	u64 *p; +	int lo, hi; +	sector_t target = s + sectors; +	int rv = 0; + +	if (bb->shift > 0) { +		/* When clearing we round the start up and the end down. +		 * This should not matter as the shift should align with +		 * the block size and no rounding should ever be needed. +		 * However it is better the think a block is bad when it +		 * isn't than to think a block is not bad when it is. +		 */ +		s += (1<<bb->shift) - 1; +		s >>= bb->shift; +		target >>= bb->shift; +		sectors = target - s; +	} + +	write_seqlock_irq(&bb->lock); + +	p = bb->page; +	lo = 0; +	hi = bb->count; +	/* Find the last range that starts before 'target' */ +	while (hi - lo > 1) { +		int mid = (lo + hi) / 2; +		sector_t a = BB_OFFSET(p[mid]); +		if (a < target) +			lo = mid; +		else +			hi = mid; +	} +	if (hi > lo) { +		/* p[lo] is the last range that could overlap the +		 * current range.  Earlier ranges could also overlap, +		 * but only this one can overlap the end of the range. +		 */ +		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { +			/* Partial overlap, leave the tail of this range */ +			int ack = BB_ACK(p[lo]); +			sector_t a = BB_OFFSET(p[lo]); +			sector_t end = a + BB_LEN(p[lo]); + +			if (a < s) { +				/* we need to split this range */ +				if (bb->count >= MD_MAX_BADBLOCKS) { +					rv = 0; +					goto out; +				} +				memmove(p+lo+1, p+lo, (bb->count - lo) * 8); +				bb->count++; +				p[lo] = BB_MAKE(a, s-a, ack); +				lo++; +			} +			p[lo] = BB_MAKE(target, end - target, ack); +			/* there is no longer an overlap */ +			hi = lo; +			lo--; +		} +		while (lo >= 0 && +		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +			/* This range does overlap */ +			if (BB_OFFSET(p[lo]) < s) { +				/* Keep the early parts of this range. */ +				int ack = BB_ACK(p[lo]); +				sector_t start = BB_OFFSET(p[lo]); +				p[lo] = BB_MAKE(start, s - start, ack); +				/* now low doesn't overlap, so.. 
*/ +				break; +			} +			lo--; +		} +		/* 'lo' is strictly before, 'hi' is strictly after, +		 * anything between needs to be discarded +		 */ +		if (hi - lo > 1) { +			memmove(p+lo+1, p+hi, (bb->count - hi) * 8); +			bb->count -= (hi - lo - 1); +		} +	} + +	bb->changed = 1; +out: +	write_sequnlock_irq(&bb->lock); +	return rv; +} + +int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) +{ +	return md_clear_badblocks(&rdev->badblocks, +				  s + rdev->data_offset, +				  sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear.  It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ +	if (bb->page == NULL || bb->changed) +		/* no point even trying */ +		return; +	write_seqlock_irq(&bb->lock); + +	if (bb->changed == 0) { +		u64 *p = bb->page; +		int i; +		for (i = 0; i < bb->count ; i++) { +			if (!BB_ACK(p[i])) { +				sector_t start = BB_OFFSET(p[i]); +				int len = BB_LEN(p[i]); +				p[i] = BB_MAKE(start, len, 1); +			} +		} +		bb->unacked_exist = 0; +	} +	write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + *    are recorded as bad.  The list is truncated to fit within + *    the one-page limit of sysfs. + *    Writing "sector length" to this file adds an acknowledged + *    bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + *    been acknowledged.  Writing to this file adds bad blocks + *    without acknowledging them.  This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ +	size_t len; +	int i; +	u64 *p = bb->page; +	unsigned seq; + +	if (bb->shift < 0) +		return 0; + +retry: +	seq = read_seqbegin(&bb->lock); + +	len = 0; +	i = 0; + +	while (len < PAGE_SIZE && i < bb->count) { +		sector_t s = BB_OFFSET(p[i]); +		unsigned int length = BB_LEN(p[i]); +		int ack = BB_ACK(p[i]); +		i++; + +		if (unack && ack) +			continue; + +		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", +				(unsigned long long)s << bb->shift, +				length << bb->shift); +	} +	if (unack && len == 0) +		bb->unacked_exist = 0; + +	if (read_seqretry(&bb->lock, seq)) +		goto retry; + +	return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ +	unsigned long long sector; +	int length; +	char newline; +#ifdef DO_DEBUG +	/* Allow clearing via sysfs *only* for testing/debugging. +	 * Normally only a successful write may clear a badblock +	 */ +	int clear = 0; +	if (page[0] == '-') { +		clear = 1; +		page++; +	} +#endif /* DO_DEBUG */ + +	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { +	case 3: +		if (newline != '\n') +			return -EINVAL; +	case 2: +		if (length <= 0) +			return -EINVAL; +		break; +	default: +		return -EINVAL; +	} + +#ifdef DO_DEBUG +	if (clear) { +		md_clear_badblocks(bb, sector, length); +		return len; +	} +#endif /* DO_DEBUG */ +	if (md_set_badblocks(bb, sector, length, !unack)) +		return len; +	else +		return -ENOSPC; +} +  static int md_notify_reboot(struct notifier_block *this,  			    unsigned long code, void *x)  {
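
The comment above md_is_badblock() describes the in-memory table: each entry is a 64-bit word holding a 9-bit length field (values 0-511 for lengths of 1-512 sectors), a 54-bit start offset, and an 'acknowledged' flag in the most significant bit, kept sorted so lookups can binary-search. A minimal userspace model of that packing and of the lookup strategy (binary-search for the last range starting before the end of the query, then walk backwards over anything that still overlaps) is sketched below; bb_make()/bb_offset()/bb_len()/bb_ack() are illustrative stand-ins for the kernel's BB_* macros in md.h, and the seqlock retry, bb->shift rounding and first_bad/bad_sectors reporting are left out.

#include <stdint.h>
#include <stdio.h>

/* Pack a range: low 9 bits = length-1 (1..512), bits 9-62 = start, bit 63 = ack. */
static inline uint64_t bb_make(uint64_t start, int len, int ack)
{
	return (start << 9) | (uint64_t)(len - 1) | ((uint64_t)(!!ack) << 63);
}
static inline uint64_t bb_offset(uint64_t e) { return (e >> 9) & ((1ULL << 54) - 1); }
static inline int bb_len(uint64_t e) { return (int)(e & 0x1ff) + 1; }
static inline int bb_ack(uint64_t e) { return (int)(e >> 63); }

/* Return 0 if [s, s+sectors) hits no bad range, 1 if every overlapping range
 * is acknowledged, -1 if an unacknowledged range overlaps.  'p' must be
 * sorted by start sector, as md_set_badblocks() keeps it in the kernel.
 */
static int is_badblock(const uint64_t *p, int count, uint64_t s, int sectors)
{
	uint64_t target = s + sectors;	/* first sector after the queried range */
	int lo = 0, hi = count, rv = 0;

	while (hi - lo > 1) {		/* last entry starting before 'target' */
		int mid = (lo + hi) / 2;
		if (bb_offset(p[mid]) < target)
			lo = mid;
		else
			hi = mid;
	}
	/* walk back over every range that still ends after 's' */
	for (; hi > lo && lo >= 0 && bb_offset(p[lo]) + bb_len(p[lo]) > s; lo--)
		if (bb_offset(p[lo]) < target)	/* it also starts before 'target': overlap */
			rv = (rv != -1 && bb_ack(p[lo])) ? 1 : -1;
	return rv;
}

int main(void)
{
	uint64_t table[] = {		/* sorted by start sector */
		bb_make(100, 8, 1),	/* sectors 100-107, acknowledged */
		bb_make(200, 4, 0),	/* sectors 200-203, not yet acknowledged */
	};
	printf("%d %d %d\n",
	       is_badblock(table, 2, 0, 64),	/* 0: clean */
	       is_badblock(table, 2, 96, 16),	/* 1: overlaps the acked range */
	       is_badblock(table, 2, 198, 8));	/* -1: overlaps the unacked range */
	return 0;
}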
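
super_1_sync() and super_1_load() above also give the table an on-disk form for v1.x metadata: the log occupies up to one page at bblog_offset, each little-endian 64-bit word stores the shifted start sector in the upper bits and the shifted length in the low 10 bits, and unused words are left as all ones (the page is memset to 0xff before writing, and the loader stops at the first word whose value plus one is zero). A small sketch of that encoding follows; the helper names are hypothetical, not kernel API.

#include <stdint.h>
#include <stdio.h>

/* On-disk word: (sector >> shift) << 10 | (length >> shift). */
static uint64_t bblog_encode(uint64_t sector, unsigned int len, int shift)
{
	return ((sector >> shift) << 10) | ((len >> shift) & 0x3ff);
}

static void bblog_decode(uint64_t word, int shift,
			 uint64_t *sector, unsigned int *len)
{
	*sector = (word >> 10) << shift;
	*len = (unsigned int)(word & 0x3ff) << shift;
}

int main(void)
{
	int shift = 3;			/* track 8-sector (4KiB) blocks, like bblog_shift */
	uint64_t word = bblog_encode(123456, 16, shift);
	uint64_t sector;
	unsigned int len;

	if (word + 1 == 0)		/* all-ones marks the end of the log */
		return 0;
	bblog_decode(word, shift, &sector, &len);
	printf("sector %llu, %u sectors\n",
	       (unsigned long long)sector, len);	/* sector 123456, 16 sectors */
	return 0;
}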
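
md_trim_bio() near the top of the patch restricts a cloned bio to a sub-range by advancing bi_sector, dropping whole leading bio_vecs, shortening the first remaining one, and clipping the tail so the total does not exceed the new size. The same arithmetic, modelled in userspace on a plain array of (offset, length) segments measured in bytes: struct seg stands in for struct bio_vec, and the kernel's sector-to-byte conversion ("<< 9") is assumed to have been done by the caller.

#include <stdio.h>

struct seg { unsigned int off, len; };

/* Trim 'offset' bytes from the front and keep at most 'size' bytes;
 * returns the new segment count.
 */
static int trim_segs(struct seg *v, int cnt, unsigned int offset, unsigned int size)
{
	int i, first = 0;
	unsigned int sofar = 0;

	while (first < cnt && v[first].len <= offset) {	/* drop whole segments */
		offset -= v[first].len;
		first++;
	}
	if (first < cnt) {				/* shorten the first survivor */
		v[first].off += offset;
		v[first].len -= offset;
	}
	for (i = first; i < cnt; i++)			/* slide down, like resetting bi_idx */
		v[i - first] = v[i];
	cnt -= first;

	for (i = 0; i < cnt; i++) {			/* clip the tail to 'size' */
		if (sofar + v[i].len > size)
			v[i].len = size - sofar;
		if (v[i].len == 0) {
			cnt = i;
			break;
		}
		sofar += v[i].len;
	}
	return cnt;
}

int main(void)
{
	struct seg v[] = { {0, 4096}, {0, 4096}, {0, 4096} };
	int cnt = trim_segs(v, 3, 4608, 2048);	/* skip 4608 bytes, keep 2048 */
	int i;

	for (i = 0; i < cnt; i++)		/* prints: seg 0: off=512 len=2048 */
		printf("seg %d: off=%u len=%u\n", i, v[i].off, v[i].len);
	return 0;
}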