Diffstat (limited to 'fs')
| -rw-r--r-- | fs/btrfs/Makefile | 3 |
| -rw-r--r-- | fs/btrfs/acl.c | 17 |
| -rw-r--r-- | fs/btrfs/backref.c | 776 |
| -rw-r--r-- | fs/btrfs/backref.h | 62 |
| -rw-r--r-- | fs/btrfs/btrfs_inode.h | 17 |
| -rw-r--r-- | fs/btrfs/compression.c | 3 |
| -rw-r--r-- | fs/btrfs/ctree.c | 10 |
| -rw-r--r-- | fs/btrfs/ctree.h | 198 |
| -rw-r--r-- | fs/btrfs/delayed-inode.c | 50 |
| -rw-r--r-- | fs/btrfs/disk-io.c | 441 |
| -rw-r--r-- | fs/btrfs/disk-io.h | 4 |
| -rw-r--r-- | fs/btrfs/extent-tree.c | 848 |
| -rw-r--r-- | fs/btrfs/extent_io.c | 614 |
| -rw-r--r-- | fs/btrfs/extent_io.h | 23 |
| -rw-r--r-- | fs/btrfs/file-item.c | 17 |
| -rw-r--r-- | fs/btrfs/file.c | 25 |
| -rw-r--r-- | fs/btrfs/free-space-cache.c | 926 |
| -rw-r--r-- | fs/btrfs/inode-map.c | 6 |
| -rw-r--r-- | fs/btrfs/inode.c | 457 |
| -rw-r--r-- | fs/btrfs/ioctl.c | 227 |
| -rw-r--r-- | fs/btrfs/ioctl.h | 29 |
| -rw-r--r-- | fs/btrfs/print-tree.c | 8 |
| -rw-r--r-- | fs/btrfs/reada.c | 951 |
| -rw-r--r-- | fs/btrfs/relocation.c | 24 |
| -rw-r--r-- | fs/btrfs/scrub.c | 587 |
| -rw-r--r-- | fs/btrfs/super.c | 298 |
| -rw-r--r-- | fs/btrfs/transaction.c | 146 |
| -rw-r--r-- | fs/btrfs/tree-log.c | 19 |
| -rw-r--r-- | fs/btrfs/volumes.c | 207 |
| -rw-r--r-- | fs/btrfs/volumes.h | 18 |
| -rw-r--r-- | fs/btrfs/xattr.c | 11 |
31 files changed, 5421 insertions, 1601 deletions
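Note (illustrative, not part of the patch below): this commit adds an inode-to-path backref API in fs/btrfs/backref.h (init_ipath(), paths_from_inode(), free_ipath()). A minimal sketch of how an in-kernel caller might drive it is shown here, assuming a valid fs_root and glossing over locking and transaction context; the function name example_dump_inode_paths and the 4096-byte buffer size are arbitrary choices for the example.

/*
 * Sketch only: resolve and print every filesystem path that refers to
 * inode 'inum' within 'fs_root', using the ipath API added by this commit.
 */
static int example_dump_inode_paths(struct btrfs_root *fs_root, u64 inum)
{
	struct btrfs_path *path;
	struct inode_fs_paths *ipath;
	int ret;
	u32 i;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* 4096 bytes total for the container header plus the path strings */
	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	if (!ret) {
		/* each val[i] holds a pointer to a 0-terminated path string */
		for (i = 0; i < ipath->fspath->elem_cnt; ++i)
			printk(KERN_INFO "inode %llu: %s\n",
			       (unsigned long long)inum,
			       (char *)(unsigned long)ipath->fspath->val[i]);
		if (ipath->fspath->elem_missed)
			printk(KERN_INFO "%u paths did not fit in the buffer\n",
			       ipath->fspath->elem_missed);
	}

	kfree(ipath->fspath);	/* container allocated by init_data_container() */
	free_ipath(ipath);
	btrfs_free_path(path);
	return ret;
}

As introduced in this patch, free_ipath() only frees the inode_fs_paths struct itself, so the sketch releases the data container returned by init_data_container() explicitly before freeing the ipath.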
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21..c0ddfd29c5e 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \  	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \ -	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o +	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ +	   reada.o backref.o  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aaa5a1..89b156d85d6 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)  		if (!value)  			return ERR_PTR(-ENOMEM);  		size = __btrfs_getxattr(inode, name, value, size); -		if (size > 0) { -			acl = posix_acl_from_xattr(value, size); -			if (IS_ERR(acl)) { -				kfree(value); -				return acl; -			} -			set_cached_acl(inode, type, acl); -		} -		kfree(value); +	} +	if (size > 0) { +		acl = posix_acl_from_xattr(value, size);  	} else if (size == -ENOENT || size == -ENODATA || size == 0) {  		/* FIXME, who returns -ENOENT?  I think nobody */  		acl = NULL; -		set_cached_acl(inode, type, acl);  	} else {  		acl = ERR_PTR(-EIO);  	} +	kfree(value); + +	if (!IS_ERR(acl)) +		set_cached_acl(inode, type, acl);  	return acl;  } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 00000000000..8855aad3929 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { +	struct list_head list; +	u64 inum; +	u64 root; +	u64 extent_data_item_offset; +}; + +struct __shared_ref { +	struct list_head list; +	u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, +			struct btrfs_root *fs_root, struct btrfs_path *path, +			struct btrfs_key *found_key) +{ +	int ret; +	struct btrfs_key key; +	struct extent_buffer *eb; + +	key.type = key_type; +	key.objectid = inum; +	key.offset = ioff; + +	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); +	if (ret < 0) +		return ret; + +	eb = path->nodes[0]; +	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { +		ret = btrfs_next_leaf(fs_root, path); +		if (ret) +			return ret; +		eb = path->nodes[0]; +	} + +	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); +	if (found_key->type != key.type || found_key->objectid != key.objectid) +		return 1; + +	return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, +			struct btrfs_path *path) +{ +	struct btrfs_key key; +	return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, +				&key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, +				struct btrfs_path *path, +				struct btrfs_key *found_key) +{ +	return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, +				found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! 
+ */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, +				struct btrfs_inode_ref *iref, +				struct extent_buffer *eb_in, u64 parent, +				char *dest, u32 size) +{ +	u32 len; +	int slot; +	u64 next_inum; +	int ret; +	s64 bytes_left = size - 1; +	struct extent_buffer *eb = eb_in; +	struct btrfs_key found_key; + +	if (bytes_left >= 0) +		dest[bytes_left] = '\0'; + +	while (1) { +		len = btrfs_inode_ref_name_len(eb, iref); +		bytes_left -= len; +		if (bytes_left >= 0) +			read_extent_buffer(eb, dest + bytes_left, +						(unsigned long)(iref + 1), len); +		if (eb != eb_in) +			free_extent_buffer(eb); +		ret = inode_ref_info(parent, 0, fs_root, path, &found_key); +		if (ret) +			break; +		next_inum = found_key.offset; + +		/* regular exit ahead */ +		if (parent == next_inum) +			break; + +		slot = path->slots[0]; +		eb = path->nodes[0]; +		/* make sure we can use eb after releasing the path */ +		if (eb != eb_in) +			atomic_inc(&eb->refs); +		btrfs_release_path(path); + +		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); +		parent = next_inum; +		--bytes_left; +		if (bytes_left >= 0) +			dest[bytes_left] = '/'; +	} + +	btrfs_release_path(path); + +	if (ret) +		return ERR_PTR(ret); + +	return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, +			struct btrfs_path *path, struct btrfs_key *found_key) +{ +	int ret; +	u64 flags; +	u32 item_size; +	struct extent_buffer *eb; +	struct btrfs_extent_item *ei; +	struct btrfs_key key; + +	key.type = BTRFS_EXTENT_ITEM_KEY; +	key.objectid = logical; +	key.offset = (u64)-1; + +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); +	if (ret < 0) +		return ret; +	ret = btrfs_previous_item(fs_info->extent_root, path, +					0, BTRFS_EXTENT_ITEM_KEY); +	if (ret < 0) +		return ret; + +	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); +	if (found_key->type != BTRFS_EXTENT_ITEM_KEY || +	    found_key->objectid > logical || +	    found_key->objectid + found_key->offset <= logical) +		return -ENOENT; + +	eb = path->nodes[0]; +	item_size = btrfs_item_size_nr(eb, path->slots[0]); +	BUG_ON(item_size < sizeof(*ei)); + +	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +	flags = btrfs_extent_flags(eb, ei); + +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) +		return BTRFS_EXTENT_FLAG_TREE_BLOCK; +	if (flags & BTRFS_EXTENT_FLAG_DATA) +		return BTRFS_EXTENT_FLAG_DATA; + +	return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. 
+ * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, +				struct btrfs_extent_item *ei, u32 item_size, +				struct btrfs_extent_inline_ref **out_eiref, +				int *out_type) +{ +	unsigned long end; +	u64 flags; +	struct btrfs_tree_block_info *info; + +	if (!*ptr) { +		/* first call */ +		flags = btrfs_extent_flags(eb, ei); +		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +			info = (struct btrfs_tree_block_info *)(ei + 1); +			*out_eiref = +				(struct btrfs_extent_inline_ref *)(info + 1); +		} else { +			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); +		} +		*ptr = (unsigned long)*out_eiref; +		if ((void *)*ptr >= (void *)ei + item_size) +			return -ENOENT; +	} + +	end = (unsigned long)ei + item_size; +	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr; +	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + +	*ptr += btrfs_extent_inline_ref_size(*out_type); +	WARN_ON(*ptr > end); +	if (*ptr == end) +		return 1; /* last */ + +	return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, +				struct btrfs_extent_item *ei, u32 item_size, +				u64 *out_root, u8 *out_level) +{ +	int ret; +	int type; +	struct btrfs_tree_block_info *info; +	struct btrfs_extent_inline_ref *eiref; + +	if (*ptr == (unsigned long)-1) +		return 1; + +	while (1) { +		ret = __get_extent_inline_ref(ptr, eb, ei, item_size, +						&eiref, &type); +		if (ret < 0) +			return ret; + +		if (type == BTRFS_TREE_BLOCK_REF_KEY || +		    type == BTRFS_SHARED_BLOCK_REF_KEY) +			break; + +		if (ret == 1) +			return 1; +	} + +	/* we can treat both ref types equally here */ +	info = (struct btrfs_tree_block_info *)(ei + 1); +	*out_root = btrfs_extent_inline_ref_offset(eb, eiref); +	*out_level = btrfs_tree_block_level(eb, info); + +	if (ret == 1) +		*ptr = (unsigned long)-1; + +	return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, +				u64 extent_data_item_offset, u64 root) +{ +	struct __data_ref *ref; + +	ref = kmalloc(sizeof(*ref), GFP_NOFS); +	if (!ref) +		return -ENOMEM; + +	ref->inum = inum; +	ref->extent_data_item_offset = extent_data_item_offset; +	ref->root = root; +	list_add_tail(&ref->list, head); + +	return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, +				struct btrfs_extent_data_ref *dref) +{ +	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), +				btrfs_extent_data_ref_offset(eb, dref), +				btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ +	struct __shared_ref *ref; + +	ref = kmalloc(sizeof(*ref), GFP_NOFS); +	if (!ref) +		return -ENOMEM; + +	ref->disk_byte = disk_byte; +	list_add_tail(&ref->list, head); + +	return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, +					   u64 logical, u64 inum, +					   u64 extent_data_item_offset, +					   u64 extent_offset, +					   struct btrfs_path *path, +					   struct list_head *data_refs, +					   iterate_extent_inodes_t *iterate, +					   void *ctx) +{ +	u64 ref_root; +	u32 item_size; +	struct btrfs_key key; +	struct extent_buffer *eb; +	struct btrfs_extent_item *ei; 
+	struct btrfs_extent_inline_ref *eiref; +	struct __data_ref *ref; +	int ret; +	int type; +	int last; +	unsigned long ptr = 0; + +	WARN_ON(!list_empty(data_refs)); +	ret = extent_from_logical(fs_info, logical, path, &key); +	if (ret & BTRFS_EXTENT_FLAG_DATA) +		ret = -EIO; +	if (ret < 0) +		goto out; + +	eb = path->nodes[0]; +	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +	item_size = btrfs_item_size_nr(eb, path->slots[0]); + +	ret = 0; +	ref_root = 0; +	/* +	 * as done in iterate_extent_inodes, we first build a list of refs to +	 * iterate, then free the path and then iterate them to avoid deadlocks. +	 */ +	do { +		last = __get_extent_inline_ref(&ptr, eb, ei, item_size, +						&eiref, &type); +		if (last < 0) { +			ret = last; +			goto out; +		} +		if (type == BTRFS_TREE_BLOCK_REF_KEY || +		    type == BTRFS_SHARED_BLOCK_REF_KEY) { +			ref_root = btrfs_extent_inline_ref_offset(eb, eiref); +			ret = __data_list_add(data_refs, inum, +						extent_data_item_offset, +						ref_root); +		} +	} while (!ret && !last); + +	btrfs_release_path(path); + +	if (ref_root == 0) { +		printk(KERN_ERR "btrfs: failed to find tree block ref " +			"for shared data backref %llu\n", logical); +		WARN_ON(1); +		ret = -EIO; +	} + +out: +	while (!list_empty(data_refs)) { +		ref = list_first_entry(data_refs, struct __data_ref, list); +		list_del(&ref->list); +		if (!ret) +			ret = iterate(ref->inum, extent_offset + +					ref->extent_data_item_offset, +					ref->root, ctx); +		kfree(ref); +	} + +	return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, +				    u64 logical, u64 orig_extent_item_objectid, +				    u64 extent_offset, struct btrfs_path *path, +				    struct list_head *data_refs, +				    iterate_extent_inodes_t *iterate, +				    void *ctx) +{ +	u64 disk_byte; +	struct btrfs_key key; +	struct btrfs_file_extent_item *fi; +	struct extent_buffer *eb; +	int slot; +	int nritems; +	int ret; +	int found = 0; + +	eb = read_tree_block(fs_info->tree_root, logical, +				fs_info->tree_root->leafsize, 0); +	if (!eb) +		return -EIO; + +	/* +	 * from the shared data ref, we only have the leaf but we need +	 * the key. thus, we must look into all items and see that we +	 * find one (some) with a reference to our extent item. +	 */ +	nritems = btrfs_header_nritems(eb); +	for (slot = 0; slot < nritems; ++slot) { +		btrfs_item_key_to_cpu(eb, &key, slot); +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			continue; +		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +		if (!fi) { +			free_extent_buffer(eb); +			return -EIO; +		} +		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); +		if (disk_byte != orig_extent_item_objectid) { +			if (found) +				break; +			else +				continue; +		} +		++found; +		ret = __iter_shared_inline_ref_inodes(fs_info, logical, +							key.objectid, +							key.offset, +							extent_offset, path, +							data_refs, +							iterate, ctx); +		if (ret) +			break; +	} + +	if (!found) { +		printk(KERN_ERR "btrfs: failed to follow shared data backref " +			"to parent %llu\n", logical); +		WARN_ON(1); +		ret = -EIO; +	} + +	free_extent_buffer(eb); +	return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. 
+ */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, +				struct btrfs_path *path, +				u64 extent_item_objectid, +				u64 extent_offset, +				iterate_extent_inodes_t *iterate, void *ctx) +{ +	unsigned long ptr = 0; +	int last; +	int ret; +	int type; +	u64 logical; +	u32 item_size; +	struct btrfs_extent_inline_ref *eiref; +	struct btrfs_extent_data_ref *dref; +	struct extent_buffer *eb; +	struct btrfs_extent_item *ei; +	struct btrfs_key key; +	struct list_head data_refs = LIST_HEAD_INIT(data_refs); +	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); +	struct __data_ref *ref_d; +	struct __shared_ref *ref_s; + +	eb = path->nodes[0]; +	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +	item_size = btrfs_item_size_nr(eb, path->slots[0]); + +	/* first we iterate the inline refs, ... */ +	do { +		last = __get_extent_inline_ref(&ptr, eb, ei, item_size, +						&eiref, &type); +		if (last == -ENOENT) { +			ret = 0; +			break; +		} +		if (last < 0) { +			ret = last; +			break; +		} + +		if (type == BTRFS_EXTENT_DATA_REF_KEY) { +			dref = (struct btrfs_extent_data_ref *)(&eiref->offset); +			ret = __data_list_add_eb(&data_refs, eb, dref); +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) { +			logical = btrfs_extent_inline_ref_offset(eb, eiref); +			ret = __shared_list_add(&shared_refs, logical); +		} +	} while (!ret && !last); + +	/* ... then we proceed to in-tree references and ... */ +	while (!ret) { +		++path->slots[0]; +		if (path->slots[0] > btrfs_header_nritems(eb)) { +			ret = btrfs_next_leaf(fs_info->extent_root, path); +			if (ret) { +				if (ret == 1) +					ret = 0; /* we're done */ +				break; +			} +			eb = path->nodes[0]; +		} +		btrfs_item_key_to_cpu(eb, &key, path->slots[0]); +		if (key.objectid != extent_item_objectid) +			break; +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { +			dref = btrfs_item_ptr(eb, path->slots[0], +						struct btrfs_extent_data_ref); +			ret = __data_list_add_eb(&data_refs, eb, dref); +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +			ret = __shared_list_add(&shared_refs, key.offset); +		} +	} + +	btrfs_release_path(path); + +	/* +	 * ... only at the very end we can process the refs we found. this is +	 * because the iterator function we call is allowed to make tree lookups +	 * and we have to avoid deadlocks. additionally, we need more tree +	 * lookups ourselves for shared data refs. 
+	 */ +	while (!list_empty(&data_refs)) { +		ref_d = list_first_entry(&data_refs, struct __data_ref, list); +		list_del(&ref_d->list); +		if (!ret) +			ret = iterate(ref_d->inum, extent_offset + +					ref_d->extent_data_item_offset, +					ref_d->root, ctx); +		kfree(ref_d); +	} + +	while (!list_empty(&shared_refs)) { +		ref_s = list_first_entry(&shared_refs, struct __shared_ref, +					list); +		list_del(&ref_s->list); +		if (!ret) +			ret = __iter_shared_inline_ref(fs_info, +							ref_s->disk_byte, +							extent_item_objectid, +							extent_offset, path, +							&data_refs, +							iterate, ctx); +		kfree(ref_s); +	} + +	return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, +				struct btrfs_path *path, +				iterate_extent_inodes_t *iterate, void *ctx) +{ +	int ret; +	u64 offset; +	struct btrfs_key found_key; + +	ret = extent_from_logical(fs_info, logical, path, +					&found_key); +	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) +		ret = -EINVAL; +	if (ret < 0) +		return ret; + +	offset = logical - found_key.objectid; +	ret = iterate_extent_inodes(fs_info, path, found_key.objectid, +					offset, iterate, ctx); + +	return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, +				struct btrfs_path *path, +				iterate_irefs_t *iterate, void *ctx) +{ +	int ret; +	int slot; +	u32 cur; +	u32 len; +	u32 name_len; +	u64 parent = 0; +	int found = 0; +	struct extent_buffer *eb; +	struct btrfs_item *item; +	struct btrfs_inode_ref *iref; +	struct btrfs_key found_key; + +	while (1) { +		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, +					&found_key); +		if (ret < 0) +			break; +		if (ret) { +			ret = found ? 0 : -ENOENT; +			break; +		} +		++found; + +		parent = found_key.offset; +		slot = path->slots[0]; +		eb = path->nodes[0]; +		/* make sure we can use eb after releasing the path */ +		atomic_inc(&eb->refs); +		btrfs_release_path(path); + +		item = btrfs_item_nr(eb, slot); +		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + +		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { +			name_len = btrfs_inode_ref_name_len(eb, iref); +			/* path must be released before calling iterate()! */ +			ret = iterate(parent, iref, eb, ctx); +			if (ret) { +				free_extent_buffer(eb); +				break; +			} +			len = sizeof(*iref) + name_len; +			iref = (struct btrfs_inode_ref *)((char *)iref + len); +		} +		free_extent_buffer(eb); +	} + +	btrfs_release_path(path); + +	return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, +				struct extent_buffer *eb, void *ctx) +{ +	struct inode_fs_paths *ipath = ctx; +	char *fspath; +	char *fspath_min; +	int i = ipath->fspath->elem_cnt; +	const int s_ptr = sizeof(char *); +	u32 bytes_left; + +	bytes_left = ipath->fspath->bytes_left > s_ptr ? 
+					ipath->fspath->bytes_left - s_ptr : 0; + +	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; +	fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, +				inum, fspath_min, bytes_left); +	if (IS_ERR(fspath)) +		return PTR_ERR(fspath); + +	if (fspath > fspath_min) { +		ipath->fspath->val[i] = (u64)fspath; +		++ipath->fspath->elem_cnt; +		ipath->fspath->bytes_left = fspath - fspath_min; +	} else { +		++ipath->fspath->elem_missed; +		ipath->fspath->bytes_missing += fspath_min - fspath; +		ipath->fspath->bytes_left = 0; +	} + +	return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ +	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, +				inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ +	struct btrfs_data_container *data; +	size_t alloc_bytes; + +	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); +	data = kmalloc(alloc_bytes, GFP_NOFS); +	if (!data) +		return ERR_PTR(-ENOMEM); + +	if (total_bytes >= sizeof(*data)) { +		data->bytes_left = total_bytes - sizeof(*data); +		data->bytes_missing = 0; +	} else { +		data->bytes_missing = sizeof(*data) - total_bytes; +		data->bytes_left = 0; +	} + +	data->elem_cnt = 0; +	data->elem_missed = 0; + +	return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, +					struct btrfs_path *path) +{ +	struct inode_fs_paths *ifp; +	struct btrfs_data_container *fspath; + +	fspath = init_data_container(total_bytes); +	if (IS_ERR(fspath)) +		return (void *)fspath; + +	ifp = kmalloc(sizeof(*ifp), GFP_NOFS); +	if (!ifp) { +		kfree(fspath); +		return ERR_PTR(-ENOMEM); +	} + +	ifp->btrfs_path = path; +	ifp->fspath = fspath; +	ifp->fs_root = fs_root; + +	return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ +	kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 00000000000..92618837cb8 --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { +	struct btrfs_path		*btrfs_path; +	struct btrfs_root		*fs_root; +	struct btrfs_data_container	*fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, +		void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, +				struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, +			struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, +			struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, +				struct btrfs_extent_item *ei, u32 item_size, +				u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, +				struct btrfs_path *path, +				u64 extent_item_objectid, +				u64 extent_offset, +				iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, +				struct btrfs_path *path, +				iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, +					struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9f99a16edd..5a5d325a393 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode {  	 */  	u64 delalloc_bytes; -	/* total number of bytes that may be used for this inode for -	 * delalloc -	 */ -	u64 reserved_bytes; -  	/*  	 * the size of the file stored in the metadata on disk.  data=ordered  	 * means the in-memory i_size might be larger than the size on disk @@ -115,9 +110,6 @@ struct btrfs_inode {  	 */  	u64 disk_i_size; -	/* flags field from the on disk inode */ -	u32 flags; -  	/*  	 * if this is a directory then index_cnt is the counter for the index  	 * number for new files that are created @@ -132,6 +124,15 @@ struct btrfs_inode {  	u64 last_unlink_trans;  	/* +	 * Number of bytes outstanding that are going to need csums.  This is +	 * used in ENOSPC accounting. +	 */ +	u64 csum_bytes; + +	/* flags field from the on disk inode */ +	u32 flags; + +	/*  	 * Counters to keep track of the number of extent item's we may use due  	 * to delalloc and such.  
outstanding_extents is the number of extent  	 * items we think we'll end up using, and reserved_extents is the number diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f173..14f1c5a0b2d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio {  static inline int compressed_bio_size(struct btrfs_root *root,  				      unsigned long disk_size)  { -	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); +  	return sizeof(struct compressed_bio) +  		((disk_size + root->sectorsize - 1) / root->sectorsize) *  		csum_size; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3aca8..0fe615e4ea3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	orig_ptr = btrfs_node_blockptr(mid, orig_slot); -	if (level < BTRFS_MAX_LEVEL - 1) +	if (level < BTRFS_MAX_LEVEL - 1) {  		parent = path->nodes[level + 1]; -	pslot = path->slots[level + 1]; +		pslot = path->slots[level + 1]; +	}  	/*  	 * deal with the case where there is only one pointer in the root @@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  	mid = path->nodes[level];  	WARN_ON(btrfs_header_generation(mid) != trans->transid); -	if (level < BTRFS_MAX_LEVEL - 1) +	if (level < BTRFS_MAX_LEVEL - 1) {  		parent = path->nodes[level + 1]; -	pslot = path->slots[level + 1]; +		pslot = path->slots[level + 1]; +	}  	if (!parent)  		return 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f4..b9ba59ff929 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@  #include <linux/kobject.h>  #include <trace/events/btrfs.h>  #include <asm/kmap_types.h> +#include <linux/pagemap.h>  #include "extent_io.h"  #include "extent_map.h"  #include "async-thread.h" @@ -360,6 +361,47 @@ struct btrfs_header {  #define BTRFS_LABEL_SIZE 256  /* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. 
+ */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { +	__le64 tree_root; +	__le64 tree_root_gen; + +	__le64 chunk_root; +	__le64 chunk_root_gen; + +	__le64 extent_root; +	__le64 extent_root_gen; + +	__le64 fs_root; +	__le64 fs_root_gen; + +	__le64 dev_root; +	__le64 dev_root_gen; + +	__le64 csum_root; +	__le64 csum_root_gen; + +	__le64 total_bytes; +	__le64 bytes_used; +	__le64 num_devices; +	/* future */ +	__le64 unsed_64[4]; + +	u8 tree_root_level; +	u8 chunk_root_level; +	u8 extent_root_level; +	u8 fs_root_level; +	u8 dev_root_level; +	u8 csum_root_level; +	/* future and to align */ +	u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/*   * the super block basically lists the main trees of the FS   * it currently lacks any block count etc etc   */ @@ -405,6 +447,7 @@ struct btrfs_super_block {  	/* future expansion */  	__le64 reserved[31];  	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];  } __attribute__ ((__packed__));  /* @@ -772,14 +815,8 @@ struct btrfs_space_info {  struct btrfs_block_rsv {  	u64 size;  	u64 reserved; -	u64 freed[2];  	struct btrfs_space_info *space_info; -	struct list_head list;  	spinlock_t lock; -	atomic_t usage; -	unsigned int priority:8; -	unsigned int durable:1; -	unsigned int refill_used:1;  	unsigned int full:1;  }; @@ -840,10 +877,10 @@ struct btrfs_block_group_cache {  	spinlock_t lock;  	u64 pinned;  	u64 reserved; -	u64 reserved_pinned;  	u64 bytes_super;  	u64 flags;  	u64 sectorsize; +	u64 cache_generation;  	unsigned int ro:1;  	unsigned int dirty:1;  	unsigned int iref:1; @@ -899,6 +936,10 @@ struct btrfs_fs_info {  	spinlock_t block_group_cache_lock;  	struct rb_root block_group_cache_tree; +	/* keep track of unallocated space */ +	spinlock_t free_chunk_lock; +	u64 free_chunk_space; +  	struct extent_io_tree freed_extents[2];  	struct extent_io_tree *pinned_extents; @@ -916,14 +957,11 @@ struct btrfs_fs_info {  	struct btrfs_block_rsv trans_block_rsv;  	/* block reservation for chunk tree */  	struct btrfs_block_rsv chunk_block_rsv; +	/* block reservation for delayed operations */ +	struct btrfs_block_rsv delayed_block_rsv;  	struct btrfs_block_rsv empty_block_rsv; -	/* list of block reservations that cross multiple transactions */ -	struct list_head durable_block_rsv_list; - -	struct mutex durable_block_rsv_mutex; -  	u64 generation;  	u64 last_trans_committed; @@ -942,8 +980,8 @@ struct btrfs_fs_info {  	wait_queue_head_t transaction_blocked_wait;  	wait_queue_head_t async_submit_wait; -	struct btrfs_super_block super_copy; -	struct btrfs_super_block super_for_commit; +	struct btrfs_super_block *super_copy; +	struct btrfs_super_block *super_for_commit;  	struct block_device *__bdev;  	struct super_block *sb;  	struct inode *btree_inode; @@ -1036,6 +1074,7 @@ struct btrfs_fs_info {  	struct btrfs_workers endio_freespace_worker;  	struct btrfs_workers submit_workers;  	struct btrfs_workers caching_workers; +	struct btrfs_workers readahead_workers;  	/*  	 * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1158,13 @@ struct btrfs_fs_info {  	u64 fs_state;  	struct btrfs_delayed_root *delayed_root; + +	/* readahead tree */ +	spinlock_t reada_lock; +	struct radix_tree_root reada_tree; + +	/* next backup root to be overwritten */ +	int backup_root_index;  };  /* @@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {  #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)  #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)  #define 
BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17) +#define BTRFS_MOUNT_RECOVERY		(1 << 18)  #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)  #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt) @@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)  	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;  } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, +		   tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, +		   tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, +		   tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, +		   chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, +		   chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, +		   chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, +		   extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, +		   extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, +		   extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, +		   fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, +		   fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, +		   fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, +		   dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, +		   dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, +		   dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, +		   csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, +		   csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, +		   csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, +		   total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, +		   bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, +		   num_devices, 64); +  /* struct btrfs_super_block */  BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)  		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));  } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ +	return mapping_gfp_mask(mapping) & ~__GFP_FS; +} +  /* extent-tree.c */  static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,  						 unsigned num_items) @@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,  		3 * num_items;  } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. 
+ */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, +						 unsigned num_items) +{ +	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * +		num_items; +} +  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, unsigned long count); @@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  			     u64 num_bytes, u64 *refs, u64 *flags);  int btrfs_pin_extent(struct btrfs_root *root,  		     u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, +				    struct btrfs_root *root, +				    u64 bytenr, u64 num_bytes);  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root,  			  u64 objectid, u64 offset, u64 bytenr); @@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,  		      u64 root_objectid, u64 owner, u64 offset);  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				u64 num_bytes, int reserve, int sinfo); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, +				       u64 start, u64 len);  int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  				struct btrfs_root *root);  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, -				 struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root,  			struct btrfs_block_rsv *block_rsv,  			u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, -			  struct btrfs_root *root, +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, +				struct btrfs_block_rsv *block_rsv, +				u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_root *root, +			  struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root,  			  struct btrfs_block_rsv *block_rsv, -			  u64 min_reserved, int min_factor); +			  u64 min_reserved);  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,  			    struct btrfs_block_rsv *dst_rsv,  			    u64 num_bytes);  void btrfs_block_rsv_release(struct btrfs_root *root,  			     struct btrfs_block_rsv *block_rsv,  			     u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, -				    struct btrfs_root *root, -				    struct btrfs_block_rsv *rsv);  int btrfs_set_block_group_ro(struct btrfs_root *root,  			     struct btrfs_block_group_cache *cache);  int btrfs_set_block_group_rw(struct btrfs_root *root, @@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)  	smp_mb();  	return fs_info->closing;  } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ +	kfree(fs_info->delayed_root); +	kfree(fs_info->extent_root); +	kfree(fs_info->tree_root); +	kfree(fs_info->chunk_root); +	kfree(fs_info->dev_root); +	kfree(fs_info->csum_root); +	kfree(fs_info->super_copy); +	kfree(fs_info->super_for_commit); +	kfree(fs_info); +}  /* root-item.c */  int 
btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,  int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);  int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);  int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending, -				u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending);  void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root);  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); @@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);  int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,  			 struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { +	struct btrfs_root	*root;		/* tree to prefetch */ +	struct btrfs_key	key_start; +	struct btrfs_key	key_end;	/* exclusive */ +	atomic_t		elems; +	struct kref		refcnt; +	wait_queue_head_t	wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, +			      struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, +			 u64 start, int err); +  #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ae4d9cd1096..3a1b939c9ae 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,  		return 0;  	src_rsv = trans->block_rsv; -	dst_rsv = &root->fs_info->global_block_rsv; +	dst_rsv = &root->fs_info->delayed_block_rsv;  	num_bytes = btrfs_calc_trans_metadata_size(root, 1);  	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,  	if (!item->bytes_reserved)  		return; -	rsv = &root->fs_info->global_block_rsv; +	rsv = &root->fs_info->delayed_block_rsv;  	btrfs_block_rsv_release(root, rsv,  				item->bytes_reserved);  } @@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(  	u64 num_bytes;  	int ret; -	if (!trans->bytes_reserved) -		return 0; -  	src_rsv = trans->block_rsv; -	dst_rsv = &root->fs_info->global_block_rsv; +	dst_rsv = &root->fs_info->delayed_block_rsv;  	num_bytes = btrfs_calc_trans_metadata_size(root, 1); + +	/* +	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction +	 * which doesn't reserve space for speed.  This is a problem since we +	 * still need to reserve space for this update, so try to reserve the +	 * space. +	 * +	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since +	 * we're accounted for. +	 */ +	if (!trans->bytes_reserved && +	    src_rsv != &root->fs_info->delalloc_block_rsv) { +		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); +		/* +		 * Since we're under a transaction reserve_metadata_bytes could +		 * try to commit the transaction which will make it return +		 * EAGAIN to make us stop the transaction we have, so return +		 * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
+		 */ +		if (ret == -EAGAIN) +			ret = -ENOSPC; +		if (!ret) +			node->bytes_reserved = num_bytes; +		return ret; +	} +  	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);  	if (!ret)  		node->bytes_reserved = num_bytes; @@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,  	if (!node->bytes_reserved)  		return; -	rsv = &root->fs_info->global_block_rsv; +	rsv = &root->fs_info->delayed_block_rsv;  	btrfs_block_rsv_release(root, rsv,  				node->bytes_reserved);  	node->bytes_reserved = 0; @@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,  	path->leave_spinning = 1;  	block_rsv = trans->block_rsv; -	trans->block_rsv = &root->fs_info->global_block_rsv; +	trans->block_rsv = &root->fs_info->delayed_block_rsv;  	delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,  	path->leave_spinning = 1;  	block_rsv = trans->block_rsv; -	trans->block_rsv = &node->root->fs_info->global_block_rsv; +	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;  	ret = btrfs_insert_delayed_items(trans, path, node->root, node);  	if (!ret) @@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  		goto free_path;  	block_rsv = trans->block_rsv; -	trans->block_rsv = &root->fs_info->global_block_rsv; +	trans->block_rsv = &root->fs_info->delayed_block_rsv;  	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);  	if (!ret) @@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,  	}  	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); -	/* -	 * we must reserve enough space when we start a new transaction, -	 * so reserving metadata failure is impossible -	 */ -	BUG_ON(ret); +	if (ret) +		goto release_node;  	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);  	delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07ea91879a9..102c176fc29 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)  static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,  			   int verify)  { -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	char *result = NULL;  	unsigned long len;  	unsigned long cur_len; @@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,  	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);  	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;  	while (1) { -		ret = read_extent_buffer_pages(io_tree, eb, start, 1, +		ret = read_extent_buffer_pages(io_tree, eb, start, +					       WAIT_COMPLETE,  					       btree_get_extent, mirror_num);  		if (!ret &&  		    !verify_parent_transid(io_tree, eb, parent_transid)) @@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,  	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);  	end = eb->start + end - 1;  err: +	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { +		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); +		btree_readahead_hook(root, eb, eb->start, ret); +	} +  	free_extent_buffer(eb);  out:  	return ret;  } +static int btree_io_failed_hook(struct bio *failed_bio, +			 struct page *page, u64 start, u64 end, +			 u64 mirror_num, struct extent_state *state) +{ +	struct extent_io_tree *tree; +	
unsigned long len; +	struct extent_buffer *eb; +	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + +	tree = &BTRFS_I(page->mapping->host)->io_tree; +	if (page->private == EXTENT_PAGE_PRIVATE) +		goto out; +	if (!page->private) +		goto out; + +	len = page->private >> 2; +	WARN_ON(len == 0); + +	eb = alloc_extent_buffer(tree, start, len, page); +	if (eb == NULL) +		goto out; + +	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { +		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); +		btree_readahead_hook(root, eb, eb->start, -EIO); +	} +	free_extent_buffer(eb); + +out: +	return -EIO;	/* we fixed nothing */ +} +  static void end_workqueue_bio(struct bio *bio, int err)  {  	struct end_io_wq *end_io_wq = bio->bi_private; @@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)  {  	struct extent_io_tree *tree;  	tree = &BTRFS_I(page->mapping->host)->io_tree; -	return extent_read_full_page(tree, page, btree_get_extent); +	return extent_read_full_page(tree, page, btree_get_extent, 0);  }  static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,  	if (!buf)  		return 0;  	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, -				 buf, 0, 0, btree_get_extent, 0); +				 buf, 0, WAIT_NONE, btree_get_extent, 0);  	free_extent_buffer(buf);  	return ret;  } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +			 int mirror_num, struct extent_buffer **eb) +{ +	struct extent_buffer *buf = NULL; +	struct inode *btree_inode = root->fs_info->btree_inode; +	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; +	int ret; + +	buf = btrfs_find_create_tree_block(root, bytenr, blocksize); +	if (!buf) +		return 0; + +	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + +	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, +				       btree_get_extent, mirror_num); +	if (ret) { +		free_extent_buffer(buf); +		return ret; +	} + +	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { +		free_extent_buffer(buf); +		return -EIO; +	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) { +		*eb = buf; +	} else { +		free_extent_buffer(buf); +	} +	return 0; +} +  struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,  					    u64 bytenr, u32 blocksize)  { @@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,  	generation = btrfs_root_generation(&root->root_item);  	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); +	root->commit_root = NULL;  	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),  				     blocksize, generation);  	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {  		free_extent_buffer(root->node); +		root->node = NULL;  		return -EIO;  	}  	root->commit_root = btrfs_root_node(root); @@ -1577,6 +1648,235 @@ sleep:  	return 0;  } +/* + * this will find the highest generation in the array of + * root backups.  The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest  root in the array with the generation + * in the super block.  If they don't match we pitch it. 
+ */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ +	u64 cur; +	int newest_index = -1; +	struct btrfs_root_backup *root_backup; +	int i; + +	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { +		root_backup = info->super_copy->super_roots + i; +		cur = btrfs_backup_tree_root_gen(root_backup); +		if (cur == newest_gen) +			newest_index = i; +	} + +	/* check to see if we actually wrapped around */ +	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { +		root_backup = info->super_copy->super_roots; +		cur = btrfs_backup_tree_root_gen(root_backup); +		if (cur == newest_gen) +			newest_index = 0; +	} +	return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array.  This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, +				     u64 newest_gen) +{ +	int newest_index = -1; + +	newest_index = find_newest_super_backup(info, newest_gen); +	/* if there was garbage in there, just move along */ +	if (newest_index == -1) { +		info->backup_root_index = 0; +	} else { +		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; +	} +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ +	int next_backup; +	struct btrfs_root_backup *root_backup; +	int last_backup; + +	next_backup = info->backup_root_index; +	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % +		BTRFS_NUM_BACKUP_ROOTS; + +	/* +	 * just overwrite the last backup if we're at the same generation +	 * this happens only at umount +	 */ +	root_backup = info->super_for_commit->super_roots + last_backup; +	if (btrfs_backup_tree_root_gen(root_backup) == +	    btrfs_header_generation(info->tree_root->node)) +		next_backup = last_backup; + +	root_backup = info->super_for_commit->super_roots + next_backup; + +	/* +	 * make sure all of our padding and empty slots get zero filled +	 * regardless of which ones we use today +	 */ +	memset(root_backup, 0, sizeof(*root_backup)); + +	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + +	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); +	btrfs_set_backup_tree_root_gen(root_backup, +			       btrfs_header_generation(info->tree_root->node)); + +	btrfs_set_backup_tree_root_level(root_backup, +			       btrfs_header_level(info->tree_root->node)); + +	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); +	btrfs_set_backup_chunk_root_gen(root_backup, +			       btrfs_header_generation(info->chunk_root->node)); +	btrfs_set_backup_chunk_root_level(root_backup, +			       btrfs_header_level(info->chunk_root->node)); + +	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); +	btrfs_set_backup_extent_root_gen(root_backup, +			       btrfs_header_generation(info->extent_root->node)); +	btrfs_set_backup_extent_root_level(root_backup, +			       btrfs_header_level(info->extent_root->node)); + +	/* +	 * we might commit during log recovery, which happens before we set +	 * the fs_root.  Make sure it is valid before we fill it in. 
+	 */ +	if (info->fs_root && info->fs_root->node) { +		btrfs_set_backup_fs_root(root_backup, +					 info->fs_root->node->start); +		btrfs_set_backup_fs_root_gen(root_backup, +			       btrfs_header_generation(info->fs_root->node)); +		btrfs_set_backup_fs_root_level(root_backup, +			       btrfs_header_level(info->fs_root->node)); +	} + +	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); +	btrfs_set_backup_dev_root_gen(root_backup, +			       btrfs_header_generation(info->dev_root->node)); +	btrfs_set_backup_dev_root_level(root_backup, +				       btrfs_header_level(info->dev_root->node)); + +	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); +	btrfs_set_backup_csum_root_gen(root_backup, +			       btrfs_header_generation(info->csum_root->node)); +	btrfs_set_backup_csum_root_level(root_backup, +			       btrfs_header_level(info->csum_root->node)); + +	btrfs_set_backup_total_bytes(root_backup, +			     btrfs_super_total_bytes(info->super_copy)); +	btrfs_set_backup_bytes_used(root_backup, +			     btrfs_super_bytes_used(info->super_copy)); +	btrfs_set_backup_num_devices(root_backup, +			     btrfs_super_num_devices(info->super_copy)); + +	/* +	 * if we don't copy this out to the super_copy, it won't get remembered +	 * for the next commit +	 */ +	memcpy(&info->super_copy->super_roots, +	       &info->super_for_commit->super_roots, +	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block.  It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, +				     struct btrfs_super_block *super, +				     int *num_backups_tried, int *backup_index) +{ +	struct btrfs_root_backup *root_backup; +	int newest = *backup_index; + +	if (*num_backups_tried == 0) { +		u64 gen = btrfs_super_generation(super); + +		newest = find_newest_super_backup(info, gen); +		if (newest == -1) +			return -1; + +		*backup_index = newest; +		*num_backups_tried = 1; +	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { +		/* we've tried all the backups, all done */ +		return -1; +	} else { +		/* jump to the next oldest backup */ +		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % +			BTRFS_NUM_BACKUP_ROOTS; +		*backup_index = newest; +		*num_backups_tried += 1; +	} +	root_backup = super->super_roots + newest; + +	btrfs_set_super_generation(super, +				   btrfs_backup_tree_root_gen(root_backup)); +	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); +	btrfs_set_super_root_level(super, +				   btrfs_backup_tree_root_level(root_backup)); +	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + +	/* +	 * fixme: the total bytes and num_devices need to match or we should +	 * need a fsck +	 */ +	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); +	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); +	return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ +	free_extent_buffer(info->tree_root->node); +	free_extent_buffer(info->tree_root->commit_root); +	free_extent_buffer(info->dev_root->node); +	free_extent_buffer(info->dev_root->commit_root); +	free_extent_buffer(info->extent_root->node); +	
free_extent_buffer(info->extent_root->commit_root); +	free_extent_buffer(info->csum_root->node); +	free_extent_buffer(info->csum_root->commit_root); + +	info->tree_root->node = NULL; +	info->tree_root->commit_root = NULL; +	info->dev_root->node = NULL; +	info->dev_root->commit_root = NULL; +	info->extent_root->node = NULL; +	info->extent_root->commit_root = NULL; +	info->csum_root->node = NULL; +	info->csum_root->commit_root = NULL; + +	if (chunk_root) { +		free_extent_buffer(info->chunk_root->node); +		free_extent_buffer(info->chunk_root->commit_root); +		info->chunk_root->node = NULL; +		info->chunk_root->commit_root = NULL; +	} +} + +  struct btrfs_root *open_ctree(struct super_block *sb,  			      struct btrfs_fs_devices *fs_devices,  			      char *options) @@ -1604,6 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	int ret;  	int err = -EINVAL; +	int num_backups_tried = 0; +	int backup_index = 0;  	struct btrfs_super_block *disk_super; @@ -1648,6 +1950,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->fs_roots_radix_lock);  	spin_lock_init(&fs_info->delayed_iput_lock);  	spin_lock_init(&fs_info->defrag_inodes_lock); +	spin_lock_init(&fs_info->free_chunk_lock);  	mutex_init(&fs_info->reloc_mutex);  	init_completion(&fs_info->kobj_unregister); @@ -1665,8 +1968,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	btrfs_init_block_rsv(&fs_info->trans_block_rsv);  	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);  	btrfs_init_block_rsv(&fs_info->empty_block_rsv); -	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); -	mutex_init(&fs_info->durable_block_rsv_mutex); +	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);  	atomic_set(&fs_info->nr_async_submits, 0);  	atomic_set(&fs_info->async_delalloc_pages, 0);  	atomic_set(&fs_info->async_submit_draining, 0); @@ -1677,6 +1979,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	fs_info->metadata_ratio = 0;  	fs_info->defrag_inodes = RB_ROOT;  	fs_info->trans_no_join = 0; +	fs_info->free_chunk_space = 0; + +	/* readahead state */ +	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); +	spin_lock_init(&fs_info->reada_lock);  	fs_info->thread_pool_size = min_t(unsigned long,  					  num_online_cpus() + 2, 8); @@ -1766,14 +2073,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,  		goto fail_alloc;  	} -	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); -	memcpy(&fs_info->super_for_commit, &fs_info->super_copy, -	       sizeof(fs_info->super_for_commit)); +	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); +	memcpy(fs_info->super_for_commit, fs_info->super_copy, +	       sizeof(*fs_info->super_for_commit));  	brelse(bh); -	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); +	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); -	disk_super = &fs_info->super_copy; +	disk_super = fs_info->super_copy;  	if (!btrfs_super_root(disk_super))  		goto fail_alloc; @@ -1783,6 +2090,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);  	/* +	 * run through our array of backup supers and setup +	 * our ring pointer to the oldest one +	 */ +	generation = btrfs_super_generation(disk_super); +	find_oldest_super_backup(fs_info, generation); + +	/*  	 * In the long term, we'll store the compression type in the super  	 * block, and it'll be used for per file compression control.  	 
*/ @@ -1870,6 +2184,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",  			   fs_info->thread_pool_size,  			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->readahead_workers, "readahead", +			   fs_info->thread_pool_size, +			   &fs_info->generic_worker);  	/*  	 * endios are largely parallel and should have a very @@ -1880,6 +2197,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	fs_info->endio_write_workers.idle_thresh = 2;  	fs_info->endio_meta_write_workers.idle_thresh = 2; +	fs_info->readahead_workers.idle_thresh = 2;  	btrfs_start_workers(&fs_info->workers, 1);  	btrfs_start_workers(&fs_info->generic_worker, 1); @@ -1893,6 +2211,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);  	btrfs_start_workers(&fs_info->delayed_workers, 1);  	btrfs_start_workers(&fs_info->caching_workers, 1); +	btrfs_start_workers(&fs_info->readahead_workers, 1);  	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);  	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1939,7 +2258,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {  		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",  		       sb->s_id); -		goto fail_chunk_root; +		goto fail_tree_roots;  	}  	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);  	chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1954,11 +2273,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	if (ret) {  		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",  		       sb->s_id); -		goto fail_chunk_root; +		goto fail_tree_roots;  	}  	btrfs_close_extra_devices(fs_devices); +retry_root_backup:  	blocksize = btrfs_level_size(tree_root,  				     btrfs_super_root_level(disk_super));  	generation = btrfs_super_generation(disk_super); @@ -1966,32 +2286,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	tree_root->node = read_tree_block(tree_root,  					  btrfs_super_root(disk_super),  					  blocksize, generation); -	if (!tree_root->node) -		goto fail_chunk_root; -	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { +	if (!tree_root->node || +	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {  		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",  		       sb->s_id); -		goto fail_tree_root; + +		goto recovery_tree_root;  	} +  	btrfs_set_root_node(&tree_root->root_item, tree_root->node);  	tree_root->commit_root = btrfs_root_node(tree_root);  	ret = find_and_setup_root(tree_root, fs_info,  				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);  	if (ret) -		goto fail_tree_root; +		goto recovery_tree_root;  	extent_root->track_dirty = 1;  	ret = find_and_setup_root(tree_root, fs_info,  				  BTRFS_DEV_TREE_OBJECTID, dev_root);  	if (ret) -		goto fail_extent_root; +		goto recovery_tree_root;  	dev_root->track_dirty = 1;  	ret = find_and_setup_root(tree_root, fs_info,  				  BTRFS_CSUM_TREE_OBJECTID, csum_root);  	if (ret) -		goto fail_dev_root; +		goto recovery_tree_root;  	csum_root->track_dirty = 1; @@ -2124,22 +2445,13 @@ fail_cleaner:  fail_block_groups:  	btrfs_free_block_groups(fs_info); -	free_extent_buffer(csum_root->node); -	free_extent_buffer(csum_root->commit_root); -fail_dev_root: -	free_extent_buffer(dev_root->node); -	free_extent_buffer(dev_root->commit_root); -fail_extent_root: -	free_extent_buffer(extent_root->node); 
-	free_extent_buffer(extent_root->commit_root); -fail_tree_root: -	free_extent_buffer(tree_root->node); -	free_extent_buffer(tree_root->commit_root); -fail_chunk_root: -	free_extent_buffer(chunk_root->node); -	free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: +	free_root_pointers(fs_info, 1); +  fail_sb_buffer:  	btrfs_stop_workers(&fs_info->generic_worker); +	btrfs_stop_workers(&fs_info->readahead_workers);  	btrfs_stop_workers(&fs_info->fixup_workers);  	btrfs_stop_workers(&fs_info->delalloc_workers);  	btrfs_stop_workers(&fs_info->workers); @@ -2152,7 +2464,6 @@ fail_sb_buffer:  	btrfs_stop_workers(&fs_info->delayed_workers);  	btrfs_stop_workers(&fs_info->caching_workers);  fail_alloc: -	kfree(fs_info->delayed_root);  fail_iput:  	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  	iput(fs_info->btree_inode); @@ -2164,13 +2475,27 @@ fail_bdi:  fail_srcu:  	cleanup_srcu_struct(&fs_info->subvol_srcu);  fail: -	kfree(extent_root); -	kfree(tree_root); -	kfree(fs_info); -	kfree(chunk_root); -	kfree(dev_root); -	kfree(csum_root); +	free_fs_info(fs_info);  	return ERR_PTR(err); + +recovery_tree_root: + +	if (!btrfs_test_opt(tree_root, RECOVERY)) +		goto fail_tree_roots; + +	free_root_pointers(fs_info, 0); + +	/* don't use the log in recovery mode, it won't be valid */ +	btrfs_set_super_log_root(disk_super, 0); + +	/* we can't trust the free space cache either */ +	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + +	ret = next_root_backup(fs_info, fs_info->super_copy, +			       &num_backups_tried, &backup_index); +	if (ret == -1) +		goto fail_block_groups; +	goto retry_root_backup;  }  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2338,10 +2663,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)  	int total_errors = 0;  	u64 flags; -	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; +	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;  	do_barriers = !btrfs_test_opt(root, NOBARRIER); +	backup_super_roots(root->fs_info); -	sb = &root->fs_info->super_for_commit; +	sb = root->fs_info->super_for_commit;  	dev_item = &sb->dev_item;  	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); @@ -2545,8 +2871,6 @@ int close_ctree(struct btrfs_root *root)  	/* clear out the rbtree of defraggable inodes */  	btrfs_run_defrag_inodes(root->fs_info); -	btrfs_put_block_group_cache(fs_info); -  	/*  	 * Here come 2 situations when btrfs is broken to flip readonly:  	 * @@ -2572,6 +2896,8 @@ int close_ctree(struct btrfs_root *root)  			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);  	} +	btrfs_put_block_group_cache(fs_info); +  	kthread_stop(root->fs_info->transaction_kthread);  	kthread_stop(root->fs_info->cleaner_kthread); @@ -2603,7 +2929,6 @@ int close_ctree(struct btrfs_root *root)  	del_fs_roots(fs_info);  	iput(fs_info->btree_inode); -	kfree(fs_info->delayed_root);  	btrfs_stop_workers(&fs_info->generic_worker);  	btrfs_stop_workers(&fs_info->fixup_workers); @@ -2617,6 +2942,7 @@ int close_ctree(struct btrfs_root *root)  	btrfs_stop_workers(&fs_info->submit_workers);  	btrfs_stop_workers(&fs_info->delayed_workers);  	btrfs_stop_workers(&fs_info->caching_workers); +	btrfs_stop_workers(&fs_info->readahead_workers);  	btrfs_close_devices(fs_info->fs_devices);  	btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2624,12 +2950,7 @@ int close_ctree(struct btrfs_root *root)  	bdi_destroy(&fs_info->bdi);  	cleanup_srcu_struct(&fs_info->subvol_srcu); -	kfree(fs_info->extent_root); -	
kfree(fs_info->tree_root); -	kfree(fs_info->chunk_root); -	kfree(fs_info->dev_root); -	kfree(fs_info->csum_root); -	kfree(fs_info); +	free_fs_info(fs_info);  	return 0;  } @@ -2735,7 +3056,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)  	return ret;  } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, +				void (*flush_fn)(void *))  {  	struct inode *inode = page->mapping->host;  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +3074,10 @@ int btree_lock_page_hook(struct page *page)  	if (!eb)  		goto out; -	btrfs_tree_lock(eb); +	if (!btrfs_try_tree_write_lock(eb)) { +		flush_fn(data); +		btrfs_tree_lock(eb); +	}  	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);  	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +3092,10 @@ int btree_lock_page_hook(struct page *page)  	btrfs_tree_unlock(eb);  	free_extent_buffer(eb);  out: -	lock_page(page); +	if (!trylock_page(page)) { +		flush_fn(data); +		lock_page(page); +	}  	return 0;  } @@ -3123,6 +3451,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)  static struct extent_io_ops btree_extent_io_ops = {  	.write_cache_pages_lock_hook = btree_lock_page_hook,  	.readpage_end_io_hook = btree_readpage_end_io_hook, +	.readpage_io_failed_hook = btree_io_failed_hook,  	.submit_bio_hook = btree_submit_bio_hook,  	/* note we're sharing with inode.c for the merge bio hook */  	.merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67..c99d0a8f13f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,  				      u32 blocksize, u64 parent_transid);  int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,  			 u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +			 int mirror_num, struct extent_buffer **eb);  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,  						   u64 bytenr, u32 blocksize);  int clean_tree_block(struct btrfs_trans_handle *trans, @@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,  			     struct btrfs_fs_info *fs_info);  int btrfs_add_log_tree(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); -  #ifdef CONFIG_DEBUG_LOCK_ALLOC  void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c9ee0e18bbd..9879bd47463 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@  #include <linux/rcupdate.h>  #include <linux/kthread.h>  #include <linux/slab.h> +#include <linux/ratelimit.h>  #include "compat.h"  #include "hash.h"  #include "ctree.h" @@ -52,6 +53,21 @@ enum {  	CHUNK_ALLOC_LIMITED = 2,  }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. 
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + *   ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + *   bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { +	RESERVE_FREE = 0, +	RESERVE_ALLOC = 1, +	RESERVE_ALLOC_NO_ACCOUNT = 2, +}; +  static int update_block_group(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root,  			      u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,  			 struct btrfs_key *key);  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  			    int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, +				       u64 num_bytes, int reserve);  static noinline int  block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)  	if (atomic_dec_and_test(&cache->count)) {  		WARN_ON(cache->pinned > 0);  		WARN_ON(cache->reserved > 0); -		WARN_ON(cache->reserved_pinned > 0);  		kfree(cache->free_space_ctl);  		kfree(cache);  	} @@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  	 * we likely hold important locks.  	 */  	if (trans && (!trans->transaction->in_commit) && -	    (root && root != root->fs_info->tree_root)) { +	    (root && root != root->fs_info->tree_root) && +	    btrfs_test_opt(root, SPACE_CACHE)) {  		spin_lock(&cache->lock);  		if (cache->cached != BTRFS_CACHE_NO) {  			spin_unlock(&cache->lock); @@ -1770,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  {  	int ret;  	u64 discarded_bytes = 0; -	struct btrfs_multi_bio *multi = NULL; +	struct btrfs_bio *bbio = NULL;  	/* Tell the block device(s) that the sectors can be discarded */  	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, -			      bytenr, &num_bytes, &multi, 0); +			      bytenr, &num_bytes, &bbio, 0);  	if (!ret) { -		struct btrfs_bio_stripe *stripe = multi->stripes; +		struct btrfs_bio_stripe *stripe = bbio->stripes;  		int i; -		for (i = 0; i < multi->num_stripes; i++, stripe++) { +		for (i = 0; i < bbio->num_stripes; i++, stripe++) {  			if (!stripe->dev->can_discard)  				continue; @@ -1800,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  			 */  			ret = 0;  		} -		kfree(multi); +		kfree(bbio);  	}  	if (actual_bytes) @@ -2700,6 +2718,13 @@ again:  		goto again;  	} +	/* We've already setup this transaction, go ahead and exit */ +	if (block_group->cache_generation == trans->transid && +	    i_size_read(inode)) { +		dcs = BTRFS_DC_SETUP; +		goto out_put; +	} +  	/*  	 * We want to set the generation to 0, that way if anything goes wrong  	 * from here on out we know not to trust this cache when we load up next @@ -2749,12 +2774,15 @@ again:  	if (!ret)  		dcs = BTRFS_DC_SETUP;  	btrfs_free_reserved_data_space(inode, num_pages); +  out_put:  	iput(inode);  out_free:  	btrfs_release_path(path);  out:  	spin_lock(&block_group->lock); +	if (!ret) +		block_group->cache_generation = trans->transid;  	block_group->disk_cache_state = dcs;  	spin_unlock(&block_group->lock); @@ -3122,16 +3150,13 @@ commit_trans:  		return -ENOSPC;  	}  	data_sinfo->bytes_may_use += bytes; -	BTRFS_I(inode)->reserved_bytes += bytes;  	spin_unlock(&data_sinfo->lock);  	return 0;  }  /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an 
error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode.   */  void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  { @@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  	data_sinfo = BTRFS_I(inode)->space_info;  	spin_lock(&data_sinfo->lock);  	data_sinfo->bytes_may_use -= bytes; -	BTRFS_I(inode)->reserved_bytes -= bytes;  	spin_unlock(&data_sinfo->lock);  } @@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,  			      struct btrfs_space_info *sinfo, u64 alloc_bytes,  			      int force)  { +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;  	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;  	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;  	u64 thresh; @@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,  		return 1;  	/* +	 * We need to take into account the global rsv because for all intents +	 * and purposes it's used space.  Don't worry about locking the +	 * global_rsv, it doesn't change except when the transaction commits. +	 */ +	num_allocated += global_rsv->size; + +	/*  	 * in limited mode, we want to have some free space up to  	 * about 1% of the FS size.  	 */  	if (force == CHUNK_ALLOC_LIMITED) { -		thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); +		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);  		thresh = max_t(u64, 64 * 1024 * 1024,  			       div_factor_fine(thresh, 1)); @@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,  	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))  		return 0; -	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); +	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);  	/* 256MB or 5% of the FS */  	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3302,24 +3334,26 @@ out:  /*   * shrink metadata reservation for delalloc   */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, -			   struct btrfs_root *root, u64 to_reclaim, int sync) +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, +			   bool wait_ordered)  {  	struct btrfs_block_rsv *block_rsv;  	struct btrfs_space_info *space_info; +	struct btrfs_trans_handle *trans;  	u64 reserved;  	u64 max_reclaim;  	u64 reclaimed = 0;  	long time_left; -	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; +	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;  	int loops = 0;  	unsigned long progress; +	trans = (struct btrfs_trans_handle *)current->journal_info;  	block_rsv = &root->fs_info->delalloc_block_rsv;  	space_info = block_rsv->space_info;  	smp_mb(); -	reserved = space_info->bytes_reserved; +	reserved = space_info->bytes_may_use;  	progress = space_info->reservation_progress;  	if (reserved == 0) @@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,  	}  	max_reclaim = min(reserved, to_reclaim); - +	nr_pages = max_t(unsigned long, nr_pages, +			 max_reclaim >> PAGE_CACHE_SHIFT);  	while (loops < 1024) {  		/* have the flusher threads jump in and do some IO */  		smp_mb(); @@ -3344,9 +3379,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,  						WB_REASON_FS_FREE_SPACE);  		spin_lock(&space_info->lock); -		if (reserved > space_info->bytes_reserved) -			reclaimed += reserved - space_info->bytes_reserved; -		reserved = space_info->bytes_reserved; +		if (reserved > 
space_info->bytes_may_use) +			reclaimed += reserved - space_info->bytes_may_use; +		reserved = space_info->bytes_may_use;  		spin_unlock(&space_info->lock);  		loops++; @@ -3357,11 +3392,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,  		if (trans && trans->transaction->blocked)  			return -EAGAIN; -		time_left = schedule_timeout_interruptible(1); +		if (wait_ordered && !trans) { +			btrfs_wait_ordered_extents(root, 0, 0); +		} else { +			time_left = schedule_timeout_interruptible(1); -		/* We were interrupted, exit */ -		if (time_left) -			break; +			/* We were interrupted, exit */ +			if (time_left) +				break; +		}  		/* we've kicked the IO a few times, if anything has been freed,  		 * exit.  There is no sense in looping here for a long time @@ -3376,34 +3415,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,  		}  	} -	if (reclaimed >= to_reclaim && !trans) -		btrfs_wait_ordered_extents(root, 0, 0); +  	return reclaimed >= to_reclaim;  } -/* - * Retries tells us how many times we've called reserve_metadata_bytes.  The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space.  That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit   * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does.  Otherwise it + * will return -ENOSPC.   */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, -				  struct btrfs_root *root, +static int may_commit_transaction(struct btrfs_root *root, +				  struct btrfs_space_info *space_info, +				  u64 bytes, int force) +{ +	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; +	struct btrfs_trans_handle *trans; + +	trans = (struct btrfs_trans_handle *)current->journal_info; +	if (trans) +		return -EAGAIN; + +	if (force) +		goto commit; + +	/* See if there is enough pinned space to make this reservation */ +	spin_lock(&space_info->lock); +	if (space_info->bytes_pinned >= bytes) { +		spin_unlock(&space_info->lock); +		goto commit; +	} +	spin_unlock(&space_info->lock); + +	/* +	 * See if there is some space in the delayed insertion reservation for +	 * this reservation. +	 */ +	if (space_info != delayed_rsv->space_info) +		return -ENOSPC; + +	spin_lock(&delayed_rsv->lock); +	if (delayed_rsv->size < bytes) { +		spin_unlock(&delayed_rsv->lock); +		return -ENOSPC; +	} +	spin_unlock(&delayed_rsv->lock); + +commit: +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) +		return -ENOSPC; + +	return btrfs_commit_transaction(trans, root); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv.  
If there is not enough space it will make an attempt to + * flush out space to make room.  It will do this by flushing delalloc if + * possible or committing the transaction.  If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root,  				  struct btrfs_block_rsv *block_rsv,  				  u64 orig_bytes, int flush)  {  	struct btrfs_space_info *space_info = block_rsv->space_info; -	u64 unused; +	u64 used;  	u64 num_bytes = orig_bytes;  	int retries = 0;  	int ret = 0;  	bool committed = false;  	bool flushing = false; +	bool wait_ordered = false;  again:  	ret = 0; @@ -3420,7 +3515,7 @@ again:  		 * deadlock since we are waiting for the flusher to finish, but  		 * hold the current transaction open.  		 */ -		if (trans) +		if (current->journal_info)  			return -EAGAIN;  		ret = wait_event_interruptible(space_info->wait,  					       !space_info->flush); @@ -3432,9 +3527,9 @@ again:  	}  	ret = -ENOSPC; -	unused = space_info->bytes_used + space_info->bytes_reserved + -		 space_info->bytes_pinned + space_info->bytes_readonly + -		 space_info->bytes_may_use; +	used = space_info->bytes_used + space_info->bytes_reserved + +		space_info->bytes_pinned + space_info->bytes_readonly + +		space_info->bytes_may_use;  	/*  	 * The idea here is that we've not already over-reserved the block group @@ -3443,10 +3538,9 @@ again:  	 * lets start flushing stuff first and then come back and try to make  	 * our reservation.  	 */ -	if (unused <= space_info->total_bytes) { -		unused = space_info->total_bytes - unused; -		if (unused >= num_bytes) { -			space_info->bytes_reserved += orig_bytes; +	if (used <= space_info->total_bytes) { +		if (used + orig_bytes <= space_info->total_bytes) { +			space_info->bytes_may_use += orig_bytes;  			ret = 0;  		} else {  			/* @@ -3462,10 +3556,64 @@ again:  		 * amount plus the amount of bytes that we need for this  		 * reservation.  		 */ -		num_bytes = unused - space_info->total_bytes + +		wait_ordered = true; +		num_bytes = used - space_info->total_bytes +  			(orig_bytes * (retries + 1));  	} +	if (ret) { +		u64 profile = btrfs_get_alloc_profile(root, 0); +		u64 avail; + +		/* +		 * If we have a lot of space that's pinned, don't bother doing +		 * the overcommit dance yet and just commit the transaction. +		 */ +		avail = (space_info->total_bytes - space_info->bytes_used) * 8; +		do_div(avail, 10); +		if (space_info->bytes_pinned >= avail && flush && !committed) { +			space_info->flush = 1; +			flushing = true; +			spin_unlock(&space_info->lock); +			ret = may_commit_transaction(root, space_info, +						     orig_bytes, 1); +			if (ret) +				goto out; +			committed = true; +			goto again; +		} + +		spin_lock(&root->fs_info->free_chunk_lock); +		avail = root->fs_info->free_chunk_space; + +		/* +		 * If we have dup, raid1 or raid10 then only half of the free +		 * space is actually useable. +		 */ +		if (profile & (BTRFS_BLOCK_GROUP_DUP | +			       BTRFS_BLOCK_GROUP_RAID1 | +			       BTRFS_BLOCK_GROUP_RAID10)) +			avail >>= 1; + +		/* +		 * If we aren't flushing don't let us overcommit too much, say +		 * 1/8th of the space.  If we can flush, let it overcommit up to +		 * 1/2 of the space. 
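 *
 * [Editor's note -- not part of this patch: after the RAID halving above and
 *  the flush-dependent shift below, avail is derived entirely from
 *  fs_info->free_chunk_space, and the reservation is allowed to overcommit
 *  whenever
 *
 *	used + num_bytes < space_info->total_bytes + avail
 *
 *  so free_chunk_space is treated as extra headroom on top of the
 *  space_info totals.]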
+		 */ +		if (flush) +			avail >>= 3; +		else +			avail >>= 1; +		 spin_unlock(&root->fs_info->free_chunk_lock); + +		if (used + num_bytes < space_info->total_bytes + avail) { +			space_info->bytes_may_use += orig_bytes; +			ret = 0; +		} else { +			wait_ordered = true; +		} +	} +  	/*  	 * Couldn't make our reservation, save our place so while we're trying  	 * to reclaim space we can actually use it instead of somebody else @@ -3485,7 +3633,7 @@ again:  	 * We do synchronous shrinking since we don't actually unreserve  	 * metadata until after the IO is completed.  	 */ -	ret = shrink_delalloc(trans, root, num_bytes, 1); +	ret = shrink_delalloc(root, num_bytes, wait_ordered);  	if (ret < 0)  		goto out; @@ -3497,35 +3645,17 @@ again:  	 * so go back around and try again.  	 */  	if (retries < 2) { +		wait_ordered = true;  		retries++;  		goto again;  	} -	/* -	 * Not enough space to be reclaimed, don't bother committing the -	 * transaction. -	 */ -	spin_lock(&space_info->lock); -	if (space_info->bytes_pinned < orig_bytes) -		ret = -ENOSPC; -	spin_unlock(&space_info->lock); -	if (ret) -		goto out; - -	ret = -EAGAIN; -	if (trans) -		goto out; -  	ret = -ENOSPC;  	if (committed)  		goto out; -	trans = btrfs_join_transaction(root); -	if (IS_ERR(trans)) -		goto out; -	ret = btrfs_commit_transaction(trans, root); +	ret = may_commit_transaction(root, space_info, orig_bytes, 0);  	if (!ret) { -		trans = NULL;  		committed = true;  		goto again;  	} @@ -3543,10 +3673,12 @@ out:  static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,  					     struct btrfs_root *root)  { -	struct btrfs_block_rsv *block_rsv; -	if (root->ref_cows) +	struct btrfs_block_rsv *block_rsv = NULL; + +	if (root->ref_cows || root == root->fs_info->csum_root)  		block_rsv = trans->block_rsv; -	else + +	if (!block_rsv)  		block_rsv = root->block_rsv;  	if (!block_rsv) @@ -3617,7 +3749,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,  		}  		if (num_bytes) {  			spin_lock(&space_info->lock); -			space_info->bytes_reserved -= num_bytes; +			space_info->bytes_may_use -= num_bytes;  			space_info->reservation_progress++;  			spin_unlock(&space_info->lock);  		} @@ -3641,9 +3773,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)  {  	memset(rsv, 0, sizeof(*rsv));  	spin_lock_init(&rsv->lock); -	atomic_set(&rsv->usage, 1); -	rsv->priority = 6; -	INIT_LIST_HEAD(&rsv->list);  }  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3664,38 +3793,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv)  { -	if (rsv && atomic_dec_and_test(&rsv->usage)) { -		btrfs_block_rsv_release(root, rsv, (u64)-1); -		if (!rsv->durable) -			kfree(rsv); -	} +	btrfs_block_rsv_release(root, rsv, (u64)-1); +	kfree(rsv);  } -/* - * make the block_rsv struct be able to capture freed space. 
- * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, -				 struct btrfs_block_rsv *block_rsv) +int btrfs_block_rsv_add(struct btrfs_root *root, +			struct btrfs_block_rsv *block_rsv, +			u64 num_bytes)  { -	block_rsv->durable = 1; -	mutex_lock(&fs_info->durable_block_rsv_mutex); -	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); -	mutex_unlock(&fs_info->durable_block_rsv_mutex); +	int ret; + +	if (num_bytes == 0) +		return 0; + +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); +	if (!ret) { +		block_rsv_add_bytes(block_rsv, num_bytes, 1); +		return 0; +	} + +	return ret;  } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, -			u64 num_bytes) +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, +				struct btrfs_block_rsv *block_rsv, +				u64 num_bytes)  {  	int ret;  	if (num_bytes == 0)  		return 0; -	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);  	if (!ret) {  		block_rsv_add_bytes(block_rsv, num_bytes, 1);  		return 0; @@ -3704,55 +3833,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,  	return ret;  } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, -			  struct btrfs_root *root, -			  struct btrfs_block_rsv *block_rsv, -			  u64 min_reserved, int min_factor) +int btrfs_block_rsv_check(struct btrfs_root *root, +			  struct btrfs_block_rsv *block_rsv, int min_factor)  {  	u64 num_bytes = 0; -	int commit_trans = 0;  	int ret = -ENOSPC;  	if (!block_rsv)  		return 0;  	spin_lock(&block_rsv->lock); -	if (min_factor > 0) -		num_bytes = div_factor(block_rsv->size, min_factor); -	if (min_reserved > num_bytes) -		num_bytes = min_reserved; +	num_bytes = div_factor(block_rsv->size, min_factor); +	if (block_rsv->reserved >= num_bytes) +		ret = 0; +	spin_unlock(&block_rsv->lock); -	if (block_rsv->reserved >= num_bytes) { +	return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, +			  struct btrfs_block_rsv *block_rsv, +			  u64 min_reserved) +{ +	u64 num_bytes = 0; +	int ret = -ENOSPC; + +	if (!block_rsv) +		return 0; + +	spin_lock(&block_rsv->lock); +	num_bytes = min_reserved; +	if (block_rsv->reserved >= num_bytes)  		ret = 0; -	} else { +	else  		num_bytes -= block_rsv->reserved; -		if (block_rsv->durable && -		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) -			commit_trans = 1; -	}  	spin_unlock(&block_rsv->lock); +  	if (!ret)  		return 0; -	if (block_rsv->refill_used) { -		ret = reserve_metadata_bytes(trans, root, block_rsv, -					     num_bytes, 0); -		if (!ret) { -			block_rsv_add_bytes(block_rsv, num_bytes, 0); -			return 0; -		} -	} - -	if (commit_trans) { -		if (trans) -			return -EAGAIN; -		trans = btrfs_join_transaction(root); -		BUG_ON(IS_ERR(trans)); -		ret = btrfs_commit_transaction(trans, root); +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); +	if (!ret) { +		block_rsv_add_bytes(block_rsv, num_bytes, 0);  		return 0;  	} -	return -ENOSPC; +	return ret;  }  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, @@ -3784,7 +3910,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)  	u64 num_bytes;  	u64 meta_used;  	u64 data_used; -	int csum_size = btrfs_super_csum_size(&fs_info->super_copy); +	int csum_size = btrfs_super_csum_size(fs_info->super_copy);  	sinfo = __find_space_info(fs_info, 
BTRFS_BLOCK_GROUP_DATA);  	spin_lock(&sinfo->lock); @@ -3828,12 +3954,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  	if (sinfo->total_bytes > num_bytes) {  		num_bytes = sinfo->total_bytes - num_bytes;  		block_rsv->reserved += num_bytes; -		sinfo->bytes_reserved += num_bytes; +		sinfo->bytes_may_use += num_bytes;  	}  	if (block_rsv->reserved >= block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size; -		sinfo->bytes_reserved -= num_bytes; +		sinfo->bytes_may_use -= num_bytes;  		sinfo->reservation_progress++;  		block_rsv->reserved = block_rsv->size;  		block_rsv->full = 1; @@ -3849,16 +3975,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);  	fs_info->chunk_block_rsv.space_info = space_info; -	fs_info->chunk_block_rsv.priority = 10;  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);  	fs_info->global_block_rsv.space_info = space_info; -	fs_info->global_block_rsv.priority = 10; -	fs_info->global_block_rsv.refill_used = 1;  	fs_info->delalloc_block_rsv.space_info = space_info;  	fs_info->trans_block_rsv.space_info = space_info;  	fs_info->empty_block_rsv.space_info = space_info; -	fs_info->empty_block_rsv.priority = 10; +	fs_info->delayed_block_rsv.space_info = space_info;  	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3866,10 +3989,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)  	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; -	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - -	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); -  	update_global_block_rsv(fs_info);  } @@ -3882,37 +4001,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)  	WARN_ON(fs_info->trans_block_rsv.reserved > 0);  	WARN_ON(fs_info->chunk_block_rsv.size > 0);  	WARN_ON(fs_info->chunk_block_rsv.reserved > 0); -} - -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, -				    struct btrfs_root *root, -				    struct btrfs_block_rsv *rsv) -{ -	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; -	u64 num_bytes; -	int ret; - -	/* -	 * Truncate should be freeing data, but give us 2 items just in case it -	 * needs to use some space.  We may want to be smarter about this in the -	 * future. -	 */ -	num_bytes = btrfs_calc_trans_metadata_size(root, 2); - -	/* We already have enough bytes, just return */ -	if (rsv->reserved >= num_bytes) -		return 0; - -	num_bytes -= rsv->reserved; - -	/* -	 * You should have reserved enough space before hand to do this, so this -	 * should not fail. 
-	 */ -	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); -	BUG_ON(ret); - -	return 0; +	WARN_ON(fs_info->delayed_block_rsv.size > 0); +	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);  }  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -3921,9 +4011,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,  	if (!trans->bytes_reserved)  		return; -	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); -	btrfs_block_rsv_release(root, trans->block_rsv, -				trans->bytes_reserved); +	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);  	trans->bytes_reserved = 0;  } @@ -3965,11 +4053,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,  	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);  } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written.  This will return the number of + * reserved extents that need to be freed.  This must be called with + * BTRFS_I(inode)->lock held. + */  static unsigned drop_outstanding_extent(struct inode *inode)  {  	unsigned dropped_extents = 0; -	spin_lock(&BTRFS_I(inode)->lock);  	BUG_ON(!BTRFS_I(inode)->outstanding_extents);  	BTRFS_I(inode)->outstanding_extents--; @@ -3979,19 +4075,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)  	 */  	if (BTRFS_I(inode)->outstanding_extents >=  	    BTRFS_I(inode)->reserved_extents) -		goto out; +		return 0;  	dropped_extents = BTRFS_I(inode)->reserved_extents -  		BTRFS_I(inode)->outstanding_extents;  	BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: -	spin_unlock(&BTRFS_I(inode)->lock);  	return dropped_extents;  } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + *	reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed.  We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required.  If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved.  If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. 
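 *
 * [Editor's note -- not part of this patch: in rough pseudo-arithmetic, with
 *  csums_per_leaf being the per-leaf estimate computed below,
 *
 *	old_leaves = DIV_ROUND_UP(old_csum_bytes / sectorsize, csums_per_leaf);
 *	new_leaves = DIV_ROUND_UP(new_csum_bytes / sectorsize, csums_per_leaf);
 *	delta      = btrfs_calc_trans_metadata_size(root,
 *						    abs(new_leaves - old_leaves));
 *
 *  so the reservation only grows or shrinks when the change in csum_bytes is
 *  large enough to need (or give back) a whole extra leaf of checksum items.]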
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, +				   int reserve)  { -	return num_bytes >>= 3; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	u64 csum_size; +	int num_csums_per_leaf; +	int num_csums; +	int old_csums; + +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && +	    BTRFS_I(inode)->csum_bytes == 0) +		return 0; + +	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); +	if (reserve) +		BTRFS_I(inode)->csum_bytes += num_bytes; +	else +		BTRFS_I(inode)->csum_bytes -= num_bytes; +	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +	num_csums_per_leaf = (int)div64_u64(csum_size, +					    sizeof(struct btrfs_csum_item) + +					    sizeof(struct btrfs_disk_key)); +	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); +	num_csums = num_csums + num_csums_per_leaf - 1; +	num_csums = num_csums / num_csums_per_leaf; + +	old_csums = old_csums + num_csums_per_leaf - 1; +	old_csums = old_csums / num_csums_per_leaf; + +	/* No change, no need to reserve more */ +	if (old_csums == num_csums) +		return 0; + +	if (reserve) +		return btrfs_calc_trans_metadata_size(root, +						      num_csums - old_csums); + +	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);  }  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -4000,9 +4147,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;  	u64 to_reserve = 0;  	unsigned nr_extents = 0; +	int flush = 1;  	int ret; -	if (btrfs_transaction_in_commit(root->fs_info)) +	if (btrfs_is_free_space_inode(root, inode)) +		flush = 0; + +	if (flush && btrfs_transaction_in_commit(root->fs_info))  		schedule_timeout(1);  	num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4018,18 +4169,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);  	} +	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);  	spin_unlock(&BTRFS_I(inode)->lock); -	to_reserve += calc_csum_metadata_size(inode, num_bytes); -	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); +	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);  	if (ret) { +		u64 to_free = 0;  		unsigned dropped; + +		spin_lock(&BTRFS_I(inode)->lock); +		dropped = drop_outstanding_extent(inode); +		to_free = calc_csum_metadata_size(inode, num_bytes, 0); +		spin_unlock(&BTRFS_I(inode)->lock); +		to_free += btrfs_calc_trans_metadata_size(root, dropped); +  		/* -		 * We don't need the return value since our reservation failed, -		 * we just need to clean up our counter. +		 * Somebody could have come in and twiddled with the +		 * reservation, so if we have to free more than we would have +		 * reserved from this reservation go ahead and release those +		 * bytes.  		 */ -		dropped = drop_outstanding_extent(inode); -		WARN_ON(dropped > 1); +		to_free -= to_reserve; +		if (to_free) +			btrfs_block_rsv_release(root, block_rsv, to_free);  		return ret;  	} @@ -4038,6 +4200,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	return 0;  } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode.  
This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4045,9 +4216,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)  	unsigned dropped;  	num_bytes = ALIGN(num_bytes, root->sectorsize); +	spin_lock(&BTRFS_I(inode)->lock);  	dropped = drop_outstanding_extent(inode); -	to_free = calc_csum_metadata_size(inode, num_bytes); +	to_free = calc_csum_metadata_size(inode, num_bytes, 0); +	spin_unlock(&BTRFS_I(inode)->lock);  	if (dropped > 0)  		to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4055,6 +4228,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)  				to_free);  } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + *   extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)  {  	int ret; @@ -4072,6 +4260,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)  	return 0;  } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space.  This is + * called in the case that we don't need the metadata AND data reservations + * anymore.  So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. 
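 *
 * [Editor's note -- not part of this patch: the intended pairing, as a
 *  simplified sketch (the failed_or_inlined condition is illustrative only):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	...set up the delalloc range / copy in the data...
 *	if (failed_or_inlined)
 *		btrfs_delalloc_release_space(inode, num_bytes);
 *
 *  on the normal path the data reservation stays with the delalloc range and
 *  only the metadata part is dropped later, through
 *  btrfs_delalloc_release_metadata(), once the IO completes.]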
+ */  void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)  {  	btrfs_delalloc_release_metadata(inode, num_bytes); @@ -4091,12 +4292,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,  	/* block accounting for super block */  	spin_lock(&info->delalloc_lock); -	old_val = btrfs_super_bytes_used(&info->super_copy); +	old_val = btrfs_super_bytes_used(info->super_copy);  	if (alloc)  		old_val += num_bytes;  	else  		old_val -= num_bytes; -	btrfs_set_super_bytes_used(&info->super_copy, old_val); +	btrfs_set_super_bytes_used(info->super_copy, old_val);  	spin_unlock(&info->delalloc_lock);  	while (total) { @@ -4124,7 +4325,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  		spin_lock(&cache->space_info->lock);  		spin_lock(&cache->lock); -		if (btrfs_super_cache_generation(&info->super_copy) != 0 && +		if (btrfs_test_opt(root, SPACE_CACHE) &&  		    cache->disk_cache_state < BTRFS_DC_CLEAR)  			cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -4136,7 +4337,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,  			btrfs_set_block_group_used(&cache->item, old_val);  			cache->reserved -= num_bytes;  			cache->space_info->bytes_reserved -= num_bytes; -			cache->space_info->reservation_progress++;  			cache->space_info->bytes_used += num_bytes;  			cache->space_info->disk_used += num_bytes * factor;  			spin_unlock(&cache->lock); @@ -4188,7 +4388,6 @@ static int pin_down_extent(struct btrfs_root *root,  	if (reserved) {  		cache->reserved -= num_bytes;  		cache->space_info->bytes_reserved -= num_bytes; -		cache->space_info->reservation_progress++;  	}  	spin_unlock(&cache->lock);  	spin_unlock(&cache->space_info->lock); @@ -4216,45 +4415,82 @@ int btrfs_pin_extent(struct btrfs_root *root,  }  /* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. + * this function must be called within transaction   */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				u64 num_bytes, int reserve, int sinfo) +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, +				    struct btrfs_root *root, +				    u64 bytenr, u64 num_bytes)  { +	struct btrfs_block_group_cache *cache; + +	cache = btrfs_lookup_block_group(root->fs_info, bytenr); +	BUG_ON(!cache); + +	/* +	 * pull in the free space cache (if any) so that our pin +	 * removes the free space from the cache.  We have load_only set +	 * to one because the slow code to read in the free extents does check +	 * the pinned extents. +	 */ +	cache_block_group(cache, trans, root, 1); + +	pin_down_extent(root, cache, bytenr, num_bytes, 0); + +	/* remove us from the free space cache (if we're there at all) */ +	btrfs_remove_free_space(cache, bytenr, num_bytes); +	btrfs_put_block_group(cache); +	return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache:	The cache we are manipulating + * @num_bytes:	The number of bytes in question + * @reserve:	One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk.  For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting.  
For data we handle the reservation through clearing the + * delalloc bits in the io_tree.  We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. + */ +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, +				       u64 num_bytes, int reserve) +{ +	struct btrfs_space_info *space_info = cache->space_info;  	int ret = 0; -	if (sinfo) { -		struct btrfs_space_info *space_info = cache->space_info; -		spin_lock(&space_info->lock); -		spin_lock(&cache->lock); -		if (reserve) { -			if (cache->ro) { -				ret = -EAGAIN; -			} else { -				cache->reserved += num_bytes; -				space_info->bytes_reserved += num_bytes; -			} -		} else { -			if (cache->ro) -				space_info->bytes_readonly += num_bytes; -			cache->reserved -= num_bytes; -			space_info->bytes_reserved -= num_bytes; -			space_info->reservation_progress++; -		} -		spin_unlock(&cache->lock); -		spin_unlock(&space_info->lock); -	} else { -		spin_lock(&cache->lock); +	spin_lock(&space_info->lock); +	spin_lock(&cache->lock); +	if (reserve != RESERVE_FREE) {  		if (cache->ro) {  			ret = -EAGAIN;  		} else { -			if (reserve) -				cache->reserved += num_bytes; -			else -				cache->reserved -= num_bytes; +			cache->reserved += num_bytes; +			space_info->bytes_reserved += num_bytes; +			if (reserve == RESERVE_ALLOC) { +				BUG_ON(space_info->bytes_may_use < num_bytes); +				space_info->bytes_may_use -= num_bytes; +			}  		} -		spin_unlock(&cache->lock); +	} else { +		if (cache->ro) +			space_info->bytes_readonly += num_bytes; +		cache->reserved -= num_bytes; +		space_info->bytes_reserved -= num_bytes; +		space_info->reservation_progress++;  	} +	spin_unlock(&cache->lock); +	spin_unlock(&space_info->lock);  	return ret;  } @@ -4320,13 +4556,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  		spin_lock(&cache->lock);  		cache->pinned -= len;  		cache->space_info->bytes_pinned -= len; -		if (cache->ro) { +		if (cache->ro)  			cache->space_info->bytes_readonly += len; -		} else if (cache->reserved_pinned > 0) { -			len = min(len, cache->reserved_pinned); -			cache->reserved_pinned -= len; -			cache->space_info->bytes_reserved += len; -		}  		spin_unlock(&cache->lock);  		spin_unlock(&cache->space_info->lock);  	} @@ -4341,11 +4572,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct extent_io_tree *unpin; -	struct btrfs_block_rsv *block_rsv; -	struct btrfs_block_rsv *next_rsv;  	u64 start;  	u64 end; -	int idx;  	int ret;  	if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4368,30 +4596,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  		cond_resched();  	} -	mutex_lock(&fs_info->durable_block_rsv_mutex); -	list_for_each_entry_safe(block_rsv, next_rsv, -				 &fs_info->durable_block_rsv_list, list) { - -		idx = trans->transid & 0x1; -		if (block_rsv->freed[idx] > 0) { -			block_rsv_add_bytes(block_rsv, -					    block_rsv->freed[idx], 0); -			block_rsv->freed[idx] = 0; -		} -		if (atomic_read(&block_rsv->usage) == 0) { -			btrfs_block_rsv_release(root, block_rsv, (u64)-1); - -			if (block_rsv->freed[0] == 0 && -			    block_rsv->freed[1] == 0) { -				list_del_init(&block_rsv->list); -				kfree(block_rsv); -			} -		} else { -			
btrfs_block_rsv_release(root, block_rsv, 0); -		} -	} -	mutex_unlock(&fs_info->durable_block_rsv_mutex); -  	return 0;  } @@ -4669,7 +4873,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct extent_buffer *buf,  			   u64 parent, int last_ref)  { -	struct btrfs_block_rsv *block_rsv;  	struct btrfs_block_group_cache *cache = NULL;  	int ret; @@ -4684,64 +4887,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  	if (!last_ref)  		return; -	block_rsv = get_block_rsv(trans, root);  	cache = btrfs_lookup_block_group(root->fs_info, buf->start); -	if (block_rsv->space_info != cache->space_info) -		goto out;  	if (btrfs_header_generation(buf) == trans->transid) {  		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {  			ret = check_ref_cleanup(trans, root, buf->start);  			if (!ret) -				goto pin; +				goto out;  		}  		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {  			pin_down_extent(root, cache, buf->start, buf->len, 1); -			goto pin; +			goto out;  		}  		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));  		btrfs_add_free_space(cache, buf->start, buf->len); -		ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); -		if (ret == -EAGAIN) { -			/* block group became read-only */ -			btrfs_update_reserved_bytes(cache, buf->len, 0, 1); -			goto out; -		} - -		ret = 1; -		spin_lock(&block_rsv->lock); -		if (block_rsv->reserved < block_rsv->size) { -			block_rsv->reserved += buf->len; -			ret = 0; -		} -		spin_unlock(&block_rsv->lock); - -		if (ret) { -			spin_lock(&cache->space_info->lock); -			cache->space_info->bytes_reserved -= buf->len; -			cache->space_info->reservation_progress++; -			spin_unlock(&cache->space_info->lock); -		} -		goto out; -	} -pin: -	if (block_rsv->durable && !cache->ro) { -		ret = 0; -		spin_lock(&cache->lock); -		if (!cache->ro) { -			cache->reserved_pinned += buf->len; -			ret = 1; -		} -		spin_unlock(&cache->lock); - -		if (ret) { -			spin_lock(&block_rsv->lock); -			block_rsv->freed[trans->transid & 0x1] += buf->len; -			spin_unlock(&block_rsv->lock); -		} +		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);  	}  out:  	/* @@ -4884,10 +5047,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	int last_ptr_loop = 0;  	int loop = 0;  	int index = 0; +	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
+		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;  	bool found_uncached_bg = false;  	bool failed_cluster_refill = false;  	bool failed_alloc = false;  	bool use_cluster = true; +	bool have_caching_bg = false;  	u64 ideal_cache_percent = 0;  	u64 ideal_cache_offset = 0; @@ -4970,6 +5136,7 @@ ideal_cache:  		}  	}  search: +	have_caching_bg = false;  	down_read(&space_info->groups_sem);  	list_for_each_entry(block_group, &space_info->block_groups[index],  			    list) { @@ -5178,6 +5345,8 @@ refill_cluster:  			failed_alloc = true;  			goto have_block_group;  		} else if (!offset) { +			if (!cached) +				have_caching_bg = true;  			goto loop;  		}  checks: @@ -5203,8 +5372,8 @@ checks:  					     search_start - offset);  		BUG_ON(offset > search_start); -		ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, -					    (data & BTRFS_BLOCK_GROUP_DATA)); +		ret = btrfs_update_reserved_bytes(block_group, num_bytes, +						  alloc_type);  		if (ret == -EAGAIN) {  			btrfs_add_free_space(block_group, offset, num_bytes);  			goto loop; @@ -5228,6 +5397,9 @@ loop:  	}  	up_read(&space_info->groups_sem); +	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) +		goto search; +  	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)  		goto search; @@ -5326,7 +5498,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  	int index = 0;  	spin_lock(&info->lock); -	printk(KERN_INFO "space_info has %llu free, is %sfull\n", +	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", +	       (unsigned long long)info->flags,  	       (unsigned long long)(info->total_bytes - info->bytes_used -  				    info->bytes_pinned - info->bytes_reserved -  				    info->bytes_readonly), @@ -5412,7 +5585,8 @@ again:  	return ret;  } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, +					u64 start, u64 len, int pin)  {  	struct btrfs_block_group_cache *cache;  	int ret = 0; @@ -5427,8 +5601,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)  	if (btrfs_test_opt(root, DISCARD))  		ret = btrfs_discard_extent(root, start, len, NULL); -	btrfs_add_free_space(cache, start, len); -	btrfs_update_reserved_bytes(cache, len, 0, 1); +	if (pin) +		pin_down_extent(root, cache, start, len, 1); +	else { +		btrfs_add_free_space(cache, start, len); +		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); +	}  	btrfs_put_block_group(cache);  	trace_btrfs_reserved_extent_free(root, start, len); @@ -5436,6 +5614,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)  	return ret;  } +int btrfs_free_reserved_extent(struct btrfs_root *root, +					u64 start, u64 len) +{ +	return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, +				       u64 start, u64 len) +{ +	return __btrfs_free_reserved_extent(root, start, len, 1); +} +  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  				      struct btrfs_root *root,  				      u64 parent, u64 root_objectid, @@ -5631,7 +5821,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  		put_caching_control(caching_ctl);  	} -	ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); +	ret = btrfs_update_reserved_bytes(block_group, ins->offset, +					  RESERVE_ALLOC_NO_ACCOUNT);  	BUG_ON(ret);  	btrfs_put_block_group(block_group);  	ret = alloc_reserved_file_extent(trans, root, 0, 
root_objectid, @@ -5688,8 +5879,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	block_rsv = get_block_rsv(trans, root);  	if (block_rsv->size == 0) { -		ret = reserve_metadata_bytes(trans, root, block_rsv, -					     blocksize, 0); +		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);  		/*  		 * If we couldn't reserve metadata bytes try and use some from  		 * the global reserve. @@ -5709,13 +5899,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	if (!ret)  		return block_rsv;  	if (ret) { -		WARN_ON(1); -		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, -					     0); +		static DEFINE_RATELIMIT_STATE(_rs, +				DEFAULT_RATELIMIT_INTERVAL, +				/*DEFAULT_RATELIMIT_BURST*/ 2); +		if (__ratelimit(&_rs)) { +			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); +			WARN_ON(1); +		} +		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);  		if (!ret) { -			spin_lock(&block_rsv->lock); -			block_rsv->size += blocksize; -			spin_unlock(&block_rsv->lock);  			return block_rsv;  		} else if (ret && block_rsv != global_rsv) {  			ret = block_rsv_use_bytes(global_rsv, blocksize); @@ -6593,12 +6785,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)  		    cache->bytes_super - btrfs_block_group_used(&cache->item);  	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + -	    sinfo->bytes_may_use + sinfo->bytes_readonly + -	    cache->reserved_pinned + num_bytes + min_allocable_bytes <= -	    sinfo->total_bytes) { +	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + +	    min_allocable_bytes <= sinfo->total_bytes) {  		sinfo->bytes_readonly += num_bytes; -		sinfo->bytes_reserved += cache->reserved_pinned; -		cache->reserved_pinned = 0;  		cache->ro = 1;  		ret = 0;  	} @@ -6965,7 +7154,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  					struct btrfs_space_info,  					list);  		if (space_info->bytes_pinned > 0 || -		    space_info->bytes_reserved > 0) { +		    space_info->bytes_reserved > 0 || +		    space_info->bytes_may_use > 0) {  			WARN_ON(1);  			dump_space_info(space_info, 0, 0);  		} @@ -7007,14 +7197,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		return -ENOMEM;  	path->reada = 1; -	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); -	if (cache_gen != 0 && -	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) +	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); +	if (btrfs_test_opt(root, SPACE_CACHE) && +	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)  		need_clear = 1;  	if (btrfs_test_opt(root, CLEAR_CACHE))  		need_clear = 1; -	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) -		printk(KERN_INFO "btrfs: disk space caching is enabled\n");  	while (1) {  		ret = find_first_block_group(root, path, &key); @@ -7253,7 +7441,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  		goto out;  	} -	inode = lookup_free_space_inode(root, block_group, path); +	inode = lookup_free_space_inode(tree_root, block_group, path);  	if (!IS_ERR(inode)) {  		ret = btrfs_orphan_add(trans, inode);  		BUG_ON(ret); @@ -7269,7 +7457,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  			spin_unlock(&block_group->lock);  		}  		/* One for our lookup ref */ -		iput(inode); +		btrfs_add_delayed_iput(inode);  	}  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -7340,7 +7528,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)  	int mixed = 0;  	int ret; -	
disk_super = &fs_info->super_copy; +	disk_super = fs_info->super_copy;  	if (!btrfs_super_root(disk_super))  		return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f..1f87c4d0e7a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@  #include "compat.h"  #include "ctree.h"  #include "btrfs_inode.h" +#include "volumes.h"  static struct kmem_cache *extent_state_cache;  static struct kmem_cache *extent_buffer_cache; @@ -894,6 +895,194 @@ search_again:  	goto again;  } +/** + * convert_extent - convert all bits in a given range from one bit to another + * @tree:	the io tree to search + * @start:	the start offset in bytes + * @end:	the end offset in bytes (inclusive) + * @bits:	the bits to set in this range + * @clear_bits:	the bits to clear in this range + * @mask:	the allocation mask + * + * This will go through and set bits for the given range.  If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits.  This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY.  This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +		       int bits, int clear_bits, gfp_t mask) +{ +	struct extent_state *state; +	struct extent_state *prealloc = NULL; +	struct rb_node *node; +	int err = 0; +	u64 last_start; +	u64 last_end; + +again: +	if (!prealloc && (mask & __GFP_WAIT)) { +		prealloc = alloc_extent_state(mask); +		if (!prealloc) +			return -ENOMEM; +	} + +	spin_lock(&tree->lock); +	/* +	 * this search will find all the extents that end after +	 * our range starts. +	 */ +	node = tree_search(tree, start); +	if (!node) { +		prealloc = alloc_extent_state_atomic(prealloc); +		if (!prealloc) +			return -ENOMEM; +		err = insert_state(tree, prealloc, start, end, &bits); +		prealloc = NULL; +		BUG_ON(err == -EEXIST); +		goto out; +	} +	state = rb_entry(node, struct extent_state, rb_node); +hit_next: +	last_start = state->start; +	last_end = state->end; + +	/* +	 * | ---- desired range ---- | +	 * | state | +	 * +	 * Just lock what we found and keep going +	 */ +	if (state->start == start && state->end <= end) { +		struct rb_node *next_node; + +		set_state_bits(tree, state, &bits); +		clear_state_bit(tree, state, &clear_bits, 0); + +		merge_state(tree, state); +		if (last_end == (u64)-1) +			goto out; + +		start = last_end + 1; +		next_node = rb_next(&state->rb_node); +		if (next_node && start < end && prealloc && !need_resched()) { +			state = rb_entry(next_node, struct extent_state, +					 rb_node); +			if (state->start == start) +				goto hit_next; +		} +		goto search_again; +	} + +	/* +	 *     | ---- desired range ---- | +	 * | state | +	 *   or +	 * | ------------- state -------------- | +	 * +	 * We need to split the extent we found, and may flip bits on +	 * second half. +	 * +	 * If the extent we found extends past our +	 * range, we just split and search again.  It'll get split +	 * again the next time though. +	 * +	 * If the extent we found is inside our range, we set the +	 * desired bit on it. 
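[Editor's note] The convert_extent_bit() helper this hunk adds is easiest to read from a caller's point of view. Below is a minimal, hypothetical caller (not part of the patch) that flips a delalloc range to dirty in a single tree walk, exactly the DELALLOC-to-DIRTY case the function comment names; it assumes only the declaration this series adds to extent_io.h.

/*
 * Illustrative only: convert [start, end] from delalloc to dirty in one
 * pass.  EXTENT_DIRTY is set on every state in the range and
 * EXTENT_DELALLOC is cleared from it; boundary bits such as EXTENT_LOCKED
 * must not be used here, as the function comment explains.
 */
static int example_delalloc_to_dirty(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	return convert_extent_bit(tree, start, end,
				  EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
}
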
+	 */ +	if (state->start < start) { +		prealloc = alloc_extent_state_atomic(prealloc); +		if (!prealloc) +			return -ENOMEM; +		err = split_state(tree, state, prealloc, start); +		BUG_ON(err == -EEXIST); +		prealloc = NULL; +		if (err) +			goto out; +		if (state->end <= end) { +			set_state_bits(tree, state, &bits); +			clear_state_bit(tree, state, &clear_bits, 0); +			merge_state(tree, state); +			if (last_end == (u64)-1) +				goto out; +			start = last_end + 1; +		} +		goto search_again; +	} +	/* +	 * | ---- desired range ---- | +	 *     | state | or               | state | +	 * +	 * There's a hole, we need to insert something in it and +	 * ignore the extent we found. +	 */ +	if (state->start > start) { +		u64 this_end; +		if (end < last_start) +			this_end = end; +		else +			this_end = last_start - 1; + +		prealloc = alloc_extent_state_atomic(prealloc); +		if (!prealloc) +			return -ENOMEM; + +		/* +		 * Avoid to free 'prealloc' if it can be merged with +		 * the later extent. +		 */ +		err = insert_state(tree, prealloc, start, this_end, +				   &bits); +		BUG_ON(err == -EEXIST); +		if (err) { +			free_extent_state(prealloc); +			prealloc = NULL; +			goto out; +		} +		prealloc = NULL; +		start = this_end + 1; +		goto search_again; +	} +	/* +	 * | ---- desired range ---- | +	 *                        | state | +	 * We need to split the extent, and set the bit +	 * on the first half +	 */ +	if (state->start <= end && state->end > end) { +		prealloc = alloc_extent_state_atomic(prealloc); +		if (!prealloc) +			return -ENOMEM; + +		err = split_state(tree, state, prealloc, end + 1); +		BUG_ON(err == -EEXIST); + +		set_state_bits(tree, prealloc, &bits); +		clear_state_bit(tree, prealloc, &clear_bits, 0); + +		merge_state(tree, prealloc); +		prealloc = NULL; +		goto out; +	} + +	goto search_again; + +out: +	spin_unlock(&tree->lock); +	if (prealloc) +		free_extent_state(prealloc); + +	return err; + +search_again: +	if (start > end) +		goto out; +	spin_unlock(&tree->lock); +	if (mask & __GFP_WAIT) +		cond_resched(); +	goto again; +} +  /* wrappers around set/clear extent bit */  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		     gfp_t mask) @@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,  			struct extent_state **cached_state, gfp_t mask)  {  	return set_extent_bit(tree, start, end, -			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, +			      EXTENT_DELALLOC | EXTENT_UPTODATE,  			      0, NULL, cached_state, mask);  } @@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,  	return 0;  } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data.  This + * io_failure_record is used to record state as we go through all the + * mirrors.  If another mirror has good data, the page is set up to date + * and things continue.  If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
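[Editor's note] The retry policy that the io_failure_record machinery described above (and implemented below) follows — mirrors are numbered from 1, the mirror that produced the bad read is skipped, and the search stops once every copy has been tried — can be modelled in a few lines of plain C. This is only a sketch of the policy with made-up names, not kernel code.

#include <stdio.h>

/*
 * Sketch of the mirror-retry policy used by the read-repair code:
 * returns the next mirror to read from, or 0 when every copy has been
 * tried.  The mirror that originally failed is never retried.
 */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	if (this_mirror > num_copies)
		return 0;
	return this_mirror;
}

int main(void)
{
	int m = 0;			/* nothing tried yet */
	int failed = 2, copies = 3;	/* e.g. three copies, mirror 2 was bad */

	while ((m = next_mirror(m, failed, copies)))
		printf("retry read from mirror %d\n", m);	/* prints 1, then 3 */
	return 0;
}
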
+ */ +struct io_failure_record { +	struct page *page; +	u64 start; +	u64 len; +	u64 logical; +	unsigned long bio_flags; +	int this_mirror; +	int failed_mirror; +	int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, +				int did_repair) +{ +	int ret; +	int err = 0; +	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + +	set_state_private(failure_tree, rec->start, 0); +	ret = clear_extent_bits(failure_tree, rec->start, +				rec->start + rec->len - 1, +				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); +	if (ret) +		err = ret; + +	if (did_repair) { +		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, +					rec->start + rec->len - 1, +					EXTENT_DAMAGED, GFP_NOFS); +		if (ret && !err) +			err = ret; +	} + +	kfree(rec); +	return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ +	complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchonization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +			u64 length, u64 logical, struct page *page, +			int mirror_num) +{ +	struct bio *bio; +	struct btrfs_device *dev; +	DECLARE_COMPLETION_ONSTACK(compl); +	u64 map_length = 0; +	u64 sector; +	struct btrfs_bio *bbio = NULL; +	int ret; + +	BUG_ON(!mirror_num); + +	bio = bio_alloc(GFP_NOFS, 1); +	if (!bio) +		return -EIO; +	bio->bi_private = &compl; +	bio->bi_end_io = repair_io_failure_callback; +	bio->bi_size = 0; +	map_length = length; + +	ret = btrfs_map_block(map_tree, WRITE, logical, +			      &map_length, &bbio, mirror_num); +	if (ret) { +		bio_put(bio); +		return -EIO; +	} +	BUG_ON(mirror_num != bbio->mirror_num); +	sector = bbio->stripes[mirror_num-1].physical >> 9; +	bio->bi_sector = sector; +	dev = bbio->stripes[mirror_num-1].dev; +	kfree(bbio); +	if (!dev || !dev->bdev || !dev->writeable) { +		bio_put(bio); +		return -EIO; +	} +	bio->bi_bdev = dev->bdev; +	bio_add_page(bio, page, length, start-page_offset(page)); +	submit_bio(WRITE_SYNC, bio); +	wait_for_completion(&compl); + +	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { +		/* try to remap that extent elsewhere? 
*/ +		bio_put(bio); +		return -EIO; +	} + +	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " +			"sector %llu)\n", page->mapping->host->i_ino, start, +			dev->name, sector); + +	bio_put(bio); +	return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ +	u64 private; +	u64 private_failure; +	struct io_failure_record *failrec; +	struct btrfs_mapping_tree *map_tree; +	struct extent_state *state; +	int num_copies; +	int did_repair = 0; +	int ret; +	struct inode *inode = page->mapping->host; + +	private = 0; +	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, +				(u64)-1, 1, EXTENT_DIRTY, 0); +	if (!ret) +		return 0; + +	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, +				&private_failure); +	if (ret) +		return 0; + +	failrec = (struct io_failure_record *)(unsigned long) private_failure; +	BUG_ON(!failrec->this_mirror); + +	if (failrec->in_validation) { +		/* there was no real error, just free the record */ +		pr_debug("clean_io_failure: freeing dummy error at %llu\n", +			 failrec->start); +		did_repair = 1; +		goto out; +	} + +	spin_lock(&BTRFS_I(inode)->io_tree.lock); +	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, +					    failrec->start, +					    EXTENT_LOCKED); +	spin_unlock(&BTRFS_I(inode)->io_tree.lock); + +	if (state && state->start == failrec->start) { +		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; +		num_copies = btrfs_num_copies(map_tree, failrec->logical, +						failrec->len); +		if (num_copies > 1)  { +			ret = repair_io_failure(map_tree, start, failrec->len, +						failrec->logical, page, +						failrec->failed_mirror); +			did_repair = !ret; +		} +	} + +out: +	if (!ret) +		ret = free_io_failure(inode, failrec, did_repair); + +	return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. 
does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, +				u64 start, u64 end, int failed_mirror, +				struct extent_state *state) +{ +	struct io_failure_record *failrec = NULL; +	u64 private; +	struct extent_map *em; +	struct inode *inode = page->mapping->host; +	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; +	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; +	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct bio *bio; +	int num_copies; +	int ret; +	int read_mode; +	u64 logical; + +	BUG_ON(failed_bio->bi_rw & REQ_WRITE); + +	ret = get_state_private(failure_tree, start, &private); +	if (ret) { +		failrec = kzalloc(sizeof(*failrec), GFP_NOFS); +		if (!failrec) +			return -ENOMEM; +		failrec->start = start; +		failrec->len = end - start + 1; +		failrec->this_mirror = 0; +		failrec->bio_flags = 0; +		failrec->in_validation = 0; + +		read_lock(&em_tree->lock); +		em = lookup_extent_mapping(em_tree, start, failrec->len); +		if (!em) { +			read_unlock(&em_tree->lock); +			kfree(failrec); +			return -EIO; +		} + +		if (em->start > start || em->start + em->len < start) { +			free_extent_map(em); +			em = NULL; +		} +		read_unlock(&em_tree->lock); + +		if (!em || IS_ERR(em)) { +			kfree(failrec); +			return -EIO; +		} +		logical = start - em->start; +		logical = em->block_start + logical; +		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { +			logical = em->block_start; +			failrec->bio_flags = EXTENT_BIO_COMPRESSED; +			extent_set_compress_type(&failrec->bio_flags, +						 em->compress_type); +		} +		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " +			 "len=%llu\n", logical, start, failrec->len); +		failrec->logical = logical; +		free_extent_map(em); + +		/* set the bits in the private failure tree */ +		ret = set_extent_bits(failure_tree, start, end, +					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); +		if (ret >= 0) +			ret = set_state_private(failure_tree, start, +						(u64)(unsigned long)failrec); +		/* set the bits in the inode's tree */ +		if (ret >= 0) +			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, +						GFP_NOFS); +		if (ret < 0) { +			kfree(failrec); +			return ret; +		} +	} else { +		failrec = (struct io_failure_record *)(unsigned long)private; +		pr_debug("bio_readpage_error: (found) logical=%llu, " +			 "start=%llu, len=%llu, validation=%d\n", +			 failrec->logical, failrec->start, failrec->len, +			 failrec->in_validation); +		/* +		 * when data can be on disk more than twice, add to failrec here +		 * (e.g. with a list for failed_mirror) to make +		 * clean_io_failure() clean all those errors at once. +		 */ +	} +	num_copies = btrfs_num_copies( +			      &BTRFS_I(inode)->root->fs_info->mapping_tree, +			      failrec->logical, failrec->len); +	if (num_copies == 1) { +		/* +		 * we only have a single copy of the data, so don't bother with +		 * all the retry and error correction code that follows. no +		 * matter what the error is, it is very likely to persist. +		 */ +		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" +			 "state=%p, num_copies=%d, next_mirror %d, " +			 "failed_mirror %d\n", state, num_copies, +			 failrec->this_mirror, failed_mirror); +		free_io_failure(inode, failrec, 0); +		return -EIO; +	} + +	if (!state) { +		spin_lock(&tree->lock); +		state = find_first_extent_bit_state(tree, failrec->start, +						    EXTENT_LOCKED); +		if (state && state->start != failrec->start) +			state = NULL; +		spin_unlock(&tree->lock); +	} + +	/* +	 * there are two premises: +	 *	a) deliver good data to the caller +	 *	b) correct the bad sectors on disk +	 */ +	if (failed_bio->bi_vcnt > 1) { +		/* +		 * to fulfill b), we need to know the exact failing sectors, as +		 * we don't want to rewrite any more than the failed ones. thus, +		 * we need separate read requests for the failed bio +		 * +		 * if the following BUG_ON triggers, our validation request got +		 * merged. we need separate requests for our algorithm to work. +		 */ +		BUG_ON(failrec->in_validation); +		failrec->in_validation = 1; +		failrec->this_mirror = failed_mirror; +		read_mode = READ_SYNC | REQ_FAILFAST_DEV; +	} else { +		/* +		 * we're ready to fulfill a) and b) alongside. get a good copy +		 * of the failed sector and if we succeed, we have setup +		 * everything for repair_io_failure to do the rest for us. +		 */ +		if (failrec->in_validation) { +			BUG_ON(failrec->this_mirror != failed_mirror); +			failrec->in_validation = 0; +			failrec->this_mirror = 0; +		} +		failrec->failed_mirror = failed_mirror; +		failrec->this_mirror++; +		if (failrec->this_mirror == failed_mirror) +			failrec->this_mirror++; +		read_mode = READ_SYNC; +	} + +	if (!state || failrec->this_mirror > num_copies) { +		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " +			 "next_mirror %d, failed_mirror %d\n", state, +			 num_copies, failrec->this_mirror, failed_mirror); +		free_io_failure(inode, failrec, 0); +		return -EIO; +	} + +	bio = bio_alloc(GFP_NOFS, 1); +	bio->bi_private = state; +	bio->bi_end_io = failed_bio->bi_end_io; +	bio->bi_sector = failrec->logical >> 9; +	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; +	bio->bi_size = 0; + +	bio_add_page(bio, page, failrec->len, start - page_offset(page)); + +	pr_debug("bio_readpage_error: submitting new read[%#x] to " +		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, +		 failrec->this_mirror, num_copies, failrec->in_validation); + +	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, +					failrec->bio_flags, 0); +	return 0; +} +  /* lots and lots of room for performance fixes in the end_bio funcs */  /* @@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  		struct extent_state *cached = NULL;  		struct extent_state *state; +		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " +			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, +			 (long int)bio->bi_bdev);  		tree = &BTRFS_I(page->mapping->host)->io_tree;  		start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  							      state);  			if (ret)  				uptodate = 0; +			else +				clean_io_failure(start, page);  		} -		if (!uptodate && tree->ops && -		    tree->ops->readpage_io_failed_hook) { -			ret = tree->ops->readpage_io_failed_hook(bio, page, -							 start, end, NULL); +		if (!uptodate) { +			u64 failed_mirror; +			failed_mirror = (u64)bio->bi_bdev; +			if (tree->ops && tree->ops->readpage_io_failed_hook) +				ret = tree->ops->readpage_io_failed_hook( +		
				bio, page, start, end, +						failed_mirror, state); +			else +				ret = bio_readpage_error(bio, page, start, end, +							 failed_mirror, NULL);  			if (ret == 0) {  				uptodate =  					test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,  					   mirror_num, bio_flags, start);  	else  		submit_bio(rw, bio); +  	if (bio_flagged(bio, BIO_EOPNOTSUPP))  		ret = -EOPNOTSUPP;  	bio_put(bio); @@ -2076,16 +2639,16 @@ out:  }  int extent_read_full_page(struct extent_io_tree *tree, struct page *page, -			    get_extent_t *get_extent) +			    get_extent_t *get_extent, int mirror_num)  {  	struct bio *bio = NULL;  	unsigned long bio_flags = 0;  	int ret; -	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, +	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,  				      &bio_flags);  	if (bio) -		ret = submit_one_bio(READ, bio, 0, bio_flags); +		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);  	return ret;  } @@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	int compressed;  	int write_flags;  	unsigned long nr_written = 0; +	bool fill_delalloc = true;  	if (wbc->sync_mode == WB_SYNC_ALL)  		write_flags = WRITE_SYNC; @@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	trace___extent_writepage(page, inode, wbc);  	WARN_ON(!PageLocked(page)); + +	ClearPageError(page); +  	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);  	if (page->index > end_index ||  	   (page->index == end_index && !pg_offset)) { @@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	set_page_extent_mapped(page); +	if (!tree->ops || !tree->ops->fill_delalloc) +		fill_delalloc = false; +  	delalloc_start = start;  	delalloc_end = 0;  	page_started = 0; -	if (!epd->extent_locked) { +	if (!epd->extent_locked && fill_delalloc) {  		u64 delalloc_to_write = 0;  		/*  		 * make sure the wbc mapping index is at least updated @@ -2421,10 +2991,16 @@ retry:  			 * swizzled back from swapper_space to tmpfs file  			 * mapping  			 */ -			if (tree->ops && tree->ops->write_cache_pages_lock_hook) -				tree->ops->write_cache_pages_lock_hook(page); -			else -				lock_page(page); +			if (tree->ops && +			    tree->ops->write_cache_pages_lock_hook) { +				tree->ops->write_cache_pages_lock_hook(page, +							       data, flush_fn); +			} else { +				if (!trylock_page(page)) { +					flush_fn(data); +					lock_page(page); +				} +			}  			if (unlikely(page->mapping != mapping)) {  				unlock_page(page); @@ -2926,7 +3502,7 @@ out:  	return ret;  } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb,  					      unsigned long i)  {  	struct page *p; @@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,  	return p;  } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len)  {  	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -  		(start >> PAGE_CACHE_SHIFT); @@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,  						PAGECACHE_TAG_DIRTY);  		}  		spin_unlock_irq(&page->mapping->tree_lock); +		ClearPageError(page);  		unlock_page(page);  	}  	return 0; @@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,  }  int 
read_extent_buffer_pages(struct extent_io_tree *tree, -			     struct extent_buffer *eb, -			     u64 start, int wait, +			     struct extent_buffer *eb, u64 start, int wait,  			     get_extent_t *get_extent, int mirror_num)  {  	unsigned long i; @@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	num_pages = num_extent_pages(eb->start, eb->len);  	for (i = start_i; i < num_pages; i++) {  		page = extent_buffer_page(eb, i); -		if (!wait) { +		if (wait == WAIT_NONE) {  			if (!trylock_page(page))  				goto unlock_exit;  		} else { @@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	if (bio)  		submit_one_bio(READ, bio, mirror_num, bio_flags); -	if (ret || !wait) +	if (ret || wait != WAIT_COMPLETE)  		return ret;  	for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e792..feb9be0e23b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,8 @@  #define EXTENT_NODATASUM (1 << 10)  #define EXTENT_DO_ACCOUNTING (1 << 11)  #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) +#define EXTENT_DAMAGED (1 << 14)  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)  #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -32,6 +34,7 @@  #define EXTENT_BUFFER_BLOCKING 1  #define EXTENT_BUFFER_DIRTY 2  #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */  /* these are flags for extent_clear_unlock_delalloc */  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -67,7 +70,7 @@ struct extent_io_ops {  			      unsigned long bio_flags);  	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);  	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, -				       u64 start, u64 end, +				       u64 start, u64 end, u64 failed_mirror,  				       struct extent_state *state);  	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,  					u64 start, u64 end, @@ -85,7 +88,8 @@ struct extent_io_ops {  				  struct extent_state *other);  	void (*split_extent_hook)(struct inode *inode,  				  struct extent_state *orig, u64 split); -	int (*write_cache_pages_lock_hook)(struct page *page); +	int (*write_cache_pages_lock_hook)(struct page *page, void *data, +					   void (*flush_fn)(void *));  };  struct extent_io_tree { @@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,  		    gfp_t mask);  int extent_read_full_page(struct extent_io_tree *tree, struct page *page, -			  get_extent_t *get_extent); +			  get_extent_t *get_extent, int mirror_num);  int __init extent_io_init(void);  void extent_io_exit(void); @@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		     gfp_t mask);  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		       gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +		       int bits, int clear_bits, gfp_t mask);  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,  			struct extent_state **cached_state, gfp_t mask);  int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,  					 u64 start, unsigned long len);  void free_extent_buffer(struct 
extent_buffer *eb); +#define WAIT_NONE	0 +#define WAIT_COMPLETE	1 +#define WAIT_PAGE_LOCK	2  int read_extent_buffer_pages(struct extent_io_tree *tree,  			     struct extent_buffer *eb, u64 start, int wait,  			     get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);  static inline void extent_buffer_get(struct extent_buffer *eb)  { @@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,  struct bio *  btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,  		gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +			u64 length, u64 logical, struct page *page, +			int mirror_num);  #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821bec..c7fb3a4247d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,  	struct btrfs_csum_item *item;  	struct extent_buffer *leaf;  	u64 csum_offset = 0; -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	int csums_in_item;  	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,  	u64 item_last_offset = 0;  	u64 disk_bytenr;  	u32 diff; -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	int ret;  	struct btrfs_path *path;  	struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,  	int ret;  	size_t size;  	u64 csum_end; -	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	path = btrfs_alloc_path();  	if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,  				      u64 bytenr, u64 len)  {  	struct extent_buffer *leaf; -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	u64 csum_end;  	u64 end_byte = bytenr + len;  	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,  	u64 csum_end;  	struct extent_buffer *leaf;  	int ret; -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	int blocksize_bits = root->fs_info->sb->s_blocksize_bits;  	root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,  	struct btrfs_sector_sum *sector_sum;  	u32 nritems;  	u32 ins_size; -	u16 csum_size = -		btrfs_super_csum_size(&root->fs_info->super_copy); +	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);  	path = btrfs_alloc_path();  	if (!path) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1266f6e9cdb..dafdfa059bf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,  	int i;  	unsigned long index = pos >> PAGE_CACHE_SHIFT;  	struct inode *inode = fdentry(file)->d_inode; +	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	int err = 0;  	int faili = 
0;  	u64 start_pos; @@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,  again:  	for (i = 0; i < num_pages; i++) {  		pages[i] = find_or_create_page(inode->i_mapping, index + i, -					       GFP_NOFS); +					       mask);  		if (!pages[i]) {  			faili = i - 1;  			err = -ENOMEM; @@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,  			goto out;  	} -	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); -	if (ret) -		goto out; -  	locked_end = alloc_end - 1;  	while (1) {  		struct btrfs_ordered_extent *ordered; @@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,  		if (em->block_start == EXTENT_MAP_HOLE ||  		    (cur_offset >= inode->i_size &&  		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + +			/* +			 * Make sure we have enough space before we do the +			 * allocation. +			 */ +			ret = btrfs_check_data_free_space(inode, last_byte - +							  cur_offset); +			if (ret) { +				free_extent_map(em); +				break; +			} +  			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,  							last_byte - cur_offset,  							1 << inode->i_blkbits,  							offset + len,  							&alloc_hint); + +			/* Let go of our reservation. */ +			btrfs_free_reserved_data_space(inode, last_byte - +						       cur_offset);  			if (ret < 0) {  				free_extent_map(em);  				break; @@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,  	}  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,  			     &cached_state, GFP_NOFS); - -	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);  out:  	mutex_unlock(&inode->i_mutex);  	return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d..7a15fcfb3e1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@  #include <linux/sched.h>  #include <linux/slab.h>  #include <linux/math64.h> +#include <linux/ratelimit.h>  #include "ctree.h"  #include "free-space-cache.h"  #include "transaction.h" @@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  				      *block_group, struct btrfs_path *path)  {  	struct inode *inode = NULL; +	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;  	spin_lock(&block_group->lock);  	if (block_group->inode) @@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  		return inode;  	spin_lock(&block_group->lock); -	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { +	if (!((BTRFS_I(inode)->flags & flags) == flags)) {  		printk(KERN_INFO "Old style space inode found, converting.\n"); -		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; +		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | +			BTRFS_INODE_NODATACOW;  		block_group->disk_cache_state = BTRFS_DC_CLEAR;  	} -	if (!btrfs_fs_closing(root->fs_info)) { +	if (!block_group->iref) {  		block_group->inode = igrab(inode);  		block_group->iref = 1;  	} @@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,  	struct btrfs_free_space_header *header;  	struct btrfs_inode_item *inode_item;  	struct extent_buffer *leaf; +	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;  	int ret;  	ret = btrfs_insert_empty_inode(trans, root, path, ino);  	if (ret)  		return ret; +	/* We inline crc's for the free disk space cache */ +	if (ino != BTRFS_FREE_INO_OBJECTID) +		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; +  	leaf = path->nodes[0];  	inode_item = 
btrfs_item_ptr(leaf, path->slots[0],  				    struct btrfs_inode_item); @@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,  	btrfs_set_inode_uid(leaf, inode_item, 0);  	btrfs_set_inode_gid(leaf, inode_item, 0);  	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); -	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | -			      BTRFS_INODE_PREALLOC); +	btrfs_set_inode_flags(leaf, inode_item, flags);  	btrfs_set_inode_nlink(leaf, inode_item, 1);  	btrfs_set_inode_transid(leaf, inode_item, trans->transid);  	btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,  				    struct inode *inode)  {  	struct btrfs_block_rsv *rsv; +	u64 needed_bytes;  	loff_t oldsize;  	int ret = 0;  	rsv = trans->block_rsv; -	trans->block_rsv = root->orphan_block_rsv; -	ret = btrfs_block_rsv_check(trans, root, -				    root->orphan_block_rsv, -				    0, 5); -	if (ret) -		return ret; +	trans->block_rsv = &root->fs_info->global_block_rsv; + +	/* 1 for slack space, 1 for updating the inode */ +	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + +		btrfs_calc_trans_metadata_size(root, 1); + +	spin_lock(&trans->block_rsv->lock); +	if (trans->block_rsv->reserved < needed_bytes) { +		spin_unlock(&trans->block_rsv->lock); +		trans->block_rsv = rsv; +		return -ENOSPC; +	} +	spin_unlock(&trans->block_rsv->lock);  	oldsize = i_size_read(inode);  	btrfs_i_size_write(inode, 0); @@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,  	ret = btrfs_truncate_inode_items(trans, root, inode,  					 0, BTRFS_EXTENT_DATA_KEY); -	trans->block_rsv = rsv;  	if (ret) { +		trans->block_rsv = rsv;  		WARN_ON(1);  		return ret;  	}  	ret = btrfs_update_inode(trans, root, inode); +	trans->block_rsv = rsv; +  	return ret;  } @@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)  	return 0;  } +struct io_ctl { +	void *cur, *orig; +	struct page *page; +	struct page **pages; +	struct btrfs_root *root; +	unsigned long size; +	int index; +	int num_pages; +	unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +		       struct btrfs_root *root) +{ +	memset(io_ctl, 0, sizeof(struct io_ctl)); +	io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> +		PAGE_CACHE_SHIFT; +	io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, +				GFP_NOFS); +	if (!io_ctl->pages) +		return -ENOMEM; +	io_ctl->root = root; +	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) +		io_ctl->check_crcs = 1; +	return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ +	kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ +	if (io_ctl->cur) { +		kunmap(io_ctl->page); +		io_ctl->cur = NULL; +		io_ctl->orig = NULL; +	} +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ +	WARN_ON(io_ctl->cur); +	BUG_ON(io_ctl->index >= io_ctl->num_pages); +	io_ctl->page = io_ctl->pages[io_ctl->index++]; +	io_ctl->cur = kmap(io_ctl->page); +	io_ctl->orig = io_ctl->cur; +	io_ctl->size = PAGE_CACHE_SIZE; +	if (clear) +		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ +	int i; + +	io_ctl_unmap_page(io_ctl); + +	for (i = 0; i < io_ctl->num_pages; i++) { +		ClearPageChecked(io_ctl->pages[i]); +		unlock_page(io_ctl->pages[i]); +		page_cache_release(io_ctl->pages[i]); +	} +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +				
int uptodate) +{ +	struct page *page; +	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); +	int i; + +	for (i = 0; i < io_ctl->num_pages; i++) { +		page = find_or_create_page(inode->i_mapping, i, mask); +		if (!page) { +			io_ctl_drop_pages(io_ctl); +			return -ENOMEM; +		} +		io_ctl->pages[i] = page; +		if (uptodate && !PageUptodate(page)) { +			btrfs_readpage(NULL, page); +			lock_page(page); +			if (!PageUptodate(page)) { +				printk(KERN_ERR "btrfs: error reading free " +				       "space cache\n"); +				io_ctl_drop_pages(io_ctl); +				return -EIO; +			} +		} +	} + +	return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ +	u64 *val; + +	io_ctl_map_page(io_ctl, 1); + +	/* +	 * Skip the csum areas.  If we don't check crcs then we just have a +	 * 64bit chunk at the front of the first page. +	 */ +	if (io_ctl->check_crcs) { +		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); +		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); +	} else { +		io_ctl->cur += sizeof(u64); +		io_ctl->size -= sizeof(u64) * 2; +	} + +	val = io_ctl->cur; +	*val = cpu_to_le64(generation); +	io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ +	u64 *gen; + +	/* +	 * Skip the crc area.  If we don't check crcs then we just have a 64bit +	 * chunk at the front of the first page. +	 */ +	if (io_ctl->check_crcs) { +		io_ctl->cur += sizeof(u32) * io_ctl->num_pages; +		io_ctl->size -= sizeof(u64) + +			(sizeof(u32) * io_ctl->num_pages); +	} else { +		io_ctl->cur += sizeof(u64); +		io_ctl->size -= sizeof(u64) * 2; +	} + +	gen = io_ctl->cur; +	if (le64_to_cpu(*gen) != generation) { +		printk_ratelimited(KERN_ERR "btrfs: space cache generation " +				   "(%Lu) does not match inode (%Lu)\n", *gen, +				   generation); +		io_ctl_unmap_page(io_ctl); +		return -EIO; +	} +	io_ctl->cur += sizeof(u64); +	return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ +	u32 *tmp; +	u32 crc = ~(u32)0; +	unsigned offset = 0; + +	if (!io_ctl->check_crcs) { +		io_ctl_unmap_page(io_ctl); +		return; +	} + +	if (index == 0) +		offset = sizeof(u32) * io_ctl->num_pages;; + +	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, +			      PAGE_CACHE_SIZE - offset); +	btrfs_csum_final(crc, (char *)&crc); +	io_ctl_unmap_page(io_ctl); +	tmp = kmap(io_ctl->pages[0]); +	tmp += index; +	*tmp = crc; +	kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ +	u32 *tmp, val; +	u32 crc = ~(u32)0; +	unsigned offset = 0; + +	if (!io_ctl->check_crcs) { +		io_ctl_map_page(io_ctl, 0); +		return 0; +	} + +	if (index == 0) +		offset = sizeof(u32) * io_ctl->num_pages; + +	tmp = kmap(io_ctl->pages[0]); +	tmp += index; +	val = *tmp; +	kunmap(io_ctl->pages[0]); + +	io_ctl_map_page(io_ctl, 0); +	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, +			      PAGE_CACHE_SIZE - offset); +	btrfs_csum_final(crc, (char *)&crc); +	if (val != crc) { +		printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " +				   "space cache\n"); +		io_ctl_unmap_page(io_ctl); +		return -EIO; +	} + +	return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +			    void *bitmap) +{ +	struct btrfs_free_space_entry *entry; + +	if (!io_ctl->cur) +		return -ENOSPC; + +	entry = io_ctl->cur; +	entry->offset = cpu_to_le64(offset); +	entry->bytes = cpu_to_le64(bytes); +	entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : +		BTRFS_FREE_SPACE_EXTENT; +	io_ctl->cur += sizeof(struct btrfs_free_space_entry); +	io_ctl->size -= sizeof(struct btrfs_free_space_entry); + +	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) +		return 0; + +	io_ctl_set_crc(io_ctl, io_ctl->index - 1); + +	/* No more pages to map */ +	if (io_ctl->index >= io_ctl->num_pages) +		return 0; + +	/* map the next page */ +	io_ctl_map_page(io_ctl, 1); +	return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ +	if (!io_ctl->cur) +		return -ENOSPC; + +	/* +	 * If we aren't at the start of the current page, unmap this one and +	 * map the next one if there is any left. +	 */ +	if (io_ctl->cur != io_ctl->orig) { +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +		if (io_ctl->index >= io_ctl->num_pages) +			return -ENOSPC; +		io_ctl_map_page(io_ctl, 0); +	} + +	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); +	io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	if (io_ctl->index < io_ctl->num_pages) +		io_ctl_map_page(io_ctl, 0); +	return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ +	/* +	 * If we're not on the boundary we know we've modified the page and we +	 * need to crc the page. +	 */ +	if (io_ctl->cur != io_ctl->orig) +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	else +		io_ctl_unmap_page(io_ctl); + +	while (io_ctl->index < io_ctl->num_pages) { +		io_ctl_map_page(io_ctl, 1); +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	} +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, +			    struct btrfs_free_space *entry, u8 *type) +{ +	struct btrfs_free_space_entry *e; + +	e = io_ctl->cur; +	entry->offset = le64_to_cpu(e->offset); +	entry->bytes = le64_to_cpu(e->bytes); +	*type = e->type; +	io_ctl->cur += sizeof(struct btrfs_free_space_entry); +	io_ctl->size -= sizeof(struct btrfs_free_space_entry); + +	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) +		return 0; + +	io_ctl_unmap_page(io_ctl); + +	if (io_ctl->index >= io_ctl->num_pages) +		return 0; + +	return io_ctl_check_crc(io_ctl, io_ctl->index); +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +			      struct btrfs_free_space *entry) +{ +	int ret; + +	if (io_ctl->cur && io_ctl->cur != io_ctl->orig) +		io_ctl_unmap_page(io_ctl); + +	ret = io_ctl_check_crc(io_ctl, io_ctl->index); +	if (ret) +		return ret; + +	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); +	io_ctl_unmap_page(io_ctl); + +	return 0; +} +  int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  			    struct btrfs_free_space_ctl *ctl,  			    struct btrfs_path *path, u64 offset)  {  	struct btrfs_free_space_header *header;  	struct extent_buffer *leaf; -	struct page *page; +	struct io_ctl io_ctl;  	struct btrfs_key key; +	struct btrfs_free_space *e, *n;  	struct list_head bitmaps;  	u64 num_entries;  	u64 num_bitmaps;  	u64 generation; -	pgoff_t index = 0; +	u8 type;  	int ret = 0;  	INIT_LIST_HEAD(&bitmaps);  	/* Nothing in the space cache, goodbye */  	if (!i_size_read(inode)) -		goto out; +		return 0;  	key.objectid = BTRFS_FREE_SPACE_OBJECTID;  	key.offset = offset; @@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	if (ret < 0) -		goto out; +		return 0;  	else if (ret > 0) {  		btrfs_release_path(path); -		ret = 0; -		goto out; +		return 0;  	}  	ret = -1; @@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  		       " not match free space cache 
generation (%llu)\n",  		       (unsigned long long)BTRFS_I(inode)->generation,  		       (unsigned long long)generation); -		goto out; +		return 0;  	}  	if (!num_entries) -		goto out; +		return 0; +	io_ctl_init(&io_ctl, inode, root);  	ret = readahead_cache(inode);  	if (ret)  		goto out; -	while (1) { -		struct btrfs_free_space_entry *entry; -		struct btrfs_free_space *e; -		void *addr; -		unsigned long offset = 0; -		int need_loop = 0; +	ret = io_ctl_prepare_pages(&io_ctl, inode, 1); +	if (ret) +		goto out; -		if (!num_entries && !num_bitmaps) -			break; +	ret = io_ctl_check_crc(&io_ctl, 0); +	if (ret) +		goto free_cache; + +	ret = io_ctl_check_generation(&io_ctl, generation); +	if (ret) +		goto free_cache; -		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); -		if (!page) +	while (num_entries) { +		e = kmem_cache_zalloc(btrfs_free_space_cachep, +				      GFP_NOFS); +		if (!e)  			goto free_cache; -		if (!PageUptodate(page)) { -			btrfs_readpage(NULL, page); -			lock_page(page); -			if (!PageUptodate(page)) { -				unlock_page(page); -				page_cache_release(page); -				printk(KERN_ERR "btrfs: error reading free " -				       "space cache\n"); -				goto free_cache; -			} +		ret = io_ctl_read_entry(&io_ctl, e, &type); +		if (ret) { +			kmem_cache_free(btrfs_free_space_cachep, e); +			goto free_cache;  		} -		addr = kmap(page); -		if (index == 0) { -			u64 *gen; - -			/* -			 * We put a bogus crc in the front of the first page in -			 * case old kernels try to mount a fs with the new -			 * format to make sure they discard the cache. -			 */ -			addr += sizeof(u64); -			offset += sizeof(u64); +		if (!e->bytes) { +			kmem_cache_free(btrfs_free_space_cachep, e); +			goto free_cache; +		} -			gen = addr; -			if (*gen != BTRFS_I(inode)->generation) { -				printk(KERN_ERR "btrfs: space cache generation" -				       " (%llu) does not match inode (%llu)\n", -				       (unsigned long long)*gen, -				       (unsigned long long) -				       BTRFS_I(inode)->generation); -				kunmap(page); -				unlock_page(page); -				page_cache_release(page); +		if (type == BTRFS_FREE_SPACE_EXTENT) { +			spin_lock(&ctl->tree_lock); +			ret = link_free_space(ctl, e); +			spin_unlock(&ctl->tree_lock); +			if (ret) { +				printk(KERN_ERR "Duplicate entries in " +				       "free space cache, dumping\n"); +				kmem_cache_free(btrfs_free_space_cachep, e);  				goto free_cache;  			} -			addr += sizeof(u64); -			offset += sizeof(u64); -		} -		entry = addr; - -		while (1) { -			if (!num_entries) -				break; - -			need_loop = 1; -			e = kmem_cache_zalloc(btrfs_free_space_cachep, -					      GFP_NOFS); -			if (!e) { -				kunmap(page); -				unlock_page(page); -				page_cache_release(page); +		} else { +			BUG_ON(!num_bitmaps); +			num_bitmaps--; +			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); +			if (!e->bitmap) { +				kmem_cache_free( +					btrfs_free_space_cachep, e);  				goto free_cache;  			} - -			e->offset = le64_to_cpu(entry->offset); -			e->bytes = le64_to_cpu(entry->bytes); -			if (!e->bytes) { -				kunmap(page); +			spin_lock(&ctl->tree_lock); +			ret = link_free_space(ctl, e); +			ctl->total_bitmaps++; +			ctl->op->recalc_thresholds(ctl); +			spin_unlock(&ctl->tree_lock); +			if (ret) { +				printk(KERN_ERR "Duplicate entries in " +				       "free space cache, dumping\n");  				kmem_cache_free(btrfs_free_space_cachep, e); -				unlock_page(page); -				page_cache_release(page);  				goto free_cache;  			} - -			if (entry->type == BTRFS_FREE_SPACE_EXTENT) { -				spin_lock(&ctl->tree_lock); -				ret 
= link_free_space(ctl, e); -				spin_unlock(&ctl->tree_lock); -				if (ret) { -					printk(KERN_ERR "Duplicate entries in " -					       "free space cache, dumping\n"); -					kunmap(page); -					unlock_page(page); -					page_cache_release(page); -					goto free_cache; -				} -			} else { -				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); -				if (!e->bitmap) { -					kunmap(page); -					kmem_cache_free( -						btrfs_free_space_cachep, e); -					unlock_page(page); -					page_cache_release(page); -					goto free_cache; -				} -				spin_lock(&ctl->tree_lock); -				ret = link_free_space(ctl, e); -				ctl->total_bitmaps++; -				ctl->op->recalc_thresholds(ctl); -				spin_unlock(&ctl->tree_lock); -				if (ret) { -					printk(KERN_ERR "Duplicate entries in " -					       "free space cache, dumping\n"); -					kunmap(page); -					unlock_page(page); -					page_cache_release(page); -					goto free_cache; -				} -				list_add_tail(&e->list, &bitmaps); -			} - -			num_entries--; -			offset += sizeof(struct btrfs_free_space_entry); -			if (offset + sizeof(struct btrfs_free_space_entry) >= -			    PAGE_CACHE_SIZE) -				break; -			entry++; +			list_add_tail(&e->list, &bitmaps);  		} -		/* -		 * We read an entry out of this page, we need to move on to the -		 * next page. -		 */ -		if (need_loop) { -			kunmap(page); -			goto next; -		} +		num_entries--; +	} -		/* -		 * We add the bitmaps at the end of the entries in order that -		 * the bitmap entries are added to the cache. -		 */ -		e = list_entry(bitmaps.next, struct btrfs_free_space, list); +	/* +	 * We add the bitmaps at the end of the entries in order that +	 * the bitmap entries are added to the cache. +	 */ +	list_for_each_entry_safe(e, n, &bitmaps, list) {  		list_del_init(&e->list); -		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); -		kunmap(page); -		num_bitmaps--; -next: -		unlock_page(page); -		page_cache_release(page); -		index++; +		ret = io_ctl_read_bitmap(&io_ctl, e); +		if (ret) +			goto free_cache;  	} +	io_ctl_drop_pages(&io_ctl);  	ret = 1;  out: +	io_ctl_free(&io_ctl);  	return ret;  free_cache: +	io_ctl_drop_pages(&io_ctl);  	__btrfs_remove_free_space_cache(ctl);  	goto out;  } @@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,  	struct btrfs_root *root = fs_info->tree_root;  	struct inode *inode;  	struct btrfs_path *path; -	int ret; +	int ret = 0;  	bool matched;  	u64 used = btrfs_block_group_used(&block_group->item); @@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,  		return 0;  	} +	/* We may have converted the inode and made the cache invalid. */ +	spin_lock(&block_group->lock); +	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { +		spin_unlock(&block_group->lock); +		goto out; +	} +	spin_unlock(&block_group->lock); +  	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,  				      path, block_group->key.objectid);  	btrfs_free_path(path); @@ -530,6 +801,19 @@ out:  	return ret;  } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount.  This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. 
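[Editor's note] Both the load path above and the write-out path documented here depend on the same first-page layout: one 32-bit checksum slot per page (when crcs are in use), then a 64-bit little-endian generation, then the packed free-space entries. A small user-space sketch of the generation check shows the offsets involved; it is a model of the layout only (stand-in constants, no real checksumming, little-endian host assumed), not the kernel's io_ctl code.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define NUM_PAGES	4	/* example cache file of four pages */

/*
 * Model of the first cache page as laid out by io_ctl_set_generation()
 * when crcs are enabled: a u32 crc slot per page, then the u64
 * generation, then the free-space entries fill the rest of the page.
 * The kernel converts the on-disk value with le64_to_cpu(); this sketch
 * assumes a little-endian host.
 */
static int check_generation(const uint8_t *first_page, uint64_t expected)
{
	size_t off = sizeof(uint32_t) * NUM_PAGES;	/* skip the crc area */
	uint64_t gen;

	memcpy(&gen, first_page + off, sizeof(gen));
	if (gen != expected) {
		fprintf(stderr, "space cache generation (%llu) does not "
			"match inode (%llu)\n",
			(unsigned long long)gen,
			(unsigned long long)expected);
		return -1;
	}
	return 0;
}

int main(void)
{
	uint8_t page[PAGE_SIZE] = { 0 };
	uint64_t gen = 1234;

	memcpy(page + sizeof(uint32_t) * NUM_PAGES, &gen, sizeof(gen));
	return check_generation(page, 1234) ? 1 : 0;
}
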
+ */  int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  			    struct btrfs_free_space_ctl *ctl,  			    struct btrfs_block_group_cache *block_group, @@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	struct extent_buffer *leaf;  	struct rb_node *node;  	struct list_head *pos, *n; -	struct page **pages; -	struct page *page;  	struct extent_state *cached_state = NULL;  	struct btrfs_free_cluster *cluster = NULL;  	struct extent_io_tree *unpin = NULL; +	struct io_ctl io_ctl;  	struct list_head bitmap_list;  	struct btrfs_key key;  	u64 start, end, len; -	u64 bytes = 0; -	u32 crc = ~(u32)0; -	int index = 0, num_pages = 0;  	int entries = 0;  	int bitmaps = 0; -	int ret = -1; -	bool next_page = false; -	bool out_of_space = false; +	int ret; +	int err = -1;  	INIT_LIST_HEAD(&bitmap_list); -	node = rb_first(&ctl->free_space_offset); -	if (!node) -		return 0; -  	if (!i_size_read(inode))  		return -1; -	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> -		PAGE_CACHE_SHIFT; - -	filemap_write_and_wait(inode->i_mapping); -	btrfs_wait_ordered_range(inode, inode->i_size & -				 ~(root->sectorsize - 1), (u64)-1); - -	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); -	if (!pages) -		return -1; +	io_ctl_init(&io_ctl, inode, root);  	/* Get the cluster for this block_group if it exists */  	if (block_group && !list_empty(&block_group->cluster_list)) @@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	 */  	unpin = root->fs_info->pinned_extents; -	/* -	 * Lock all pages first so we can lock the extent safely. -	 * -	 * NOTE: Because we hold the ref the entire time we're going to write to -	 * the page find_get_page should never fail, so we don't do a check -	 * after find_get_page at this point.  Just putting this here so people -	 * know and don't freak out. -	 */ -	while (index < num_pages) { -		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); -		if (!page) { -			int i; - -			for (i = 0; i < num_pages; i++) { -				unlock_page(pages[i]); -				page_cache_release(pages[i]); -			} -			goto out; -		} -		pages[index] = page; -		index++; -	} +	/* Lock all pages first so we can lock the extent safely. 
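[Editor's note] The write-out loop that follows streams every free-space extent and bitmap through io_ctl_add_entry()/io_ctl_add_bitmap(), which pack fixed-size records into the current page and crc-seal it before mapping the next one. A toy model of that packing arithmetic (hypothetical record definition, assumed to be packed like btrfs on-disk structures) makes the page budget visible; it is a sketch, not the kernel layout authority.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096

/* Stand-in for the record written by io_ctl_add_entry(): offset, length
 * and a type byte (extent or bitmap), assumed packed with no padding. */
struct cache_entry {
	uint64_t offset;
	uint64_t bytes;
	uint8_t  type;
} __attribute__((packed));

int main(void)
{
	/* The first page loses one u32 crc slot per page plus the u64
	 * generation; later pages are entirely available for entries. */
	int num_pages = 4;
	size_t first = PAGE_SIZE - sizeof(uint32_t) * num_pages - sizeof(uint64_t);
	size_t later = PAGE_SIZE;

	printf("entry record: %zu bytes\n", sizeof(struct cache_entry));
	printf("entries on first page: %zu\n", first / sizeof(struct cache_entry));
	printf("entries on later pages: %zu\n", later / sizeof(struct cache_entry));
	return 0;
}
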
*/ +	io_ctl_prepare_pages(&io_ctl, inode, 0); -	index = 0;  	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,  			 0, &cached_state, GFP_NOFS); @@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	if (block_group)  		start = block_group->key.objectid; -	/* Write out the extent entries */ -	do { -		struct btrfs_free_space_entry *entry; -		void *addr, *orig; -		unsigned long offset = 0; +	node = rb_first(&ctl->free_space_offset); +	if (!node && cluster) { +		node = rb_first(&cluster->root); +		cluster = NULL; +	} -		next_page = false; +	/* Make sure we can fit our crcs into the first page */ +	if (io_ctl.check_crcs && +	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { +		WARN_ON(1); +		goto out_nospc; +	} -		if (index >= num_pages) { -			out_of_space = true; -			break; -		} +	io_ctl_set_generation(&io_ctl, trans->transid); -		page = pages[index]; +	/* Write out the extent entries */ +	while (node) { +		struct btrfs_free_space *e; -		orig = addr = kmap(page); -		if (index == 0) { -			u64 *gen; +		e = rb_entry(node, struct btrfs_free_space, offset_index); +		entries++; -			/* -			 * We're going to put in a bogus crc for this page to -			 * make sure that old kernels who aren't aware of this -			 * format will be sure to discard the cache. -			 */ -			addr += sizeof(u64); -			offset += sizeof(u64); +		ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, +				       e->bitmap); +		if (ret) +			goto out_nospc; -			gen = addr; -			*gen = trans->transid; -			addr += sizeof(u64); -			offset += sizeof(u64); +		if (e->bitmap) { +			list_add_tail(&e->list, &bitmap_list); +			bitmaps++;  		} -		entry = addr; - -		memset(addr, 0, PAGE_CACHE_SIZE - offset); -		while (node && !next_page) { -			struct btrfs_free_space *e; - -			e = rb_entry(node, struct btrfs_free_space, offset_index); -			entries++; - -			entry->offset = cpu_to_le64(e->offset); -			entry->bytes = cpu_to_le64(e->bytes); -			if (e->bitmap) { -				entry->type = BTRFS_FREE_SPACE_BITMAP; -				list_add_tail(&e->list, &bitmap_list); -				bitmaps++; -			} else { -				entry->type = BTRFS_FREE_SPACE_EXTENT; -			} -			node = rb_next(node); -			if (!node && cluster) { -				node = rb_first(&cluster->root); -				cluster = NULL; -			} -			offset += sizeof(struct btrfs_free_space_entry); -			if (offset + sizeof(struct btrfs_free_space_entry) >= -			    PAGE_CACHE_SIZE) -				next_page = true; -			entry++; +		node = rb_next(node); +		if (!node && cluster) { +			node = rb_first(&cluster->root); +			cluster = NULL;  		} +	} -		/* -		 * We want to add any pinned extents to our free space cache -		 * so we don't leak the space -		 */ -		while (block_group && !next_page && -		       (start < block_group->key.objectid + -			block_group->key.offset)) { -			ret = find_first_extent_bit(unpin, start, &start, &end, -						    EXTENT_DIRTY); -			if (ret) { -				ret = 0; -				break; -			} - -			/* This pinned extent is out of our range */ -			if (start >= block_group->key.objectid + -			    block_group->key.offset) -				break; - -			len = block_group->key.objectid + -				block_group->key.offset - start; -			len = min(len, end + 1 - start); - -			entries++; -			entry->offset = cpu_to_le64(start); -			entry->bytes = cpu_to_le64(len); -			entry->type = BTRFS_FREE_SPACE_EXTENT; - -			start = end + 1; -			offset += sizeof(struct btrfs_free_space_entry); -			if (offset + sizeof(struct btrfs_free_space_entry) >= -			    PAGE_CACHE_SIZE) -				next_page = true; -			entry++; +	/* +	 * We want to add any 
pinned extents to our free space cache +	 * so we don't leak the space +	 */ +	while (block_group && (start < block_group->key.objectid + +			       block_group->key.offset)) { +		ret = find_first_extent_bit(unpin, start, &start, &end, +					    EXTENT_DIRTY); +		if (ret) { +			ret = 0; +			break;  		} -		/* Generate bogus crc value */ -		if (index == 0) { -			u32 *tmp; -			crc = btrfs_csum_data(root, orig + sizeof(u64), crc, -					      PAGE_CACHE_SIZE - sizeof(u64)); -			btrfs_csum_final(crc, (char *)&crc); -			crc++; -			tmp = orig; -			*tmp = crc; -		} +		/* This pinned extent is out of our range */ +		if (start >= block_group->key.objectid + +		    block_group->key.offset) +			break; -		kunmap(page); +		len = block_group->key.objectid + +			block_group->key.offset - start; +		len = min(len, end + 1 - start); -		bytes += PAGE_CACHE_SIZE; +		entries++; +		ret = io_ctl_add_entry(&io_ctl, start, len, NULL); +		if (ret) +			goto out_nospc; -		index++; -	} while (node || next_page); +		start = end + 1; +	}  	/* Write out the bitmaps */  	list_for_each_safe(pos, n, &bitmap_list) { -		void *addr;  		struct btrfs_free_space *entry =  			list_entry(pos, struct btrfs_free_space, list); -		if (index >= num_pages) { -			out_of_space = true; -			break; -		} -		page = pages[index]; - -		addr = kmap(page); -		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); -		kunmap(page); -		bytes += PAGE_CACHE_SIZE; - +		ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); +		if (ret) +			goto out_nospc;  		list_del_init(&entry->list); -		index++; -	} - -	if (out_of_space) { -		btrfs_drop_pages(pages, num_pages); -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, -				     i_size_read(inode) - 1, &cached_state, -				     GFP_NOFS); -		ret = 0; -		goto out;  	}  	/* Zero out the rest of the pages just to make sure */ -	while (index < num_pages) { -		void *addr; +	io_ctl_zero_remaining_pages(&io_ctl); -		page = pages[index]; -		addr = kmap(page); -		memset(addr, 0, PAGE_CACHE_SIZE); -		kunmap(page); -		bytes += PAGE_CACHE_SIZE; -		index++; -	} - -	ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, -					    bytes, &cached_state); -	btrfs_drop_pages(pages, num_pages); +	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, +				0, i_size_read(inode), &cached_state); +	io_ctl_drop_pages(&io_ctl);  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,  			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); -	if (ret) { -		ret = 0; +	if (ret)  		goto out; -	} -	BTRFS_I(inode)->generation = trans->transid; -	filemap_write_and_wait(inode->i_mapping); +	ret = filemap_write_and_wait(inode->i_mapping); +	if (ret) +		goto out;  	key.objectid = BTRFS_FREE_SPACE_OBJECTID;  	key.offset = offset;  	key.type = 0; -	ret = btrfs_search_slot(trans, root, &key, path, 1, 1); +	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);  	if (ret < 0) { -		ret = -1; -		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -				 EXTENT_DIRTY | EXTENT_DELALLOC | -				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); +		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, +				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, +				 GFP_NOFS);  		goto out;  	}  	leaf = path->nodes[0]; @@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);  		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||  		    found_key.offset != offset) { -			ret = -1; -			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -					 
EXTENT_DIRTY | EXTENT_DELALLOC | -					 EXTENT_DO_ACCOUNTING, 0, 0, NULL, -					 GFP_NOFS); +			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, +					 inode->i_size - 1, +					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, +					 NULL, GFP_NOFS);  			btrfs_release_path(path);  			goto out;  		}  	} + +	BTRFS_I(inode)->generation = trans->transid;  	header = btrfs_item_ptr(leaf, path->slots[0],  				struct btrfs_free_space_header);  	btrfs_set_free_space_entries(leaf, header, entries); @@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_release_path(path); -	ret = 1; - +	err = 0;  out: -	kfree(pages); -	if (ret != 1) { -		invalidate_inode_pages2_range(inode->i_mapping, 0, index); +	io_ctl_free(&io_ctl); +	if (err) { +		invalidate_inode_pages2(inode->i_mapping);  		BTRFS_I(inode)->generation = 0;  	}  	btrfs_update_inode(trans, root, inode); -	return ret; +	return err; + +out_nospc: +	list_for_each_safe(pos, n, &bitmap_list) { +		struct btrfs_free_space *entry = +			list_entry(pos, struct btrfs_free_space, list); +		list_del_init(&entry->list); +	} +	io_ctl_drop_pages(&io_ctl); +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, +			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); +	goto out;  }  int btrfs_write_out_cache(struct btrfs_root *root, @@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,  	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,  				      path, block_group->key.objectid); -	if (ret < 0) { +	if (ret) {  		spin_lock(&block_group->lock);  		block_group->disk_cache_state = BTRFS_DC_ERROR;  		spin_unlock(&block_group->lock);  		ret = 0; - +#ifdef DEBUG  		printk(KERN_ERR "btrfs: failed to write free space cace "  		       "for block group %llu\n", block_group->key.objectid); +#endif  	}  	iput(inode); @@ -1701,6 +1880,7 @@ again:  			ctl->total_bitmaps--;  		}  		kmem_cache_free(btrfs_free_space_cachep, info); +		ret = 0;  		goto out_lock;  	} @@ -1708,7 +1888,8 @@ again:  		unlink_free_space(ctl, info);  		info->offset += bytes;  		info->bytes -= bytes; -		link_free_space(ctl, info); +		ret = link_free_space(ctl, info); +		WARN_ON(ret);  		goto out_lock;  	} @@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,  		spin_unlock(&ctl->tree_lock);  		if (bytes >= minlen) { -			int update_ret; -			update_ret = btrfs_update_reserved_bytes(block_group, -								 bytes, 1, 1); +			struct btrfs_space_info *space_info; +			int update = 0; + +			space_info = block_group->space_info; +			spin_lock(&space_info->lock); +			spin_lock(&block_group->lock); +			if (!block_group->ro) { +				block_group->reserved += bytes; +				space_info->bytes_reserved += bytes; +				update = 1; +			} +			spin_unlock(&block_group->lock); +			spin_unlock(&space_info->lock);  			ret = btrfs_error_discard_extent(fs_info->extent_root,  							 start, @@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,  							 &actually_trimmed);  			btrfs_add_free_space(block_group, start, bytes); -			if (!update_ret) -				btrfs_update_reserved_bytes(block_group, -							    bytes, 0, 1); +			if (update) { +				spin_lock(&space_info->lock); +				spin_lock(&block_group->lock); +				if (block_group->ro) +					space_info->bytes_readonly += bytes; +				block_group->reserved -= bytes; +				space_info->bytes_reserved -= bytes; +				spin_unlock(&space_info->lock); +				spin_unlock(&block_group->lock); +			}  			if (ret)  				break; 
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,  		return 0;  	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); -	if (ret < 0) +	if (ret) { +		btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG  		printk(KERN_ERR "btrfs: failed to write free ino cache "  		       "for root %llu\n", root->root_key.objectid); +#endif +	}  	iput(inode);  	return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa87..53dcbdf446c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -465,14 +465,16 @@ again:  	/* Just to make sure we have enough space */  	prealloc += 8 * PAGE_CACHE_SIZE; -	ret = btrfs_check_data_free_space(inode, prealloc); +	ret = btrfs_delalloc_reserve_space(inode, prealloc);  	if (ret)  		goto out_put;  	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,  					      prealloc, prealloc, &alloc_hint); -	if (ret) +	if (ret) { +		btrfs_delalloc_release_space(inode, prealloc);  		goto out_put; +	}  	btrfs_free_reserved_data_space(inode, prealloc);  out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75686a61bd4..966ddcc4c63 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,10 +45,10 @@  #include "btrfs_inode.h"  #include "ioctl.h"  #include "print-tree.h" -#include "volumes.h"  #include "ordered-data.h"  #include "xattr.h"  #include "tree-log.h" +#include "volumes.h"  #include "compression.h"  #include "locking.h"  #include "free-space-cache.h" @@ -393,7 +393,10 @@ again:  	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {  		WARN_ON(pages);  		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); -		BUG_ON(!pages); +		if (!pages) { +			/* just bail out to the uncompressed code */ +			goto cont; +		}  		if (BTRFS_I(inode)->force_compress)  			compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +427,7 @@ again:  			will_compress = 1;  		}  	} +cont:  	if (start == 0) {  		trans = btrfs_join_transaction(root);  		BUG_ON(IS_ERR(trans)); @@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,  	}  	BUG_ON(disk_num_bytes > -	       btrfs_super_total_bytes(&root->fs_info->super_copy)); +	       btrfs_super_total_bytes(root->fs_info->super_copy));  	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);  	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  	}  	ret = 0;  out: -	if (nolock) { -		if (trans) -			btrfs_end_transaction_nolock(trans, root); -	} else { +	if (root != root->fs_info->tree_root)  		btrfs_delalloc_release_metadata(inode, ordered_extent->len); -		if (trans) +	if (trans) { +		if (nolock) +			btrfs_end_transaction_nolock(trans, root); +		else  			btrfs_end_transaction(trans, root);  	} @@ -1819,153 +1823,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,  }  /* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data.  This - * io_failure_record is used to record state as we go through all the - * mirrors.  If another mirror has good data, the page is set up to date - * and things continue.  If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { -	struct page *page; -	u64 start; -	u64 len; -	u64 logical; -	unsigned long bio_flags; -	int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, -			 struct page *page, u64 start, u64 end, -			 struct extent_state *state) -{ -	struct io_failure_record *failrec = NULL; -	u64 private; -	struct extent_map *em; -	struct inode *inode = page->mapping->host; -	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	struct bio *bio; -	int num_copies; -	int ret; -	int rw; -	u64 logical; - -	ret = get_state_private(failure_tree, start, &private); -	if (ret) { -		failrec = kmalloc(sizeof(*failrec), GFP_NOFS); -		if (!failrec) -			return -ENOMEM; -		failrec->start = start; -		failrec->len = end - start + 1; -		failrec->last_mirror = 0; -		failrec->bio_flags = 0; - -		read_lock(&em_tree->lock); -		em = lookup_extent_mapping(em_tree, start, failrec->len); -		if (em->start > start || em->start + em->len < start) { -			free_extent_map(em); -			em = NULL; -		} -		read_unlock(&em_tree->lock); - -		if (IS_ERR_OR_NULL(em)) { -			kfree(failrec); -			return -EIO; -		} -		logical = start - em->start; -		logical = em->block_start + logical; -		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { -			logical = em->block_start; -			failrec->bio_flags = EXTENT_BIO_COMPRESSED; -			extent_set_compress_type(&failrec->bio_flags, -						 em->compress_type); -		} -		failrec->logical = logical; -		free_extent_map(em); -		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | -				EXTENT_DIRTY, GFP_NOFS); -		set_state_private(failure_tree, start, -				 (u64)(unsigned long)failrec); -	} else { -		failrec = (struct io_failure_record *)(unsigned long)private; -	} -	num_copies = btrfs_num_copies( -			      &BTRFS_I(inode)->root->fs_info->mapping_tree, -			      failrec->logical, failrec->len); -	failrec->last_mirror++; -	if (!state) { -		spin_lock(&BTRFS_I(inode)->io_tree.lock); -		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, -						    failrec->start, -						    EXTENT_LOCKED); -		if (state && state->start != failrec->start) -			state = NULL; -		spin_unlock(&BTRFS_I(inode)->io_tree.lock); -	} -	if (!state || failrec->last_mirror > num_copies) { -		set_state_private(failure_tree, failrec->start, 0); -		clear_extent_bits(failure_tree, failrec->start, -				  failrec->start + failrec->len - 1, -				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); -		kfree(failrec); -		return -EIO; -	} -	bio = bio_alloc(GFP_NOFS, 1); -	bio->bi_private = state; -	bio->bi_end_io = failed_bio->bi_end_io; -	bio->bi_sector = failrec->logical >> 9; -	bio->bi_bdev = failed_bio->bi_bdev; -	bio->bi_size = 0; - -	bio_add_page(bio, page, failrec->len, start - page_offset(page)); -	if (failed_bio->bi_rw & REQ_WRITE) -		rw = WRITE; -	else -		rw = READ; - -	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, -						      failrec->last_mirror, -						      failrec->bio_flags, 0); -	return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ -	u64 private; -	u64 private_failure; -	struct io_failure_record *failure; -	int ret; - -	private = 0; -	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, -			     (u64)-1, 1, EXTENT_DIRTY, 0)) { -		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, -					start, 
&private_failure); -		if (ret == 0) { -			failure = (struct io_failure_record *)(unsigned long) -				   private_failure; -			set_state_private(&BTRFS_I(inode)->io_failure_tree, -					  failure->start, 0); -			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, -					  failure->start, -					  failure->start + failure->len - 1, -					  EXTENT_DIRTY | EXTENT_LOCKED, -					  GFP_NOFS); -			kfree(failure); -		} -	} -	return 0; -} - -/*   * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish.  If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish.  If not, the code in + * extent_io.c will try to find good copies for us.   */  static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  			       struct extent_state *state) @@ -2011,10 +1871,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  	kunmap_atomic(kaddr, KM_USER0);  good: -	/* if the io failure tree for this inode is non-empty, -	 * check to see if we've recovered from a failed IO -	 */ -	btrfs_clean_io_failures(inode, start);  	return 0;  zeroit: @@ -2079,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  	up_read(&root->fs_info->cleanup_work_sem);  } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending, -				u64 *bytes_to_reserve) -{ -	struct btrfs_root *root; -	struct btrfs_block_rsv *block_rsv; -	u64 num_bytes; -	int index; - -	root = pending->root; -	if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) -		return; - -	block_rsv = root->orphan_block_rsv; - -	/* orphan block reservation for the snapshot */ -	num_bytes = block_rsv->size; - -	/* -	 * after the snapshot is created, COWing tree blocks may use more -	 * space than it frees. So we should make sure there is enough -	 * reserved space. 
-	 */ -	index = trans->transid & 0x1; -	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { -		num_bytes += block_rsv->size - -			     (block_rsv->reserved + block_rsv->freed[index]); -	} - -	*bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending) -{ -	struct btrfs_root *root = pending->root; -	struct btrfs_root *snap = pending->snap; -	struct btrfs_block_rsv *block_rsv; -	u64 num_bytes; -	int index; -	int ret; - -	if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) -		return; - -	/* refill source subvolume's orphan block reservation */ -	block_rsv = root->orphan_block_rsv; -	index = trans->transid & 0x1; -	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { -		num_bytes = block_rsv->size - -			    (block_rsv->reserved + block_rsv->freed[index]); -		ret = btrfs_block_rsv_migrate(&pending->block_rsv, -					      root->orphan_block_rsv, -					      num_bytes); -		BUG_ON(ret); -	} - -	/* setup orphan block reservation for the snapshot */ -	block_rsv = btrfs_alloc_block_rsv(snap); -	BUG_ON(!block_rsv); - -	btrfs_add_durable_block_rsv(root->fs_info, block_rsv); -	snap->orphan_block_rsv = block_rsv; - -	num_bytes = root->orphan_block_rsv->size; -	ret = btrfs_block_rsv_migrate(&pending->block_rsv, -				      block_rsv, num_bytes); -	BUG_ON(ret); - -#if 0 -	/* insert orphan item for the snapshot */ -	WARN_ON(!root->orphan_item_inserted); -	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, -				       snap->root_key.objectid); -	BUG_ON(ret); -	snap->orphan_item_inserted = 1; -#endif -} -  enum btrfs_orphan_cleanup_state {  	ORPHAN_CLEANUP_STARTED	= 1,  	ORPHAN_CLEANUP_DONE	= 2, @@ -2247,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  	}  	spin_unlock(&root->orphan_lock); -	if (block_rsv) -		btrfs_add_durable_block_rsv(root->fs_info, block_rsv); -  	/* grab metadata reservation from transaction handle */  	if (reserve) {  		ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -2316,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  	struct btrfs_key key, found_key;  	struct btrfs_trans_handle *trans;  	struct inode *inode; +	u64 last_objectid = 0;  	int ret = 0, nr_unlink = 0, nr_truncate = 0;  	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2367,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		 * crossing root thing.  we store the inode number in the  		 * offset of the orphan item.  		 */ + +		if (found_key.offset == last_objectid) { +			printk(KERN_ERR "btrfs: Error removing orphan entry, " +			       "stopping orphan cleanup\n"); +			ret = -EINVAL; +			goto out; +		} + +		last_objectid = found_key.offset; +  		found_key.objectid = found_key.offset;  		found_key.type = BTRFS_INODE_ITEM_KEY;  		found_key.offset = 0;  		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); -		if (IS_ERR(inode)) { -			ret = PTR_ERR(inode); +		ret = PTR_RET(inode); +		if (ret && ret != -ESTALE)  			goto out; -		}  		/* -		 * add this inode to the orphan list so btrfs_orphan_del does -		 * the proper thing when we hit it +		 * Inode is already gone but the orphan item is still there, +		 * kill the orphan item.  		 
*/ -		spin_lock(&root->orphan_lock); -		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); -		spin_unlock(&root->orphan_lock); - -		/* -		 * if this is a bad inode, means we actually succeeded in -		 * removing the inode, but not the orphan record, which means -		 * we need to manually delete the orphan since iput will just -		 * do a destroy_inode -		 */ -		if (is_bad_inode(inode)) { -			trans = btrfs_start_transaction(root, 0); +		if (ret == -ESTALE) { +			trans = btrfs_start_transaction(root, 1);  			if (IS_ERR(trans)) {  				ret = PTR_ERR(trans);  				goto out;  			} -			btrfs_orphan_del(trans, inode); +			ret = btrfs_del_orphan_item(trans, root, +						    found_key.objectid); +			BUG_ON(ret);  			btrfs_end_transaction(trans, root); -			iput(inode);  			continue;  		} +		/* +		 * add this inode to the orphan list so btrfs_orphan_del does +		 * the proper thing when we hit it +		 */ +		spin_lock(&root->orphan_lock); +		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +		spin_unlock(&root->orphan_lock); +  		/* if we have links, this was a truncate, lets do that */  		if (inode->i_nlink) {  			if (!S_ISREG(inode->i_mode)) { @@ -2835,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  	u64 ino = btrfs_ino(inode);  	u64 dir_ino = btrfs_ino(dir); -	trans = btrfs_start_transaction(root, 10); +	/* +	 * 1 for the possible orphan item +	 * 1 for the dir item +	 * 1 for the dir index +	 * 1 for the inode ref +	 * 1 for the inode ref in the tree log +	 * 2 for the dir entries in the log +	 * 1 for the inode +	 */ +	trans = btrfs_start_transaction(root, 8);  	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)  		return trans; @@ -2858,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  		return ERR_PTR(-ENOMEM);  	} -	trans = btrfs_start_transaction(root, 0); +	/* 1 for the orphan item */ +	trans = btrfs_start_transaction(root, 1);  	if (IS_ERR(trans)) {  		btrfs_free_path(path);  		root->fs_info->enospc_unlink = 0; @@ -2963,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  	err = 0;  out:  	btrfs_free_path(path); +	/* Migrate the orphan reservation over */ +	if (!err) +		err = btrfs_block_rsv_migrate(trans->block_rsv, +				&root->fs_info->global_block_rsv, +				trans->bytes_reserved); +  	if (err) {  		btrfs_end_transaction(trans, root);  		root->fs_info->enospc_unlink = 0; @@ -2977,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,  			       struct btrfs_root *root)  {  	if (trans->block_rsv == &root->fs_info->global_block_rsv) { +		btrfs_block_rsv_release(root, trans->block_rsv, +					trans->bytes_reserved); +		trans->block_rsv = &root->fs_info->trans_block_rsv;  		BUG_ON(!root->fs_info->enospc_unlink);  		root->fs_info->enospc_unlink = 0;  	} @@ -3368,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)  	pgoff_t index = from >> PAGE_CACHE_SHIFT;  	unsigned offset = from & (PAGE_CACHE_SIZE-1);  	struct page *page; +	gfp_t mask = btrfs_alloc_write_mask(mapping);  	int ret = 0;  	u64 page_start;  	u64 page_end; @@ -3380,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)  	ret = -ENOMEM;  again: -	page = find_or_create_page(mapping, index, GFP_NOFS); +	page = find_or_create_page(mapping, index, mask);  	if (!page) {  		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);  		goto out; @@ -3613,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)  {  	struct 
btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_block_rsv *rsv, *global_rsv; +	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);  	unsigned long nr;  	int ret; @@ -3640,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)  		goto no_delete;  	} +	rsv = btrfs_alloc_block_rsv(root); +	if (!rsv) { +		btrfs_orphan_del(NULL, inode); +		goto no_delete; +	} +	rsv->size = min_size; +	global_rsv = &root->fs_info->global_block_rsv; +  	btrfs_i_size_write(inode, 0); +	/* +	 * This is a bit simpler than btrfs_truncate since +	 * +	 * 1) We've already reserved our space for our orphan item in the +	 *    unlink. +	 * 2) We're going to delete the inode item, so we don't need to update +	 *    it at all. +	 * +	 * So we just need to reserve some slack space in case we add bytes when +	 * doing the truncate. +	 */  	while (1) { -		trans = btrfs_join_transaction(root); -		BUG_ON(IS_ERR(trans)); -		trans->block_rsv = root->orphan_block_rsv; +		ret = btrfs_block_rsv_refill(root, rsv, min_size); + +		/* +		 * Try and steal from the global reserve since we will +		 * likely not use this space anyway, we want to try as +		 * hard as possible to get this to work. +		 */ +		if (ret) +			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); -		ret = btrfs_block_rsv_check(trans, root, -					    root->orphan_block_rsv, 0, 5);  		if (ret) { -			BUG_ON(ret != -EAGAIN); -			ret = btrfs_commit_transaction(trans, root); -			BUG_ON(ret); -			continue; +			printk(KERN_WARNING "Could not get space for a " +			       "delete, will truncate on mount %d\n", ret); +			btrfs_orphan_del(NULL, inode); +			btrfs_free_block_rsv(root, rsv); +			goto no_delete; +		} + +		trans = btrfs_start_transaction(root, 0); +		if (IS_ERR(trans)) { +			btrfs_orphan_del(NULL, inode); +			btrfs_free_block_rsv(root, rsv); +			goto no_delete;  		} +		trans->block_rsv = rsv; +  		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);  		if (ret != -EAGAIN)  			break; @@ -3664,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)  		btrfs_end_transaction(trans, root);  		trans = NULL;  		btrfs_btree_balance_dirty(root, nr); -  	} +	btrfs_free_block_rsv(root, rsv); +  	if (ret == 0) { +		trans->block_rsv = root->orphan_block_rsv;  		ret = btrfs_orphan_del(trans, inode);  		BUG_ON(ret);  	} +	trans->block_rsv = &root->fs_info->trans_block_rsv;  	if (!(root == root->fs_info->tree_root ||  	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))  		btrfs_return_ino(root, btrfs_ino(inode)); @@ -5795,8 +5632,7 @@ again:  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {  		ret = btrfs_ordered_update_i_size(inode, 0, ordered);  		if (!ret) -			ret = btrfs_update_inode(trans, root, inode); -		err = ret; +			err = btrfs_update_inode(trans, root, inode);  		goto out;  	} @@ -6289,7 +6125,7 @@ int btrfs_readpage(struct file *file, struct page *page)  {  	struct extent_io_tree *tree;  	tree = &BTRFS_I(page->mapping->host)->io_tree; -	return extent_read_full_page(tree, page, btrfs_get_extent); +	return extent_read_full_page(tree, page, btrfs_get_extent, 0);  }  static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -6541,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)  	struct btrfs_trans_handle *trans;  	unsigned long nr;  	u64 mask = root->sectorsize - 1; +	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);  	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);  	if (ret) @@ -6588,19 +6425,23 @@ static int btrfs_truncate(struct inode 
*inode)  	rsv = btrfs_alloc_block_rsv(root);  	if (!rsv)  		return -ENOMEM; -	btrfs_add_durable_block_rsv(root->fs_info, rsv); +	rsv->size = min_size; +	/* +	 * 1 for the truncate slack space +	 * 1 for the orphan item we're going to add +	 * 1 for the orphan item deletion +	 * 1 for updating the inode. +	 */  	trans = btrfs_start_transaction(root, 4);  	if (IS_ERR(trans)) {  		err = PTR_ERR(trans);  		goto out;  	} -	/* -	 * Reserve space for the truncate process.  Truncate should be adding -	 * space, but if there are snapshots it may end up using space. -	 */ -	ret = btrfs_truncate_reserve_metadata(trans, root, rsv); +	/* Migrate the slack space for the truncate to our reserve */ +	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, +				      min_size);  	BUG_ON(ret);  	ret = btrfs_orphan_add(trans, inode); @@ -6609,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)  		goto out;  	} -	nr = trans->blocks_used; -	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); - -	/* -	 * Ok so we've already migrated our bytes over for the truncate, so here -	 * just reserve the one slot we need for updating the inode. -	 */ -	trans = btrfs_start_transaction(root, 1); -	if (IS_ERR(trans)) { -		err = PTR_ERR(trans); -		goto out; -	} -	trans->block_rsv = rsv; -  	/*  	 * setattr is responsible for setting the ordered_data_close flag,  	 * but that is only tested during the last file release.  That @@ -6645,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)  		btrfs_add_ordered_operation(trans, root, inode);  	while (1) { +		ret = btrfs_block_rsv_refill(root, rsv, min_size); +		if (ret) { +			/* +			 * This can only happen with the original transaction we +			 * started above, every other time we shouldn't have a +			 * transaction started yet. 
+			 */ +			if (ret == -EAGAIN) +				goto end_trans; +			err = ret; +			break; +		} +  		if (!trans) { -			trans = btrfs_start_transaction(root, 3); +			/* Just need the 1 for updating the inode */ +			trans = btrfs_start_transaction(root, 1);  			if (IS_ERR(trans)) {  				err = PTR_ERR(trans);  				goto out;  			} - -			ret = btrfs_truncate_reserve_metadata(trans, root, -							      rsv); -			BUG_ON(ret); - -			trans->block_rsv = rsv;  		} +		trans->block_rsv = rsv; +  		ret = btrfs_truncate_inode_items(trans, root, inode,  						 inode->i_size,  						 BTRFS_EXTENT_DATA_KEY); @@ -6673,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)  			err = ret;  			break;  		} - +end_trans:  		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root);  		trans = NULL; @@ -6755,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->last_sub_trans = 0;  	ei->logged_trans = 0;  	ei->delalloc_bytes = 0; -	ei->reserved_bytes = 0;  	ei->disk_i_size = 0;  	ei->flags = 0; +	ei->csum_bytes = 0;  	ei->index_cnt = (u64)-1;  	ei->last_unlink_trans = 0; @@ -6803,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)  	WARN_ON(inode->i_data.nrpages);  	WARN_ON(BTRFS_I(inode)->outstanding_extents);  	WARN_ON(BTRFS_I(inode)->reserved_extents); +	WARN_ON(BTRFS_I(inode)->delalloc_bytes); +	WARN_ON(BTRFS_I(inode)->csum_bytes);  	/*  	 * This can happen where we create an inode, but somebody else also @@ -7420,7 +7258,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {  	.readpage_end_io_hook = btrfs_readpage_end_io_hook,  	.writepage_end_io_hook = btrfs_writepage_end_io_hook,  	.writepage_start_hook = btrfs_writepage_start_hook, -	.readpage_io_failed_hook = btrfs_io_failed_hook,  	.set_bit_hook = btrfs_set_bit_hook,  	.clear_bit_hook = btrfs_clear_bit_hook,  	.merge_extent_hook = btrfs_merge_extent_hook, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dae5dfe41ba..4a34c472f12 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@  #include "volumes.h"  #include "locking.h"  #include "inode-map.h" +#include "backref.h"  /* Mask out flags that are inappropriate for the given type of inode. */  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)  /*   * Inherit flags from the parent inode.   * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited.   
*/  void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)  { @@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)  	flags = BTRFS_I(dir)->flags; -	if (S_ISREG(inode->i_mode)) -		flags &= ~BTRFS_INODE_DIRSYNC; -	else if (!S_ISDIR(inode->i_mode)) -		flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); +	if (flags & BTRFS_INODE_NOCOMPRESS) { +		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; +		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; +	} else if (flags & BTRFS_INODE_COMPRESS) { +		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; +		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; +	} + +	if (flags & BTRFS_INODE_NODATACOW) +		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; -	BTRFS_I(inode)->flags = flags;  	btrfs_update_iflags(inode);  } @@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)  	struct fstrim_range range;  	u64 minlen = ULLONG_MAX;  	u64 num_devices = 0; +	u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);  	int ret;  	if (!capable(CAP_SYS_ADMIN)) @@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)  		}  	}  	rcu_read_unlock(); +  	if (!num_devices)  		return -EOPNOTSUPP; -  	if (copy_from_user(&range, arg, sizeof(range)))  		return -EFAULT; +	if (range.start > total_bytes) +		return -EINVAL; +	range.len = min(range.len, total_bytes - range.start);  	range.minlen = max(range.minlen, minlen);  	ret = btrfs_trim_fs(root, &range);  	if (ret < 0) @@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,  	int ret = 1;  	/* -	 * make sure that once we start defragging and extent, we keep on +	 * make sure that once we start defragging an extent, we keep on  	 * defragging it  	 */  	if (start < *defrag_end) @@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,  	 * extent will force at least part of that big extent to be defragged.  	 
*/  	if (ret) { -		*last_len += len;  		*defrag_end = extent_map_end(em);  	} else {  		*last_len = 0; @@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,  	int i_done;  	struct btrfs_ordered_extent *ordered;  	struct extent_state *cached_state = NULL; +	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	if (isize == 0)  		return 0; @@ -860,7 +870,7 @@ again:  	for (i = 0; i < num_pages; i++) {  		struct page *page;  		page = find_or_create_page(inode->i_mapping, -					    start_index + i, GFP_NOFS); +					    start_index + i, mask);  		if (!page)  			break; @@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  	struct btrfs_super_block *disk_super;  	struct file_ra_state *ra = NULL;  	unsigned long last_index; +	u64 isize = i_size_read(inode);  	u64 features;  	u64 last_len = 0;  	u64 skip = 0;  	u64 defrag_end = 0;  	u64 newer_off = range->start; -	int newer_left = 0;  	unsigned long i; +	unsigned long ra_index = 0;  	int ret;  	int defrag_count = 0;  	int compress_type = BTRFS_COMPRESS_ZLIB;  	int extent_thresh = range->extent_thresh; -	int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; +	int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; +	int cluster = max_cluster;  	u64 new_align = ~((u64)128 * 1024 - 1);  	struct page **pages = NULL; @@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  			compress_type = range->compress_type;  	} -	if (inode->i_size == 0) +	if (isize == 0)  		return 0;  	/* @@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		ra = &file->f_ra;  	} -	pages = kmalloc(sizeof(struct page *) * newer_cluster, +	pages = kmalloc(sizeof(struct page *) * max_cluster,  			GFP_NOFS);  	if (!pages) {  		ret = -ENOMEM; @@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  	/* find the last page to defrag */  	if (range->start + range->len > range->start) { -		last_index = min_t(u64, inode->i_size - 1, +		last_index = min_t(u64, isize - 1,  			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;  	} else { -		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; +		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;  	}  	if (newer_than) { @@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  			 * the extents in the file evenly spaced  			 */  			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; -			newer_left = newer_cluster;  		} else  			goto out_ra;  	} else {  		i = range->start >> PAGE_CACHE_SHIFT;  	}  	if (!max_to_defrag) -		max_to_defrag = last_index - 1; +		max_to_defrag = last_index;  	/*  	 * make writeback starts from i, so the defrag range can be @@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  			i = max(i + 1, next);  			continue;  		} + +		if (!newer_than) { +			cluster = (PAGE_CACHE_ALIGN(defrag_end) >> +				   PAGE_CACHE_SHIFT) - i; +			cluster = min(cluster, max_cluster); +		} else { +			cluster = max_cluster; +		} +  		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)  			BTRFS_I(inode)->force_compress = compress_type; -		btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); +		if (i + cluster > ra_index) { +			ra_index = max(i, ra_index); +			btrfs_force_ra(inode->i_mapping, ra, file, ra_index, +				       cluster); +			ra_index += max_cluster; +		} -		ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); +		ret = cluster_pages_for_defrag(inode, pages, i, cluster);  		if (ret < 0)  			goto out_ra;  		defrag_count += 
ret;  		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); -		i += ret;  		if (newer_than) {  			if (newer_off == (u64)-1) @@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  			if (!ret) {  				range->start = newer_off;  				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; -				newer_left = newer_cluster;  			} else {  				break;  			}  		} else { -			i++; +			if (ret > 0) { +				i += ret; +				last_len += ret << PAGE_CACHE_SHIFT; +			} else { +				i++; +				last_len = 0; +			}  		}  	} @@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		mutex_unlock(&inode->i_mutex);  	} -	disk_super = &root->fs_info->super_copy; +	disk_super = root->fs_info->super_copy;  	features = btrfs_super_incompat_flags(disk_super);  	if (range->compress_type == BTRFS_COMPRESS_LZO) {  		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;  		btrfs_set_super_incompat_flags(disk_super, features);  	} -	if (!file) -		kfree(ra); -	return defrag_count; +	ret = defrag_count;  out_ra:  	if (!file) @@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  		return PTR_ERR(trans);  	} -	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); +	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);  	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,  				   dir_id, "default", 7, 1);  	if (IS_ERR_OR_NULL(di)) { @@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); -	disk_super = &root->fs_info->super_copy; +	disk_super = root->fs_info->super_copy;  	features = btrfs_super_incompat_flags(disk_super);  	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {  		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; @@ -2864,6 +2891,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,  	return ret;  } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ +	int ret = 0; +	int i; +	u64 rel_ptr; +	int size; +	struct btrfs_ioctl_ino_path_args *ipa = NULL; +	struct inode_fs_paths *ipath = NULL; +	struct btrfs_path *path; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	ipa = memdup_user(arg, sizeof(*ipa)); +	if (IS_ERR(ipa)) { +		ret = PTR_ERR(ipa); +		ipa = NULL; +		goto out; +	} + +	size = min_t(u32, ipa->size, 4096); +	ipath = init_ipath(size, root, path); +	if (IS_ERR(ipath)) { +		ret = PTR_ERR(ipath); +		ipath = NULL; +		goto out; +	} + +	ret = paths_from_inode(ipa->inum, ipath); +	if (ret < 0) +		goto out; + +	for (i = 0; i < ipath->fspath->elem_cnt; ++i) { +		rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; +		ipath->fspath->val[i] = rel_ptr; +	} + +	ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); +	if (ret) { +		ret = -EFAULT; +		goto out; +	} + +out: +	btrfs_free_path(path); +	free_ipath(ipath); +	kfree(ipa); + +	return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ +	struct btrfs_data_container *inodes = ctx; +	const size_t c = 3 * sizeof(u64); + +	if (inodes->bytes_left >= c) { +		inodes->bytes_left -= c; +		inodes->val[inodes->elem_cnt] = inum; +		inodes->val[inodes->elem_cnt + 1] = offset; +		inodes->val[inodes->elem_cnt + 2] = root; +		inodes->elem_cnt += 3; +	} else { +		inodes->bytes_missing += c - inodes->bytes_left; +		inodes->bytes_left = 0; +		inodes->elem_missed += 
3; +	} + +	return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, +					void __user *arg) +{ +	int ret = 0; +	int size; +	u64 extent_offset; +	struct btrfs_ioctl_logical_ino_args *loi; +	struct btrfs_data_container *inodes = NULL; +	struct btrfs_path *path = NULL; +	struct btrfs_key key; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	loi = memdup_user(arg, sizeof(*loi)); +	if (IS_ERR(loi)) { +		ret = PTR_ERR(loi); +		loi = NULL; +		goto out; +	} + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	size = min_t(u32, loi->size, 4096); +	inodes = init_data_container(size); +	if (IS_ERR(inodes)) { +		ret = PTR_ERR(inodes); +		inodes = NULL; +		goto out; +	} + +	ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + +	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) +		ret = -ENOENT; +	if (ret < 0) +		goto out; + +	extent_offset = loi->logical - key.objectid; +	ret = iterate_extent_inodes(root->fs_info, path, key.objectid, +					extent_offset, build_ino_list, inodes); + +	if (ret < 0) +		goto out; + +	ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); +	if (ret) +		ret = -EFAULT; + +out: +	btrfs_free_path(path); +	kfree(inodes); +	kfree(loi); + +	return ret; +} +  long btrfs_ioctl(struct file *file, unsigned int  		cmd, unsigned long arg)  { @@ -2921,6 +3086,10 @@ long btrfs_ioctl(struct file *file, unsigned int  		return btrfs_ioctl_tree_search(file, argp);  	case BTRFS_IOC_INO_LOOKUP:  		return btrfs_ioctl_ino_lookup(file, argp); +	case BTRFS_IOC_INO_PATHS: +		return btrfs_ioctl_ino_to_path(root, argp); +	case BTRFS_IOC_LOGICAL_INO: +		return btrfs_ioctl_logical_to_ino(root, argp);  	case BTRFS_IOC_SPACE_INFO:  		return btrfs_ioctl_space_info(root, argp);  	case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb..252ae9915de 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {  	struct btrfs_ioctl_space_info spaces[0];  }; +struct btrfs_data_container { +	__u32	bytes_left;	/* out -- bytes not needed to deliver output */ +	__u32	bytes_missing;	/* out -- additional bytes needed for result */ +	__u32	elem_cnt;	/* out */ +	__u32	elem_missed;	/* out */ +	__u64	val[0];		/* out */ +}; + +struct btrfs_ioctl_ino_path_args { +	__u64				inum;		/* in */ +	__u32				size;		/* in */ +	__u64				reserved[4]; +	/* struct btrfs_data_container	*fspath;	   out */ +	__u64				fspath;		/* out */ +}; + +struct btrfs_ioctl_logical_ino_args { +	__u64				logical;	/* in */ +	__u32				size;		/* in */ +	__u64				reserved[4]; +	/* struct btrfs_data_container	*inodes;	out   */ +	__u64				inodes; +}; +  #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \  				   struct btrfs_ioctl_vol_args)  #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {  				 struct btrfs_ioctl_dev_info_args)  #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \  			       struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ +					struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ +					struct btrfs_ioctl_ino_path_args) +  #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e..f38e452486b 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)  void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)  {  	
int i;
-	u32 type;
-	u32 nr = btrfs_header_nritems(l);
+	u32 type, nr;
 	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 
+	if (!l)
+		return;
+
+	nr = btrfs_header_nritems(l);
+
 	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 00000000000..2373b39a132
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its own read pointer and all disks will be utilized in parallel.
+ * Also, no two disks will read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead, both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { +	struct list_head	list; +	struct reada_control	*rc; +	u64			generation; +}; + +struct reada_extent { +	u64			logical; +	struct btrfs_key	top; +	u32			blocksize; +	int			err; +	struct list_head	extctl; +	struct kref		refcnt; +	spinlock_t		lock; +	struct reada_zone	*zones[MAX_MIRRORS]; +	int			nzones; +	struct btrfs_device	*scheduled_for; +}; + +struct reada_zone { +	u64			start; +	u64			end; +	u64			elems; +	struct list_head	list; +	spinlock_t		lock; +	int			locked; +	struct btrfs_device	*device; +	struct btrfs_device	*devs[MAX_MIRRORS]; /* full list, incl self */ +	int			ndevs; +	struct kref		refcnt; +}; + +struct reada_machine_work { +	struct btrfs_work	work; +	struct btrfs_fs_info	*fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, +			   struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, +			    u64 start, int err) +{ +	int level = 0; +	int nritems; +	int i; +	u64 bytenr; +	u64 generation; +	struct reada_extent *re; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct list_head list; +	unsigned long index = start >> PAGE_CACHE_SHIFT; +	struct btrfs_device *for_dev; + +	if (eb) +		level = btrfs_header_level(eb); + +	/* find extent */ +	spin_lock(&fs_info->reada_lock); +	re = radix_tree_lookup(&fs_info->reada_tree, index); +	if (re) +		kref_get(&re->refcnt); +	spin_unlock(&fs_info->reada_lock); + +	if (!re) +		return -1; + +	spin_lock(&re->lock); +	/* +	 * just take the full list from the extent. afterwards we +	 * don't need the lock anymore +	 */ +	list_replace_init(&re->extctl, &list); +	for_dev = re->scheduled_for; +	re->scheduled_for = NULL; +	spin_unlock(&re->lock); + +	if (err == 0) { +		nritems = level ? btrfs_header_nritems(eb) : 0; +		generation = btrfs_header_generation(eb); +		/* +		 * FIXME: currently we just set nritems to 0 if this is a leaf, +		 * effectively ignoring the content. In a next step we could +		 * trigger more readahead depending from the content, e.g. +		 * fetch the checksums for the extents in the leaf. +		 */ +	} else { +		/* +		 * this is the error case, the extent buffer has not been +		 * read correctly. We won't access anything from it and +		 * just cleanup our data structures. Effectively this will +		 * cut the branch below this node from read ahead. +		 */ +		nritems = 0; +		generation = 0; +	} + +	for (i = 0; i < nritems; i++) { +		struct reada_extctl *rec; +		u64 n_gen; +		struct btrfs_key key; +		struct btrfs_key next_key; + +		btrfs_node_key_to_cpu(eb, &key, i); +		if (i + 1 < nritems) +			btrfs_node_key_to_cpu(eb, &next_key, i + 1); +		else +			next_key = re->top; +		bytenr = btrfs_node_blockptr(eb, i); +		n_gen = btrfs_node_ptr_generation(eb, i); + +		list_for_each_entry(rec, &list, list) { +			struct reada_control *rc = rec->rc; + +			/* +			 * if the generation doesn't match, just ignore this +			 * extctl. This will probably cut off a branch from +			 * prefetch. Alternatively one could start a new (sub-) +			 * prefetch for this branch, starting again from root. 
+			 * FIXME: move the generation check out of this loop +			 */ +#ifdef DEBUG +			if (rec->generation != generation) { +				printk(KERN_DEBUG "generation mismatch for " +						"(%llu,%d,%llu) %llu != %llu\n", +				       key.objectid, key.type, key.offset, +				       rec->generation, generation); +			} +#endif +			if (rec->generation == generation && +			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && +			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) +				reada_add_block(rc, bytenr, &next_key, +						level - 1, n_gen); +		} +	} +	/* +	 * free extctl records +	 */ +	while (!list_empty(&list)) { +		struct reada_control *rc; +		struct reada_extctl *rec; + +		rec = list_first_entry(&list, struct reada_extctl, list); +		list_del(&rec->list); +		rc = rec->rc; +		kfree(rec); + +		kref_get(&rc->refcnt); +		if (atomic_dec_and_test(&rc->elems)) { +			kref_put(&rc->refcnt, reada_control_release); +			wake_up(&rc->wait); +		} +		kref_put(&rc->refcnt, reada_control_release); + +		reada_extent_put(fs_info, re);	/* one ref for each entry */ +	} +	reada_extent_put(fs_info, re);	/* our ref */ +	if (for_dev) +		atomic_dec(&for_dev->reada_in_flight); + +	return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, +			 u64 start, int err) +{ +	int ret; + +	ret = __readahead_hook(root, eb, start, err); + +	reada_start_machine(root->fs_info); + +	return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, +					  struct btrfs_device *dev, u64 logical, +					  struct btrfs_bio *bbio) +{ +	int ret; +	int looped = 0; +	struct reada_zone *zone; +	struct btrfs_block_group_cache *cache = NULL; +	u64 start; +	u64 end; +	int i; + +again: +	zone = NULL; +	spin_lock(&fs_info->reada_lock); +	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, +				     logical >> PAGE_CACHE_SHIFT, 1); +	if (ret == 1) +		kref_get(&zone->refcnt); +	spin_unlock(&fs_info->reada_lock); + +	if (ret == 1) { +		if (logical >= zone->start && logical < zone->end) +			return zone; +		spin_lock(&fs_info->reada_lock); +		kref_put(&zone->refcnt, reada_zone_release); +		spin_unlock(&fs_info->reada_lock); +	} + +	if (looped) +		return NULL; + +	cache = btrfs_lookup_block_group(fs_info, logical); +	if (!cache) +		return NULL; + +	start = cache->key.objectid; +	end = start + cache->key.offset - 1; +	btrfs_put_block_group(cache); + +	zone = kzalloc(sizeof(*zone), GFP_NOFS); +	if (!zone) +		return NULL; + +	zone->start = start; +	zone->end = end; +	INIT_LIST_HEAD(&zone->list); +	spin_lock_init(&zone->lock); +	zone->locked = 0; +	kref_init(&zone->refcnt); +	zone->elems = 0; +	zone->device = dev; /* our device always sits at index 0 */ +	for (i = 0; i < bbio->num_stripes; ++i) { +		/* bounds have already been checked */ +		zone->devs[i] = bbio->stripes[i].dev; +	} +	zone->ndevs = bbio->num_stripes; + +	spin_lock(&fs_info->reada_lock); +	ret = radix_tree_insert(&dev->reada_zones, +				(unsigned long)zone->end >> PAGE_CACHE_SHIFT, +				zone); +	spin_unlock(&fs_info->reada_lock); + +	if (ret) { +		kfree(zone); +		looped = 1; +		goto again; +	} + +	return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, +					      u64 logical, +					      struct btrfs_key *top, int level) +{ +	int ret; +	int looped = 0; +	struct reada_extent *re = NULL; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct btrfs_mapping_tree *map_tree = 
&fs_info->mapping_tree; +	struct btrfs_bio *bbio = NULL; +	struct btrfs_device *dev; +	u32 blocksize; +	u64 length; +	int nzones = 0; +	int i; +	unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: +	spin_lock(&fs_info->reada_lock); +	re = radix_tree_lookup(&fs_info->reada_tree, index); +	if (re) +		kref_get(&re->refcnt); +	spin_unlock(&fs_info->reada_lock); + +	if (re || looped) +		return re; + +	re = kzalloc(sizeof(*re), GFP_NOFS); +	if (!re) +		return NULL; + +	blocksize = btrfs_level_size(root, level); +	re->logical = logical; +	re->blocksize = blocksize; +	re->top = *top; +	INIT_LIST_HEAD(&re->extctl); +	spin_lock_init(&re->lock); +	kref_init(&re->refcnt); + +	/* +	 * map block +	 */ +	length = blocksize; +	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); +	if (ret || !bbio || length < blocksize) +		goto error; + +	if (bbio->num_stripes > MAX_MIRRORS) { +		printk(KERN_ERR "btrfs readahead: more than %d copies not " +				"supported", MAX_MIRRORS); +		goto error; +	} + +	for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { +		struct reada_zone *zone; + +		dev = bbio->stripes[nzones].dev; +		zone = reada_find_zone(fs_info, dev, logical, bbio); +		if (!zone) +			break; + +		re->zones[nzones] = zone; +		spin_lock(&zone->lock); +		if (!zone->elems) +			kref_get(&zone->refcnt); +		++zone->elems; +		spin_unlock(&zone->lock); +		spin_lock(&fs_info->reada_lock); +		kref_put(&zone->refcnt, reada_zone_release); +		spin_unlock(&fs_info->reada_lock); +	} +	re->nzones = nzones; +	if (nzones == 0) { +		/* not a single zone found, error and out */ +		goto error; +	} + +	/* insert extent in reada_tree + all per-device trees, all or nothing */ +	spin_lock(&fs_info->reada_lock); +	ret = radix_tree_insert(&fs_info->reada_tree, index, re); +	if (ret) { +		spin_unlock(&fs_info->reada_lock); +		if (ret != -ENOMEM) { +			/* someone inserted the extent in the meantime */ +			looped = 1; +		} +		goto error; +	} +	for (i = 0; i < nzones; ++i) { +		dev = bbio->stripes[i].dev; +		ret = radix_tree_insert(&dev->reada_extents, index, re); +		if (ret) { +			while (--i >= 0) { +				dev = bbio->stripes[i].dev; +				BUG_ON(dev == NULL); +				radix_tree_delete(&dev->reada_extents, index); +			} +			BUG_ON(fs_info == NULL); +			radix_tree_delete(&fs_info->reada_tree, index); +			spin_unlock(&fs_info->reada_lock); +			goto error; +		} +	} +	spin_unlock(&fs_info->reada_lock); + +	kfree(bbio); +	return re; + +error: +	while (nzones) { +		struct reada_zone *zone; + +		--nzones; +		zone = re->zones[nzones]; +		kref_get(&zone->refcnt); +		spin_lock(&zone->lock); +		--zone->elems; +		if (zone->elems == 0) { +			/* +			 * no fs_info->reada_lock needed, as this can't be +			 * the last ref +			 */ +			kref_put(&zone->refcnt, reada_zone_release); +		} +		spin_unlock(&zone->lock); + +		spin_lock(&fs_info->reada_lock); +		kref_put(&zone->refcnt, reada_zone_release); +		spin_unlock(&fs_info->reada_lock); +	} +	kfree(bbio); +	kfree(re); +	if (looped) +		goto again; +	return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, +			     struct reada_extent *re) +{ +	int i; +	unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + +	spin_lock(&fs_info->reada_lock); +	if (!kref_put(&re->refcnt, reada_kref_dummy)) { +		spin_unlock(&fs_info->reada_lock); +		return; +	} + +	radix_tree_delete(&fs_info->reada_tree, index); +	for (i = 0; i < re->nzones; ++i) { +		struct reada_zone *zone = re->zones[i]; + +		
radix_tree_delete(&zone->device->reada_extents, index); +	} + +	spin_unlock(&fs_info->reada_lock); + +	for (i = 0; i < re->nzones; ++i) { +		struct reada_zone *zone = re->zones[i]; + +		kref_get(&zone->refcnt); +		spin_lock(&zone->lock); +		--zone->elems; +		if (zone->elems == 0) { +			/* no fs_info->reada_lock needed, as this can't be +			 * the last ref */ +			kref_put(&zone->refcnt, reada_zone_release); +		} +		spin_unlock(&zone->lock); + +		spin_lock(&fs_info->reada_lock); +		kref_put(&zone->refcnt, reada_zone_release); +		spin_unlock(&fs_info->reada_lock); +	} +	if (re->scheduled_for) +		atomic_dec(&re->scheduled_for->reada_in_flight); + +	kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ +	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + +	radix_tree_delete(&zone->device->reada_zones, +			  zone->end >> PAGE_CACHE_SHIFT); + +	kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ +	struct reada_control *rc = container_of(kref, struct reada_control, +						refcnt); + +	kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, +			   struct btrfs_key *top, int level, u64 generation) +{ +	struct btrfs_root *root = rc->root; +	struct reada_extent *re; +	struct reada_extctl *rec; + +	re = reada_find_extent(root, logical, top, level); /* takes one ref */ +	if (!re) +		return -1; + +	rec = kzalloc(sizeof(*rec), GFP_NOFS); +	if (!rec) { +		reada_extent_put(root->fs_info, re); +		return -1; +	} + +	rec->rc = rc; +	rec->generation = generation; +	atomic_inc(&rc->elems); + +	spin_lock(&re->lock); +	list_add_tail(&rec->list, &re->extctl); +	spin_unlock(&re->lock); + +	/* leave the ref on the extent */ + +	return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ +	int i; +	unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + +	for (i = 0; i < zone->ndevs; ++i) { +		struct reada_zone *peer; +		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); +		if (peer && peer->device != zone->device) +			peer->locked = lock; +	} +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ +	struct reada_zone *top_zone = NULL; +	struct reada_zone *top_locked_zone = NULL; +	u64 top_elems = 0; +	u64 top_locked_elems = 0; +	unsigned long index = 0; +	int ret; + +	if (dev->reada_curr_zone) { +		reada_peer_zones_set_lock(dev->reada_curr_zone, 0); +		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); +		dev->reada_curr_zone = NULL; +	} +	/* pick the zone with the most elements */ +	while (1) { +		struct reada_zone *zone; + +		ret = radix_tree_gang_lookup(&dev->reada_zones, +					     (void **)&zone, index, 1); +		if (ret == 0) +			break; +		index = (zone->end >> PAGE_CACHE_SHIFT) + 1; +		if (zone->locked) { +			if (zone->elems > top_locked_elems) { +				top_locked_elems = zone->elems; +				top_locked_zone = zone; +			} +		} else { +			if (zone->elems > top_elems) { +				top_elems = zone->elems; +				top_zone = zone; +			} +		} +	} +	if (top_zone) +		dev->reada_curr_zone = top_zone; +	else if (top_locked_zone) +		dev->reada_curr_zone = top_locked_zone; +	else +		return 0; + +	dev->reada_next = dev->reada_curr_zone->start; +	kref_get(&dev->reada_curr_zone->refcnt); +	reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + +	return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, +				   struct btrfs_device *dev) +{ +	struct reada_extent *re = NULL; +	
int mirror_num = 0; +	struct extent_buffer *eb = NULL; +	u64 logical; +	u32 blocksize; +	int ret; +	int i; +	int need_kick = 0; + +	spin_lock(&fs_info->reada_lock); +	if (dev->reada_curr_zone == NULL) { +		ret = reada_pick_zone(dev); +		if (!ret) { +			spin_unlock(&fs_info->reada_lock); +			return 0; +		} +	} +	/* +	 * FIXME currently we issue the reads one extent at a time. If we have +	 * a contiguous block of extents, we could also coagulate them or use +	 * plugging to speed things up +	 */ +	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, +				     dev->reada_next >> PAGE_CACHE_SHIFT, 1); +	if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { +		ret = reada_pick_zone(dev); +		if (!ret) { +			spin_unlock(&fs_info->reada_lock); +			return 0; +		} +		re = NULL; +		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, +					dev->reada_next >> PAGE_CACHE_SHIFT, 1); +	} +	if (ret == 0) { +		spin_unlock(&fs_info->reada_lock); +		return 0; +	} +	dev->reada_next = re->logical + re->blocksize; +	kref_get(&re->refcnt); + +	spin_unlock(&fs_info->reada_lock); + +	/* +	 * find mirror num +	 */ +	for (i = 0; i < re->nzones; ++i) { +		if (re->zones[i]->device == dev) { +			mirror_num = i + 1; +			break; +		} +	} +	logical = re->logical; +	blocksize = re->blocksize; + +	spin_lock(&re->lock); +	if (re->scheduled_for == NULL) { +		re->scheduled_for = dev; +		need_kick = 1; +	} +	spin_unlock(&re->lock); + +	reada_extent_put(fs_info, re); + +	if (!need_kick) +		return 0; + +	atomic_inc(&dev->reada_in_flight); +	ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, +			 mirror_num, &eb); +	if (ret) +		__readahead_hook(fs_info->extent_root, NULL, logical, ret); +	else if (eb) +		__readahead_hook(fs_info->extent_root, eb, eb->start, ret); + +	if (eb) +		free_extent_buffer(eb); + +	return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ +	struct reada_machine_work *rmw; +	struct btrfs_fs_info *fs_info; + +	rmw = container_of(work, struct reada_machine_work, work); +	fs_info = rmw->fs_info; + +	kfree(rmw); + +	__reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_device *device; +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; +	u64 enqueued; +	u64 total = 0; +	int i; + +	do { +		enqueued = 0; +		list_for_each_entry(device, &fs_devices->devices, dev_list) { +			if (atomic_read(&device->reada_in_flight) < +			    MAX_IN_FLIGHT) +				enqueued += reada_start_machine_dev(fs_info, +								    device); +		} +		total += enqueued; +	} while (enqueued && total < 10000); + +	if (enqueued == 0) +		return; + +	/* +	 * If everything is already in the cache, this is effectively single +	 * threaded. To a) not hold the caller for too long and b) to utilize +	 * more cores, we broke the loop above after 10000 iterations and now +	 * enqueue to workers to finish it. This will distribute the load to +	 * the cores. 
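+	 * Queueing two workers instead of one lets the parallelism ramp up:
+	 * every worker that still manages to enqueue reads will in turn queue
+	 * two more from its own __reada_start_machine() call, until either the
+	 * per-device MAX_IN_FLIGHT limit is hit or no extents are left.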
+	 */ +	for (i = 0; i < 2; ++i) +		reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ +	struct reada_machine_work *rmw; + +	rmw = kzalloc(sizeof(*rmw), GFP_NOFS); +	if (!rmw) { +		/* FIXME we cannot handle this properly right now */ +		BUG(); +	} +	rmw->work.func = reada_start_machine_worker; +	rmw->fs_info = fs_info; + +	btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ +	struct btrfs_device *device; +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; +	unsigned long index; +	int ret; +	int i; +	int j; +	int cnt; + +	spin_lock(&fs_info->reada_lock); +	list_for_each_entry(device, &fs_devices->devices, dev_list) { +		printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, +			atomic_read(&device->reada_in_flight)); +		index = 0; +		while (1) { +			struct reada_zone *zone; +			ret = radix_tree_gang_lookup(&device->reada_zones, +						     (void **)&zone, index, 1); +			if (ret == 0) +				break; +			printk(KERN_DEBUG "  zone %llu-%llu elems %llu locked " +				"%d devs", zone->start, zone->end, zone->elems, +				zone->locked); +			for (j = 0; j < zone->ndevs; ++j) { +				printk(KERN_CONT " %lld", +					zone->devs[j]->devid); +			} +			if (device->reada_curr_zone == zone) +				printk(KERN_CONT " curr off %llu", +					device->reada_next - zone->start); +			printk(KERN_CONT "\n"); +			index = (zone->end >> PAGE_CACHE_SHIFT) + 1; +		} +		cnt = 0; +		index = 0; +		while (all) { +			struct reada_extent *re = NULL; + +			ret = radix_tree_gang_lookup(&device->reada_extents, +						     (void **)&re, index, 1); +			if (ret == 0) +				break; +			printk(KERN_DEBUG +				"  re: logical %llu size %u empty %d for %lld", +				re->logical, re->blocksize, +				list_empty(&re->extctl), re->scheduled_for ? +				re->scheduled_for->devid : -1); + +			for (i = 0; i < re->nzones; ++i) { +				printk(KERN_CONT " zone %llu-%llu devs", +					re->zones[i]->start, +					re->zones[i]->end); +				for (j = 0; j < re->zones[i]->ndevs; ++j) { +					printk(KERN_CONT " %lld", +						re->zones[i]->devs[j]->devid); +				} +			} +			printk(KERN_CONT "\n"); +			index = (re->logical >> PAGE_CACHE_SHIFT) + 1; +			if (++cnt > 15) +				break; +		} +	} + +	index = 0; +	cnt = 0; +	while (all) { +		struct reada_extent *re = NULL; + +		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, +					     index, 1); +		if (ret == 0) +			break; +		if (!re->scheduled_for) { +			index = (re->logical >> PAGE_CACHE_SHIFT) + 1; +			continue; +		} +		printk(KERN_DEBUG +			"re: logical %llu size %u list empty %d for %lld", +			re->logical, re->blocksize, list_empty(&re->extctl), +			re->scheduled_for ? 
re->scheduled_for->devid : -1); +		for (i = 0; i < re->nzones; ++i) { +			printk(KERN_CONT " zone %llu-%llu devs", +				re->zones[i]->start, +				re->zones[i]->end); +			for (i = 0; i < re->nzones; ++i) { +				printk(KERN_CONT " zone %llu-%llu devs", +					re->zones[i]->start, +					re->zones[i]->end); +				for (j = 0; j < re->zones[i]->ndevs; ++j) { +					printk(KERN_CONT " %lld", +						re->zones[i]->devs[j]->devid); +				} +			} +		} +		printk(KERN_CONT "\n"); +		index = (re->logical >> PAGE_CACHE_SHIFT) + 1; +	} +	spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, +			struct btrfs_key *key_start, struct btrfs_key *key_end) +{ +	struct reada_control *rc; +	u64 start; +	u64 generation; +	int level; +	struct extent_buffer *node; +	static struct btrfs_key max_key = { +		.objectid = (u64)-1, +		.type = (u8)-1, +		.offset = (u64)-1 +	}; + +	rc = kzalloc(sizeof(*rc), GFP_NOFS); +	if (!rc) +		return ERR_PTR(-ENOMEM); + +	rc->root = root; +	rc->key_start = *key_start; +	rc->key_end = *key_end; +	atomic_set(&rc->elems, 0); +	init_waitqueue_head(&rc->wait); +	kref_init(&rc->refcnt); +	kref_get(&rc->refcnt); /* one ref for having elements */ + +	node = btrfs_root_node(root); +	start = node->start; +	level = btrfs_header_level(node); +	generation = btrfs_header_generation(node); +	free_extent_buffer(node); + +	reada_add_block(rc, start, &max_key, level, generation); + +	reada_start_machine(root->fs_info); + +	return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ +	struct reada_control *rc = handle; + +	while (atomic_read(&rc->elems)) { +		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, +				   5 * HZ); +		dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); +	} + +	dump_devs(rc->root->fs_info, rc->elems < 10 ? 
1 : 0); + +	kref_put(&rc->refcnt, reada_control_release); + +	return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ +	struct reada_control *rc = handle; + +	while (atomic_read(&rc->elems)) { +		wait_event(rc->wait, atomic_read(&rc->elems) == 0); +	} + +	kref_put(&rc->refcnt, reada_control_release); + +	return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ +	struct reada_control *rc = handle; + +	kref_put(&rc->refcnt, reada_control_release); +} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273..24d654ce7a0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  		BUG_ON(IS_ERR(trans));  		trans->block_rsv = rc->block_rsv; -		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, -					    min_reserved, 0); +		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);  		if (ret) {  			BUG_ON(ret != -EAGAIN);  			ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)  again:  	if (!err) {  		num_bytes = rc->merging_rsv_size; -		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, -					  num_bytes); +		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);  		if (ret)  			err = ret;  	} @@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,  	num_bytes = calcu_metadata_size(rc, node, 1) * 2;  	trans->block_rsv = rc->block_rsv; -	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); +	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);  	if (ret) {  		if (ret == -EAGAIN)  			rc->commit_transaction = 1; @@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,  	unsigned long last_index;  	struct page *page;  	struct file_ra_state *ra; +	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	int nr = 0;  	int ret = 0; @@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,  						  ra, NULL, index,  						  last_index + 1 - index);  			page = find_or_create_page(inode->i_mapping, index, -						   GFP_NOFS); +						   mask);  			if (!page) {  				btrfs_delalloc_release_metadata(inode,  							PAGE_CACHE_SIZE); @@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,  	}  	key.objectid = ref_objectid; -	key.offset = ref_offset;  	key.type = BTRFS_EXTENT_DATA_KEY; +	if (ref_offset > ((u64)-1 << 32)) +		key.offset = 0; +	else +		key.offset = ref_offset;  	path->search_commit_root = 1;  	path->skip_locking = 1; @@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)  	 * btrfs_init_reloc_root will use them when there  	 * is no reservation in transaction handle.  	 
*/ -	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, +	ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,  				  rc->extent_root->nodesize * 256);  	if (ret)  		return ret; -	rc->block_rsv->refill_used = 1; -	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); -  	memset(&rc->cluster, 0, sizeof(rc->cluster));  	rc->search_start = rc->block_group->key.objectid;  	rc->extents_found = 0; @@ -3777,8 +3776,7 @@ restart:  			}  		} -		ret = btrfs_block_rsv_check(trans, rc->extent_root, -					    rc->block_rsv, 0, 5); +		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);  		if (ret < 0) {  			if (ret != -EAGAIN) {  				err = ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5..ed11d3866af 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,14 @@   */  #include <linux/blkdev.h> +#include <linux/ratelimit.h>  #include "ctree.h"  #include "volumes.h"  #include "disk-io.h"  #include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h"  /*   * This is only the first step towards a full-features scrub. It reads all @@ -29,15 +33,12 @@   * any can be found.   *   * Future enhancements: - *  - To enhance the performance, better read-ahead strategies for the - *    extent-tree can be employed.   *  - In case an unrepairable extent is encountered, track which files are   *    affected and report them   *  - In case of a read error on files with nodatasum, map the file and read   *    the extent to trigger a writeback of the good copy   *  - track and record media errors, throw out bad devices   *  - add a mode to also read unallocated space - *  - make the prefetch cancellable   */  struct scrub_bio; @@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);  struct scrub_page {  	u64			flags;  /* extent flags */  	u64			generation; -	u64			mirror_num; +	int			mirror_num;  	int			have_csum;  	u8			csum[BTRFS_CSUM_SIZE];  }; @@ -87,6 +88,7 @@ struct scrub_dev {  	int			first_free;  	int			curr;  	atomic_t		in_flight; +	atomic_t		fixup_cnt;  	spinlock_t		list_lock;  	wait_queue_head_t	list_wait;  	u16			csum_size; @@ -100,6 +102,27 @@ struct scrub_dev {  	spinlock_t		stat_lock;  }; +struct scrub_fixup_nodatasum { +	struct scrub_dev	*sdev; +	u64			logical; +	struct btrfs_root	*root; +	struct btrfs_work	work; +	int			mirror_num; +}; + +struct scrub_warning { +	struct btrfs_path	*path; +	u64			extent_item_size; +	char			*scratch_buf; +	char			*msg_buf; +	const char		*errstr; +	sector_t		sector; +	u64			logical; +	struct btrfs_device	*dev; +	int			msg_bufsize; +	int			scratch_bufsize; +}; +  static void scrub_free_csums(struct scrub_dev *sdev)  {  	while (!list_empty(&sdev->csum_list)) { @@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)  		if (i != SCRUB_BIOS_PER_DEV-1)  			sdev->bios[i]->next_free = i + 1; -		 else +		else  			sdev->bios[i]->next_free = -1;  	}  	sdev->first_free = 0;  	sdev->curr = -1;  	atomic_set(&sdev->in_flight, 0); +	atomic_set(&sdev->fixup_cnt, 0);  	atomic_set(&sdev->cancel_req, 0); -	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); +	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);  	INIT_LIST_HEAD(&sdev->csum_list);  	spin_lock_init(&sdev->list_lock); @@ -195,24 +219,361 @@ nomem:  	return ERR_PTR(-ENOMEM);  } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ +	u64 isize; +	u32 nlink; +	int ret; +	int i; +	struct extent_buffer *eb; +	struct 
btrfs_inode_item *inode_item; +	struct scrub_warning *swarn = ctx; +	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; +	struct inode_fs_paths *ipath = NULL; +	struct btrfs_root *local_root; +	struct btrfs_key root_key; + +	root_key.objectid = root; +	root_key.type = BTRFS_ROOT_ITEM_KEY; +	root_key.offset = (u64)-1; +	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); +	if (IS_ERR(local_root)) { +		ret = PTR_ERR(local_root); +		goto err; +	} + +	ret = inode_item_info(inum, 0, local_root, swarn->path); +	if (ret) { +		btrfs_release_path(swarn->path); +		goto err; +	} + +	eb = swarn->path->nodes[0]; +	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], +					struct btrfs_inode_item); +	isize = btrfs_inode_size(eb, inode_item); +	nlink = btrfs_inode_nlink(eb, inode_item); +	btrfs_release_path(swarn->path); + +	ipath = init_ipath(4096, local_root, swarn->path); +	ret = paths_from_inode(inum, ipath); + +	if (ret < 0) +		goto err; + +	/* +	 * we deliberately ignore the bit ipath might have been too small to +	 * hold all of the paths here +	 */ +	for (i = 0; i < ipath->fspath->elem_cnt; ++i) +		printk(KERN_WARNING "btrfs: %s at logical %llu on dev " +			"%s, sector %llu, root %llu, inode %llu, offset %llu, " +			"length %llu, links %u (path: %s)\n", swarn->errstr, +			swarn->logical, swarn->dev->name, +			(unsigned long long)swarn->sector, root, inum, offset, +			min(isize - offset, (u64)PAGE_SIZE), nlink, +			(char *)ipath->fspath->val[i]); + +	free_ipath(ipath); +	return 0; + +err: +	printk(KERN_WARNING "btrfs: %s at logical %llu on dev " +		"%s, sector %llu, root %llu, inode %llu, offset %llu: path " +		"resolving failed with ret=%d\n", swarn->errstr, +		swarn->logical, swarn->dev->name, +		(unsigned long long)swarn->sector, root, inum, offset, ret); + +	free_ipath(ipath); +	return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, +				int ix) +{ +	struct btrfs_device *dev = sbio->sdev->dev; +	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; +	struct btrfs_path *path; +	struct btrfs_key found_key; +	struct extent_buffer *eb; +	struct btrfs_extent_item *ei; +	struct scrub_warning swarn; +	u32 item_size; +	int ret; +	u64 ref_root; +	u8 ref_level; +	unsigned long ptr = 0; +	const int bufsize = 4096; +	u64 extent_offset; + +	path = btrfs_alloc_path(); + +	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); +	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); +	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; +	swarn.logical = sbio->logical + ix * PAGE_SIZE; +	swarn.errstr = errstr; +	swarn.dev = dev; +	swarn.msg_bufsize = bufsize; +	swarn.scratch_bufsize = bufsize; + +	if (!path || !swarn.scratch_buf || !swarn.msg_buf) +		goto out; + +	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); +	if (ret < 0) +		goto out; + +	extent_offset = swarn.logical - found_key.objectid; +	swarn.extent_item_size = found_key.offset; + +	eb = path->nodes[0]; +	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +	item_size = btrfs_item_size_nr(eb, path->slots[0]); + +	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +		do { +			ret = tree_backref_for_extent(&ptr, eb, ei, item_size, +							&ref_root, &ref_level); +			printk(KERN_WARNING "%s at logical %llu on dev %s, " +				"sector %llu: metadata %s (level %d) in tree " +				"%llu\n", errstr, swarn.logical, dev->name, +				(unsigned long long)swarn.sector, +				ref_level ? "node" : "leaf", +				ret < 0 ? -1 : ref_level, +				ret < 0 ? 
-1 : ref_root); +		} while (ret != 1); +	} else { +		swarn.path = path; +		iterate_extent_inodes(fs_info, path, found_key.objectid, +					extent_offset, +					scrub_print_warning_inode, &swarn); +	} + +out: +	btrfs_free_path(path); +	kfree(swarn.scratch_buf); +	kfree(swarn.msg_buf); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ +	struct page *page = NULL; +	unsigned long index; +	struct scrub_fixup_nodatasum *fixup = ctx; +	int ret; +	int corrected = 0; +	struct btrfs_key key; +	struct inode *inode = NULL; +	u64 end = offset + PAGE_SIZE - 1; +	struct btrfs_root *local_root; + +	key.objectid = root; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; +	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); +	if (IS_ERR(local_root)) +		return PTR_ERR(local_root); + +	key.type = BTRFS_INODE_ITEM_KEY; +	key.objectid = inum; +	key.offset = 0; +	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	index = offset >> PAGE_CACHE_SHIFT; + +	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); +	if (!page) { +		ret = -ENOMEM; +		goto out; +	} + +	if (PageUptodate(page)) { +		struct btrfs_mapping_tree *map_tree; +		if (PageDirty(page)) { +			/* +			 * we need to write the data to the defect sector. the +			 * data that was in that sector is not in memory, +			 * because the page was modified. we must not write the +			 * modified page to that sector. +			 * +			 * TODO: what could be done here: wait for the delalloc +			 *       runner to write out that page (might involve +			 *       COW) and see whether the sector is still +			 *       referenced afterwards. +			 * +			 * For the meantime, we'll treat this error +			 * incorrectable, although there is a chance that a +			 * later scrub will find the bad sector again and that +			 * there's no dirty page in memory, then. +			 */ +			ret = -EIO; +			goto out; +		} +		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; +		ret = repair_io_failure(map_tree, offset, PAGE_SIZE, +					fixup->logical, page, +					fixup->mirror_num); +		unlock_page(page); +		corrected = !ret; +	} else { +		/* +		 * we need to get good data first. the general readpage path +		 * will call repair_io_failure for us, we just have to make +		 * sure we read the bad mirror. +		 */ +		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, +					EXTENT_DAMAGED, GFP_NOFS); +		if (ret) { +			/* set_extent_bits should give proper error */ +			WARN_ON(ret > 0); +			if (ret > 0) +				ret = -EFAULT; +			goto out; +		} + +		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, +						btrfs_get_extent, +						fixup->mirror_num); +		wait_on_page_locked(page); + +		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, +						end, EXTENT_DAMAGED, 0, NULL); +		if (!corrected) +			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, +						EXTENT_DAMAGED, GFP_NOFS); +	} + +out: +	if (page) +		put_page(page); +	if (inode) +		iput(inode); + +	if (ret < 0) +		return ret; + +	if (ret == 0 && corrected) { +		/* +		 * we only need to call readpage for one of the inodes belonging +		 * to this extent. 
so make iterate_extent_inodes stop +		 */ +		return 1; +	} + +	return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ +	int ret; +	struct scrub_fixup_nodatasum *fixup; +	struct scrub_dev *sdev; +	struct btrfs_trans_handle *trans = NULL; +	struct btrfs_fs_info *fs_info; +	struct btrfs_path *path; +	int uncorrectable = 0; + +	fixup = container_of(work, struct scrub_fixup_nodatasum, work); +	sdev = fixup->sdev; +	fs_info = fixup->root->fs_info; + +	path = btrfs_alloc_path(); +	if (!path) { +		spin_lock(&sdev->stat_lock); +		++sdev->stat.malloc_errors; +		spin_unlock(&sdev->stat_lock); +		uncorrectable = 1; +		goto out; +	} + +	trans = btrfs_join_transaction(fixup->root); +	if (IS_ERR(trans)) { +		uncorrectable = 1; +		goto out; +	} + +	/* +	 * the idea is to trigger a regular read through the standard path. we +	 * read a page from the (failed) logical address by specifying the +	 * corresponding copynum of the failed sector. thus, that readpage is +	 * expected to fail. +	 * that is the point where on-the-fly error correction will kick in +	 * (once it's finished) and rewrite the failed sector if a good copy +	 * can be found. +	 */ +	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, +						path, scrub_fixup_readpage, +						fixup); +	if (ret < 0) { +		uncorrectable = 1; +		goto out; +	} +	WARN_ON(ret != 1); + +	spin_lock(&sdev->stat_lock); +	++sdev->stat.corrected_errors; +	spin_unlock(&sdev->stat_lock); + +out: +	if (trans && !IS_ERR(trans)) +		btrfs_end_transaction(trans, fixup->root); +	if (uncorrectable) { +		spin_lock(&sdev->stat_lock); +		++sdev->stat.uncorrectable_errors; +		spin_unlock(&sdev->stat_lock); +		printk_ratelimited(KERN_ERR "btrfs: unable to fixup " +					"(nodatasum) error at logical %llu\n", +					fixup->logical); +	} + +	btrfs_free_path(path); +	kfree(fixup); + +	/* see caller why we're pretending to be paused in the scrub counters */ +	mutex_lock(&fs_info->scrub_lock); +	atomic_dec(&fs_info->scrubs_running); +	atomic_dec(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	atomic_dec(&sdev->fixup_cnt); +	wake_up(&fs_info->scrub_pause_wait); +	wake_up(&sdev->list_wait); +} +  /*   * scrub_recheck_error gets called when either verification of the page   * failed or the bio failed to read, e.g. with EIO. 
In the latter case,   * recheck_error gets called for every page in the bio, even though only   * one may be bad   */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix)  { +	struct scrub_dev *sdev = sbio->sdev; +	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; +	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, +					DEFAULT_RATELIMIT_BURST); +  	if (sbio->err) { -		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, -				   (sbio->physical + ix * PAGE_SIZE) >> 9, +		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,  				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {  			if (scrub_fixup_check(sbio, ix) == 0) -				return; +				return 0;  		} +		if (__ratelimit(&_rs)) +			scrub_print_warning("i/o error", sbio, ix); +	} else { +		if (__ratelimit(&_rs)) +			scrub_print_warning("checksum error", sbio, ix);  	} +	spin_lock(&sdev->stat_lock); +	++sdev->stat.read_errors; +	spin_unlock(&sdev->stat_lock); +  	scrub_fixup(sbio, ix); +	return 1;  }  static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)  	struct scrub_dev *sdev = sbio->sdev;  	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;  	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; -	struct btrfs_multi_bio *multi = NULL; +	struct btrfs_bio *bbio = NULL; +	struct scrub_fixup_nodatasum *fixup;  	u64 logical = sbio->logical + ix * PAGE_SIZE;  	u64 length;  	int i; @@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)  	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&  	    (sbio->spag[ix].have_csum == 0)) { +		fixup = kzalloc(sizeof(*fixup), GFP_NOFS); +		if (!fixup) +			goto uncorrectable; +		fixup->sdev = sdev; +		fixup->logical = logical; +		fixup->root = fs_info->extent_root; +		fixup->mirror_num = sbio->spag[ix].mirror_num;  		/* -		 * nodatasum, don't try to fix anything -		 * FIXME: we can do better, open the inode and trigger a -		 * writeback +		 * increment scrubs_running to prevent cancel requests from +		 * completing as long as a fixup worker is running. we must also +		 * increment scrubs_paused to prevent deadlocking on pause +		 * requests used for transactions commits (as the worker uses a +		 * transaction context). it is safe to regard the fixup worker +		 * as paused for all matters practical. effectively, we only +		 * avoid cancellation requests from completing.  		 
*/ -		goto uncorrectable; +		mutex_lock(&fs_info->scrub_lock); +		atomic_inc(&fs_info->scrubs_running); +		atomic_inc(&fs_info->scrubs_paused); +		mutex_unlock(&fs_info->scrub_lock); +		atomic_inc(&sdev->fixup_cnt); +		fixup->work.func = scrub_fixup_nodatasum; +		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); +		return;  	}  	length = PAGE_SIZE;  	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, -			      &multi, 0); -	if (ret || !multi || length < PAGE_SIZE) { +			      &bbio, 0); +	if (ret || !bbio || length < PAGE_SIZE) {  		printk(KERN_ERR  		       "scrub_fixup: btrfs_map_block failed us for %llu\n",  		       (unsigned long long)logical);  		WARN_ON(1); +		kfree(bbio);  		return;  	} -	if (multi->num_stripes == 1) +	if (bbio->num_stripes == 1)  		/* there aren't any replicas */  		goto uncorrectable;  	/*  	 * first find a good copy  	 */ -	for (i = 0; i < multi->num_stripes; ++i) { -		if (i == sbio->spag[ix].mirror_num) +	for (i = 0; i < bbio->num_stripes; ++i) { +		if (i + 1 == sbio->spag[ix].mirror_num)  			continue; -		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, -				   multi->stripes[i].physical >> 9, +		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, +				   bbio->stripes[i].physical >> 9,  				   sbio->bio->bi_io_vec[ix].bv_page)) {  			/* I/O-error, this is not a good copy */  			continue; @@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)  		if (scrub_fixup_check(sbio, ix) == 0)  			break;  	} -	if (i == multi->num_stripes) +	if (i == bbio->num_stripes)  		goto uncorrectable;  	if (!sdev->readonly) { @@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)  		}  	} -	kfree(multi); +	kfree(bbio);  	spin_lock(&sdev->stat_lock);  	++sdev->stat.corrected_errors;  	spin_unlock(&sdev->stat_lock); -	if (printk_ratelimit()) -		printk(KERN_ERR "btrfs: fixed up at %llu\n", -		       (unsigned long long)logical); +	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", +			       (unsigned long long)logical);  	return;  uncorrectable: -	kfree(multi); +	kfree(bbio);  	spin_lock(&sdev->stat_lock);  	++sdev->stat.uncorrectable_errors;  	spin_unlock(&sdev->stat_lock); -	if (printk_ratelimit()) -		printk(KERN_ERR "btrfs: unable to fixup at %llu\n", -			 (unsigned long long)logical); +	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " +				"logical %llu\n", (unsigned long long)logical);  }  static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, @@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)  	int ret;  	if (sbio->err) { +		ret = 0;  		for (i = 0; i < sbio->count; ++i) -			scrub_recheck_error(sbio, i); +			ret |= scrub_recheck_error(sbio, i); +		if (!ret) { +			spin_lock(&sdev->stat_lock); +			++sdev->stat.unverified_errors; +			spin_unlock(&sdev->stat_lock); +		}  		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);  		sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)  			bi->bv_offset = 0;  			bi->bv_len = PAGE_SIZE;  		} - -		spin_lock(&sdev->stat_lock); -		++sdev->stat.read_errors; -		spin_unlock(&sdev->stat_lock);  		goto out;  	}  	for (i = 0; i < sbio->count; ++i) { @@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)  			WARN_ON(1);  		}  		kunmap_atomic(buffer, KM_USER0); -		if (ret) -			scrub_recheck_error(sbio, i); +		if (ret) { +			ret = scrub_recheck_error(sbio, i); +			if (!ret) { +				spin_lock(&sdev->stat_lock); +				
++sdev->stat.unverified_errors; +				spin_unlock(&sdev->stat_lock); +			} +		}  	}  out: @@ -604,7 +991,7 @@ nomem:  }  static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, -		      u64 physical, u64 flags, u64 gen, u64 mirror_num, +		      u64 physical, u64 flags, u64 gen, int mirror_num,  		      u8 *csum, int force)  {  	struct scrub_bio *sbio; @@ -701,7 +1088,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  /* scrub extent tries to collect up to 64 kB for each bio */  static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, -			u64 physical, u64 flags, u64 gen, u64 mirror_num) +			u64 physical, u64 flags, u64 gen, int mirror_num)  {  	int ret;  	u8 csum[BTRFS_CSUM_SIZE]; @@ -741,13 +1128,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	int slot;  	int i;  	u64 nstripes; -	int start_stripe;  	struct extent_buffer *l;  	struct btrfs_key key;  	u64 physical;  	u64 logical;  	u64 generation; -	u64 mirror_num; +	int mirror_num; +	struct reada_control *reada1; +	struct reada_control *reada2; +	struct btrfs_key key_start; +	struct btrfs_key key_end;  	u64 increment = map->stripe_len;  	u64 offset; @@ -758,102 +1148,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  		offset = map->stripe_len * num;  		increment = map->stripe_len * map->num_stripes; -		mirror_num = 0; +		mirror_num = 1;  	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {  		int factor = map->num_stripes / map->sub_stripes;  		offset = map->stripe_len * (num / map->sub_stripes);  		increment = map->stripe_len * factor; -		mirror_num = num % map->sub_stripes; +		mirror_num = num % map->sub_stripes + 1;  	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {  		increment = map->stripe_len; -		mirror_num = num % map->num_stripes; +		mirror_num = num % map->num_stripes + 1;  	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {  		increment = map->stripe_len; -		mirror_num = num % map->num_stripes; +		mirror_num = num % map->num_stripes + 1;  	} else {  		increment = map->stripe_len; -		mirror_num = 0; +		mirror_num = 1;  	}  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	path->reada = 2;  	path->search_commit_root = 1;  	path->skip_locking = 1;  	/* -	 * find all extents for each stripe and just read them to get -	 * them into the page cache -	 * FIXME: we can do better. build a more intelligent prefetching +	 * trigger the readahead for extent tree csum tree and wait for +	 * completion. 
During readahead, the scrub is officially paused +	 * to not hold off transaction commits  	 */  	logical = base + offset; -	physical = map->stripes[num].physical; -	ret = 0; -	for (i = 0; i < nstripes; ++i) { -		key.objectid = logical; -		key.type = BTRFS_EXTENT_ITEM_KEY; -		key.offset = (u64)0; -		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -		if (ret < 0) -			goto out_noplug; +	wait_event(sdev->list_wait, +		   atomic_read(&sdev->in_flight) == 0); +	atomic_inc(&fs_info->scrubs_paused); +	wake_up(&fs_info->scrub_pause_wait); -		/* -		 * we might miss half an extent here, but that doesn't matter, -		 * as it's only the prefetch -		 */ -		while (1) { -			l = path->nodes[0]; -			slot = path->slots[0]; -			if (slot >= btrfs_header_nritems(l)) { -				ret = btrfs_next_leaf(root, path); -				if (ret == 0) -					continue; -				if (ret < 0) -					goto out_noplug; +	/* FIXME it might be better to start readahead at commit root */ +	key_start.objectid = logical; +	key_start.type = BTRFS_EXTENT_ITEM_KEY; +	key_start.offset = (u64)0; +	key_end.objectid = base + offset + nstripes * increment; +	key_end.type = BTRFS_EXTENT_ITEM_KEY; +	key_end.offset = (u64)0; +	reada1 = btrfs_reada_add(root, &key_start, &key_end); -				break; -			} -			btrfs_item_key_to_cpu(l, &key, slot); +	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; +	key_start.type = BTRFS_EXTENT_CSUM_KEY; +	key_start.offset = logical; +	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; +	key_end.type = BTRFS_EXTENT_CSUM_KEY; +	key_end.offset = base + offset + nstripes * increment; +	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); -			if (key.objectid >= logical + map->stripe_len) -				break; +	if (!IS_ERR(reada1)) +		btrfs_reada_wait(reada1); +	if (!IS_ERR(reada2)) +		btrfs_reada_wait(reada2); -			path->slots[0]++; -		} -		btrfs_release_path(path); -		logical += increment; -		physical += map->stripe_len; -		cond_resched(); +	mutex_lock(&fs_info->scrub_lock); +	while (atomic_read(&fs_info->scrub_pause_req)) { +		mutex_unlock(&fs_info->scrub_lock); +		wait_event(fs_info->scrub_pause_wait, +		   atomic_read(&fs_info->scrub_pause_req) == 0); +		mutex_lock(&fs_info->scrub_lock);  	} +	atomic_dec(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	wake_up(&fs_info->scrub_pause_wait);  	/*  	 * collect all data csums for the stripe to avoid seeking during  	 * the scrub. This might currently (crc32) end up to be about 1MB  	 */ -	start_stripe = 0;  	blk_start_plug(&plug); -again: -	logical = base + offset + start_stripe * increment; -	for (i = start_stripe; i < nstripes; ++i) { -		ret = btrfs_lookup_csums_range(csum_root, logical, -					       logical + map->stripe_len - 1, -					       &sdev->csum_list, 1); -		if (ret) -			goto out; -		logical += increment; -		cond_resched(); -	}  	/*  	 * now find all extents for each stripe and scrub them  	 */ -	logical = base + offset + start_stripe * increment; -	physical = map->stripes[num].physical + start_stripe * map->stripe_len; +	logical = base + offset; +	physical = map->stripes[num].physical;  	ret = 0; -	for (i = start_stripe; i < nstripes; ++i) { +	for (i = 0; i < nstripes; ++i) {  		/*  		 * canceled?  		 
*/ @@ -882,11 +1258,14 @@ again:  			atomic_dec(&fs_info->scrubs_paused);  			mutex_unlock(&fs_info->scrub_lock);  			wake_up(&fs_info->scrub_pause_wait); -			scrub_free_csums(sdev); -			start_stripe = i; -			goto again;  		} +		ret = btrfs_lookup_csums_range(csum_root, logical, +					       logical + map->stripe_len - 1, +					       &sdev->csum_list, 1); +		if (ret) +			goto out; +  		key.objectid = logical;  		key.type = BTRFS_EXTENT_ITEM_KEY;  		key.offset = (u64)0; @@ -982,7 +1361,6 @@ next:  out:  	blk_finish_plug(&plug); -out_noplug:  	btrfs_free_path(path);  	return ret < 0 ? ret : 0;  } @@ -1253,10 +1631,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,  		ret = scrub_enumerate_chunks(sdev, start, end);  	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); -  	atomic_dec(&fs_info->scrubs_running);  	wake_up(&fs_info->scrub_pause_wait); +	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); +  	if (progress)  		memcpy(progress, &sdev->stat, sizeof(*progress)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d..57080dffdfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@  #include <linux/magic.h>  #include <linux/slab.h>  #include <linux/cleancache.h> +#include <linux/mnt_namespace.h>  #include "compat.h"  #include "delayed-inode.h"  #include "ctree.h" @@ -58,6 +59,7 @@  #include <trace/events/btrfs.h>  static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type;  static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,  				      char nbuf[16]) @@ -162,7 +164,7 @@ enum {  	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,  	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,  	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, -	Opt_inode_cache, Opt_err, +	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,  };  static match_table_t tokens = { @@ -195,6 +197,8 @@ static match_table_t tokens = {  	{Opt_subvolrootid, "subvolrootid=%d"},  	{Opt_defrag, "autodefrag"},  	{Opt_inode_cache, "inode_cache"}, +	{Opt_no_space_cache, "no_space_cache"}, +	{Opt_recovery, "recovery"},  	{Opt_err, NULL},  }; @@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  {  	struct btrfs_fs_info *info = root->fs_info;  	substring_t args[MAX_OPT_ARGS]; -	char *p, *num, *orig; +	char *p, *num, *orig = NULL; +	u64 cache_gen;  	int intarg;  	int ret = 0;  	char *compress_type;  	bool compress_force = false; +	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); +	if (cache_gen) +		btrfs_set_opt(info->mount_opt, SPACE_CACHE); +  	if (!options) -		return 0; +		goto out;  	/*  	 * strsep changes the string, duplicate it because parse_options @@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			btrfs_set_opt(info->mount_opt, DISCARD);  			break;  		case Opt_space_cache: -			printk(KERN_INFO "btrfs: enabling disk space caching\n");  			btrfs_set_opt(info->mount_opt, SPACE_CACHE);  			break; +		case Opt_no_space_cache: +			printk(KERN_INFO "btrfs: disabling disk space caching\n"); +			btrfs_clear_opt(info->mount_opt, SPACE_CACHE); +			break;  		case Opt_inode_cache:  			printk(KERN_INFO "btrfs: enabling inode map caching\n");  			btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			printk(KERN_INFO "btrfs: enabling auto defrag");  			btrfs_set_opt(info->mount_opt, 
AUTO_DEFRAG);  			break; +		case Opt_recovery: +			printk(KERN_INFO "btrfs: enabling auto recovery"); +			btrfs_set_opt(info->mount_opt, RECOVERY); +			break;  		case Opt_err:  			printk(KERN_INFO "btrfs: unrecognized mount option "  			       "'%s'\n", p); @@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  		}  	}  out: +	if (!ret && btrfs_test_opt(root, SPACE_CACHE)) +		printk(KERN_INFO "btrfs: disk space caching is enabled\n");  	kfree(orig);  	return ret;  } @@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,  		u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)  {  	substring_t args[MAX_OPT_ARGS]; -	char *opts, *orig, *p; +	char *device_name, *opts, *orig, *p;  	int error = 0;  	int intarg;  	if (!options) -		goto out; +		return 0;  	/*  	 * strsep changes the string, duplicate it because parse_options @@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,  			}  			break;  		case Opt_device: -			error = btrfs_scan_one_device(match_strdup(&args[0]), +			device_name = match_strdup(&args[0]); +			if (!device_name) { +				error = -ENOMEM; +				goto out; +			} +			error = btrfs_scan_one_device(device_name,  					flags, holder, fs_devices); +			kfree(device_name);  			if (error) -				goto out_free_opts; +				goto out;  			break;  		default:  			break;  		}  	} - out_free_opts: +out:  	kfree(orig); - out: -	/* -	 * If no subvolume name is specified we use the default one.  Allocate -	 * a copy of the string "." here so that code later in the -	 * mount path doesn't care if it's the default volume or another one. -	 */ -	if (!*subvol_name) { -		*subvol_name = kstrdup(".", GFP_KERNEL); -		if (!*subvol_name) -			return -ENOMEM; -	}  	return error;  } @@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,  	struct btrfs_path *path;  	struct btrfs_key location;  	struct inode *inode; -	struct dentry *dentry;  	u64 dir_id;  	int new = 0; @@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,  	 * will mount by default if we haven't been given a specific subvolume  	 * to mount.  	 */ -	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); +	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);  	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);  	if (IS_ERR(di)) {  		btrfs_free_path(path); @@ -566,29 +578,7 @@ setup_root:  		return dget(sb->s_root);  	} -	if (new) { -		const struct qstr name = { .name = "/", .len = 1 }; - -		/* -		 * New inode, we need to make the dentry a sibling of s_root so -		 * everything gets cleaned up properly on unmount. -		 */ -		dentry = d_alloc(sb->s_root, &name); -		if (!dentry) { -			iput(inode); -			return ERR_PTR(-ENOMEM); -		} -		d_splice_alias(inode, dentry); -	} else { -		/* -		 * We found the inode in cache, just find a dentry for it and -		 * put the reference to the inode we just got. 
-		 */ -		dentry = d_find_alias(inode); -		iput(inode); -	} - -	return dentry; +	return d_obtain_alias(inode);  }  static int btrfs_fill_super(struct super_block *sb, @@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)  		seq_puts(seq, ",noacl");  	if (btrfs_test_opt(root, SPACE_CACHE))  		seq_puts(seq, ",space_cache"); +	else +		seq_puts(seq, ",no_space_cache");  	if (btrfs_test_opt(root, CLEAR_CACHE))  		seq_puts(seq, ",clear_cache");  	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)  	return set_anon_super(s, data);  } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ +	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +		return 1; +	return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ +	unsigned copied = 0; +	unsigned len = strlen(args) + 2; +	char *pos; +	char *ret; + +	/* +	 * We need the same args as before, but minus +	 * +	 * subvol=a +	 * +	 * and add +	 * +	 * subvolid=0 +	 * +	 * which is a difference of 2 characters, so we allocate strlen(args) + +	 * 2 characters. +	 */ +	ret = kzalloc(len * sizeof(char), GFP_NOFS); +	if (!ret) +		return NULL; +	pos = strstr(args, "subvol="); + +	/* This shouldn't happen, but just in case.. */ +	if (!pos) { +		kfree(ret); +		return NULL; +	} + +	/* +	 * The subvol=<> arg is not at the front of the string, copy everybody +	 * up to that into ret. +	 */ +	if (pos != args) { +		*pos = '\0'; +		strcpy(ret, args); +		copied += strlen(args); +		pos++; +	} + +	strncpy(ret + copied, "subvolid=0", len - copied); + +	/* Length of subvolid=0 */ +	copied += 10; + +	/* +	 * If there is no , after the subvol= option then we know there's no +	 * other options and we can just return. +	 */ +	pos = strchr(pos, ','); +	if (!pos) +		return ret; + +	/* Copy the rest of the arguments into our buffer */ +	strncpy(ret + copied, pos, len - copied); +	copied += strlen(pos); + +	return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, +				   const char *device_name, char *data) +{ +	struct super_block *s; +	struct dentry *root; +	struct vfsmount *mnt; +	struct mnt_namespace *ns_private; +	char *newargs; +	struct path path; +	int error; + +	newargs = setup_root_args(data); +	if (!newargs) +		return ERR_PTR(-ENOMEM); +	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, +			     newargs); +	kfree(newargs); +	if (IS_ERR(mnt)) +		return ERR_CAST(mnt); + +	ns_private = create_mnt_ns(mnt); +	if (IS_ERR(ns_private)) { +		mntput(mnt); +		return ERR_CAST(ns_private); +	} + +	/* +	 * This will trigger the automount of the subvol so we can just +	 * drop the mnt we have here and return the dentry that we +	 * found. 
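+	 * The dentry we get back is verified below: only an inode with
+	 * objectid BTRFS_FIRST_FREE_OBJECTID, i.e. a subvolume root, is
+	 * accepted; anything else makes the mount fail with -EINVAL.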
+	 */ +	error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, +				LOOKUP_FOLLOW, &path); +	put_mnt_ns(ns_private); +	if (error) +		return ERR_PTR(error); + +	if (!is_subvolume_inode(path.dentry->d_inode)) { +		path_put(&path); +		mntput(mnt); +		error = -EINVAL; +		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", +				subvol_name); +		return ERR_PTR(-EINVAL); +	} + +	/* Get a ref to the sb and the dentry we found and return it */ +	s = path.mnt->mnt_sb; +	atomic_inc(&s->s_active); +	root = dget(path.dentry); +	path_put(&path); +	down_write(&s->s_umount); + +	return root; +}  /*   * Find a superblock for the given device / mount point. @@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	if (error)  		return ERR_PTR(error); +	if (subvol_name) { +		root = mount_subvol(subvol_name, flags, device_name, data); +		kfree(subvol_name); +		return root; +	} +  	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);  	if (error) -		goto error_free_subvol_name; +		return ERR_PTR(error);  	error = btrfs_open_devices(fs_devices, mode, fs_type);  	if (error) -		goto error_free_subvol_name; +		return ERR_PTR(error);  	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {  		error = -EACCES; @@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,  	fs_info->fs_devices = fs_devices;  	tree_root->fs_info = fs_info; +	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); +	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); +	if (!fs_info->super_copy || !fs_info->super_for_commit) { +		error = -ENOMEM; +		goto error_close_devices; +	} +  	bdev = fs_devices->latest_bdev;  	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); -	if (IS_ERR(s)) -		goto error_s; +	if (IS_ERR(s)) { +		error = PTR_ERR(s); +		goto error_close_devices; +	}  	if (s->s_root) {  		if ((flags ^ s->s_flags) & MS_RDONLY) {  			deactivate_locked_super(s); -			error = -EBUSY; -			goto error_close_devices; +			return ERR_PTR(-EBUSY);  		}  		btrfs_close_devices(fs_devices); -		kfree(fs_info); +		free_fs_info(fs_info);  		kfree(tree_root);  	} else {  		char b[BDEVNAME_SIZE];  		s->s_flags = flags | MS_NOSEC;  		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); +		btrfs_sb(s)->fs_info->bdev_holder = fs_type;  		error = btrfs_fill_super(s, fs_devices, data,  					 flags & MS_SILENT ? 
1 : 0);  		if (error) {  			deactivate_locked_super(s); -			goto error_free_subvol_name; +			return ERR_PTR(error);  		} -		btrfs_sb(s)->fs_info->bdev_holder = fs_type;  		s->s_flags |= MS_ACTIVE;  	} -	/* if they gave us a subvolume name bind mount into that */ -	if (strcmp(subvol_name, ".")) { -		struct dentry *new_root; - -		root = get_default_root(s, subvol_rootid); -		if (IS_ERR(root)) { -			error = PTR_ERR(root); -			deactivate_locked_super(s); -			goto error_free_subvol_name; -		} - -		mutex_lock(&root->d_inode->i_mutex); -		new_root = lookup_one_len(subvol_name, root, -				      strlen(subvol_name)); -		mutex_unlock(&root->d_inode->i_mutex); - -		if (IS_ERR(new_root)) { -			dput(root); -			deactivate_locked_super(s); -			error = PTR_ERR(new_root); -			goto error_free_subvol_name; -		} -		if (!new_root->d_inode) { -			dput(root); -			dput(new_root); -			deactivate_locked_super(s); -			error = -ENXIO; -			goto error_free_subvol_name; -		} -		dput(root); -		root = new_root; -	} else { -		root = get_default_root(s, subvol_objectid); -		if (IS_ERR(root)) { -			error = PTR_ERR(root); -			deactivate_locked_super(s); -			goto error_free_subvol_name; -		} +	root = get_default_root(s, subvol_objectid); +	if (IS_ERR(root)) { +		deactivate_locked_super(s); +		return root;  	} -	kfree(subvol_name);  	return root; -error_s: -	error = PTR_ERR(s);  error_close_devices:  	btrfs_close_devices(fs_devices); -	kfree(fs_info); +	free_fs_info(fs_info);  	kfree(tree_root); -error_free_subvol_name: -	kfree(subvol_name);  	return ERR_PTR(error);  } @@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  		if (root->fs_info->fs_devices->rw_devices == 0)  			return -EACCES; -		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) +		if (btrfs_super_log_root(root->fs_info->super_copy) != 0)  			return -EINVAL;  		ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)  static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)  {  	struct btrfs_root *root = btrfs_sb(dentry->d_sb); -	struct btrfs_super_block *disk_super = &root->fs_info->super_copy; +	struct btrfs_super_block *disk_super = root->fs_info->super_copy;  	struct list_head *head = &root->fs_info->space_info;  	struct btrfs_space_info *found;  	u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b7964a15..960835eaf4d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)  	struct btrfs_transaction *cur_trans;  	spin_lock(&root->fs_info->trans_lock); +loop:  	if (root->fs_info->trans_no_join) {  		if (!nofail) {  			spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)  	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);  	if (!cur_trans)  		return -ENOMEM; +  	spin_lock(&root->fs_info->trans_lock);  	if (root->fs_info->running_transaction) { +		/* +		 * someone started a transaction after we unlocked.  
Make sure +		 * to redo the trans_no_join checks above +		 */  		kmem_cache_free(btrfs_transaction_cachep, cur_trans);  		cur_trans = root->fs_info->running_transaction; -		atomic_inc(&cur_trans->use_count); -		atomic_inc(&cur_trans->num_writers); -		cur_trans->num_joined++; -		spin_unlock(&root->fs_info->trans_lock); -		return 0; +		goto loop;  	} +  	atomic_set(&cur_trans->num_writers, 1);  	cur_trans->num_joined = 0;  	init_waitqueue_head(&cur_trans->writer_wait); @@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  	 */  	if (num_items > 0 && root != root->fs_info->chunk_root) {  		num_bytes = btrfs_calc_trans_metadata_size(root, num_items); -		ret = btrfs_block_rsv_add(NULL, root, +		ret = btrfs_block_rsv_add(root,  					  &root->fs_info->trans_block_rsv,  					  num_bytes);  		if (ret) @@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,  				  struct btrfs_root *root)  {  	int ret; -	ret = btrfs_block_rsv_check(trans, root, -				    &root->fs_info->global_block_rsv, 0, 5); + +	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);  	return ret ? 1 : 0;  } @@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root)  {  	struct btrfs_transaction *cur_trans = trans->transaction; +	struct btrfs_block_rsv *rsv = trans->block_rsv;  	int updates;  	smp_mb();  	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)  		return 1; +	/* +	 * We need to do this in case we're deleting csums so the global block +	 * rsv get's used instead of the csum block rsv. +	 */ +	trans->block_rsv = NULL; +  	updates = trans->delayed_ref_updates;  	trans->delayed_ref_updates = 0;  	if (updates)  		btrfs_run_delayed_refs(trans, root, updates); +	trans->block_rsv = rsv; +  	return should_end_transaction(trans, root);  } @@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		return 0;  	} +	btrfs_trans_release_metadata(trans, root); +	trans->block_rsv = NULL;  	while (count < 4) {  		unsigned long cur = trans->delayed_ref_updates;  		trans->delayed_ref_updates = 0; @@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		count++;  	} -	btrfs_trans_release_metadata(trans, root); -  	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&  	    should_end_transaction(trans, root)) {  		trans->transaction->blocked = 1; @@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,  int btrfs_write_marked_extents(struct btrfs_root *root,  			       struct extent_io_tree *dirty_pages, int mark)  { -	int ret;  	int err = 0;  	int werr = 0; -	struct page *page; -	struct inode *btree_inode = root->fs_info->btree_inode; +	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;  	u64 start = 0;  	u64 end; -	unsigned long index; - -	while (1) { -		ret = find_first_extent_bit(dirty_pages, start, &start, &end, -					    mark); -		if (ret) -			break; -		while (start <= end) { -			cond_resched(); -			index = start >> PAGE_CACHE_SHIFT; -			start = (u64)(index + 1) << PAGE_CACHE_SHIFT; -			page = find_get_page(btree_inode->i_mapping, index); -			if (!page) -				continue; - -			btree_lock_page_hook(page); -			if (!page->mapping) { -				unlock_page(page); -				page_cache_release(page); -				continue; -			} - -			if (PageWriteback(page)) { -				if (PageDirty(page)) -					wait_on_page_writeback(page); -				else { -					unlock_page(page); -					
page_cache_release(page); -					continue; -				} -			} -			err = write_one_page(page, 0); -			if (err) -				werr = err; -			page_cache_release(page); -		} +	while (!find_first_extent_bit(dirty_pages, start, &start, &end, +				      mark)) { +		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, +				   GFP_NOFS); +		err = filemap_fdatawrite_range(mapping, start, end); +		if (err) +			werr = err; +		cond_resched(); +		start = end + 1;  	}  	if (err)  		werr = err; @@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  int btrfs_wait_marked_extents(struct btrfs_root *root,  			      struct extent_io_tree *dirty_pages, int mark)  { -	int ret;  	int err = 0;  	int werr = 0; -	struct page *page; -	struct inode *btree_inode = root->fs_info->btree_inode; +	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;  	u64 start = 0;  	u64 end; -	unsigned long index; - -	while (1) { -		ret = find_first_extent_bit(dirty_pages, start, &start, &end, -					    mark); -		if (ret) -			break; -		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); -		while (start <= end) { -			index = start >> PAGE_CACHE_SHIFT; -			start = (u64)(index + 1) << PAGE_CACHE_SHIFT; -			page = find_get_page(btree_inode->i_mapping, index); -			if (!page) -				continue; -			if (PageDirty(page)) { -				btree_lock_page_hook(page); -				wait_on_page_writeback(page); -				err = write_one_page(page, 0); -				if (err) -					werr = err; -			} -			wait_on_page_writeback(page); -			page_cache_release(page); -			cond_resched(); -		} +	while (!find_first_extent_bit(dirty_pages, start, &start, &end, +				      EXTENT_NEED_WAIT)) { +		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); +		err = filemap_fdatawait_range(mapping, start, end); +		if (err) +			werr = err; +		cond_resched(); +		start = end + 1;  	}  	if (err)  		werr = err; @@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,  	ret = btrfs_write_marked_extents(root, dirty_pages, mark);  	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); -	return ret || ret2; + +	if (ret) +		return ret; +	if (ret2) +		return ret2; +	return 0;  }  int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, @@ -911,10 +880,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	}  	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); -	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);  	if (to_reserve > 0) { -		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, +		ret = btrfs_block_rsv_add(root, &pending->block_rsv,  					  to_reserve);  		if (ret) {  			pending->error = ret; @@ -1002,7 +970,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	BUG_ON(IS_ERR(pending->snap));  	btrfs_reloc_post_snapshot(trans, pending); -	btrfs_orphan_post_snapshot(trans, pending);  fail:  	kfree(new_root_item);  	trans->block_rsv = rsv; @@ -1032,7 +999,7 @@ static void update_super_roots(struct btrfs_root *root)  	struct btrfs_root_item *root_item;  	struct btrfs_super_block *super; -	super = &root->fs_info->super_copy; +	super = root->fs_info->super_copy;  	root_item = &root->fs_info->chunk_root->root_item;  	super->chunk_root = root_item->bytenr; @@ -1043,7 +1010,7 @@ static void update_super_roots(struct btrfs_root *root)  	super->root = root_item->bytenr;  	super->generation = root_item->generation;  	super->root_level = root_item->level; -	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 
+	if (btrfs_test_opt(root, SPACE_CACHE))  		super->cache_generation = root_item->generation;  } @@ -1168,14 +1135,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	btrfs_run_ordered_operations(root, 0); +	btrfs_trans_release_metadata(trans, root); +	trans->block_rsv = NULL; +  	/* make a pass through all the delayed refs we have so far  	 * any runnings procs may add more while we are here  	 */  	ret = btrfs_run_delayed_refs(trans, root, 0);  	BUG_ON(ret); -	btrfs_trans_release_metadata(trans, root); -  	cur_trans = trans->transaction;  	/*  	 * set the flushing flag so procs in this transaction have to @@ -1341,12 +1309,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	update_super_roots(root);  	if (!root->fs_info->log_root_recovering) { -		btrfs_set_super_log_root(&root->fs_info->super_copy, 0); -		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); +		btrfs_set_super_log_root(root->fs_info->super_copy, 0); +		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);  	} -	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, -	       sizeof(root->fs_info->super_copy)); +	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, +	       sizeof(*root->fs_info->super_copy));  	trans->transaction->blocked = 0;  	spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0618aa39740..3568374d419 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,  			      struct walk_control *wc, u64 gen)  {  	if (wc->pin) -		btrfs_pin_extent(log->fs_info->extent_root, -				 eb->start, eb->len, 0); +		btrfs_pin_extent_for_log_replay(wc->trans, +						log->fs_info->extent_root, +						eb->start, eb->len);  	if (btrfs_buffer_uptodate(eb, gen)) {  		if (wc->write) @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  				WARN_ON(root_owner !=  					BTRFS_TREE_LOG_OBJECTID); -				ret = btrfs_free_reserved_extent(root, +				ret = btrfs_free_and_pin_reserved_extent(root,  							 bytenr, blocksize);  				BUG_ON(ret);  			} @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,  				btrfs_tree_unlock(next);  				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); -				ret = btrfs_free_reserved_extent(root, +				ret = btrfs_free_and_pin_reserved_extent(root,  						path->nodes[*level]->start,  						path->nodes[*level]->len);  				BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,  			WARN_ON(log->root_key.objectid !=  				BTRFS_TREE_LOG_OBJECTID); -			ret = btrfs_free_reserved_extent(log, next->start, +			ret = btrfs_free_and_pin_reserved_extent(log, next->start,  							 next->len);  			BUG_ON(ret);  		} @@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	/* wait for previous tree log sync to complete */  	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))  		wait_log_commit(trans, root, root->log_transid - 1); -  	while (1) {  		unsigned long batch = root->log_batch; -		if (root->log_multiple_pids) { +		/* when we're on an ssd, just kick the log commit out */ +		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {  			mutex_unlock(&root->log_mutex);  			schedule_timeout_uninterruptible(1);  			mutex_lock(&root->log_mutex); @@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	BUG_ON(ret);  	btrfs_wait_marked_extents(log, 
&log->dirty_log_pages, mark); -	btrfs_set_super_log_root(&root->fs_info->super_for_commit, +	btrfs_set_super_log_root(root->fs_info->super_for_commit,  				log_root_tree->node->start); -	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, +	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,  				btrfs_header_level(log_root_tree->node));  	log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da6..f8e2943101a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,  		}  		INIT_LIST_HEAD(&device->dev_alloc_list); +		/* init readahead state */ +		spin_lock_init(&device->reada_lock); +		device->reada_curr_zone = NULL; +		atomic_set(&device->reada_in_flight, 0); +		device->reada_next = 0; +		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); +		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); +  		mutex_lock(&fs_devices->device_list_mutex);  		list_add_rcu(&device->dev_list, &fs_devices->devices);  		mutex_unlock(&fs_devices->device_list_mutex); @@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  		set_blocksize(bdev, 4096);  		bh = btrfs_read_dev_super(bdev); -		if (!bh) { -			ret = -EINVAL; +		if (!bh)  			goto error_close; -		}  		disk_super = (struct btrfs_super_block *)bh->b_data;  		devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +661,7 @@ error:  		continue;  	}  	if (fs_devices->open_devices == 0) { -		ret = -EIO; +		ret = -EINVAL;  		goto out;  	}  	fs_devices->seeding = seeding; @@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,  	}  	BUG_ON(ret); -	if (device->bytes_used > 0) -		device->bytes_used -= btrfs_dev_extent_length(leaf, extent); +	if (device->bytes_used > 0) { +		u64 len = btrfs_dev_extent_length(leaf, extent); +		device->bytes_used -= len; +		spin_lock(&root->fs_info->free_chunk_lock); +		root->fs_info->free_chunk_space += len; +		spin_unlock(&root->fs_info->free_chunk_lock); +	}  	ret = btrfs_del_item(trans, root, path);  out: @@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	if (ret)  		goto error_undo; +	spin_lock(&root->fs_info->free_chunk_lock); +	root->fs_info->free_chunk_space = device->total_bytes - +		device->bytes_used; +	spin_unlock(&root->fs_info->free_chunk_lock); +  	device->in_fs_metadata = 0;  	btrfs_scrub_cancel_dev(root, device); @@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	call_rcu(&device->rcu, free_device);  	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; -	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); +	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; +	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);  	if (cur_devices->open_devices == 0) {  		struct btrfs_fs_devices *fs_devices; @@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,  	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;  	struct btrfs_fs_devices *old_devices;  	struct btrfs_fs_devices *seed_devices; -	struct btrfs_super_block *disk_super = &root->fs_info->super_copy; +	struct btrfs_super_block *disk_super = root->fs_info->super_copy;  	struct btrfs_device *device;  	u64 super_flags; @@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct 
btrfs_root *root, char *device_path)  		root->fs_info->fs_devices->num_can_discard++;  	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; +	spin_lock(&root->fs_info->free_chunk_lock); +	root->fs_info->free_chunk_space += device->total_bytes; +	spin_unlock(&root->fs_info->free_chunk_lock); +  	if (!blk_queue_nonrot(bdev_get_queue(bdev)))  		root->fs_info->fs_devices->rotating = 1; -	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); -	btrfs_set_super_total_bytes(&root->fs_info->super_copy, +	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); +	btrfs_set_super_total_bytes(root->fs_info->super_copy,  				    total_bytes + device->total_bytes); -	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); -	btrfs_set_super_num_devices(&root->fs_info->super_copy, +	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); +	btrfs_set_super_num_devices(root->fs_info->super_copy,  				    total_bytes + 1);  	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,  		      struct btrfs_device *device, u64 new_size)  {  	struct btrfs_super_block *super_copy = -		&device->dev_root->fs_info->super_copy; +		device->dev_root->fs_info->super_copy;  	u64 old_total = btrfs_super_total_bytes(super_copy);  	u64 diff = new_size - device->total_bytes; @@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,  static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64  			chunk_offset)  { -	struct btrfs_super_block *super_copy = &root->fs_info->super_copy; +	struct btrfs_super_block *super_copy = root->fs_info->super_copy;  	struct btrfs_disk_key *disk_key;  	struct btrfs_chunk *chunk;  	u8 *ptr; @@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  	bool retried = false;  	struct extent_buffer *l;  	struct btrfs_key key; -	struct btrfs_super_block *super_copy = &root->fs_info->super_copy; +	struct btrfs_super_block *super_copy = root->fs_info->super_copy;  	u64 old_total = btrfs_super_total_bytes(super_copy);  	u64 old_size = device->total_bytes;  	u64 diff = device->total_bytes - new_size; @@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  	lock_chunks(root);  	device->total_bytes = new_size; -	if (device->writeable) +	if (device->writeable) {  		device->fs_devices->total_rw_bytes -= diff; +		spin_lock(&root->fs_info->free_chunk_lock); +		root->fs_info->free_chunk_space -= diff; +		spin_unlock(&root->fs_info->free_chunk_lock); +	}  	unlock_chunks(root);  again: @@ -2257,6 +2281,9 @@ again:  		device->total_bytes = old_size;  		if (device->writeable)  			device->fs_devices->total_rw_bytes += diff; +		spin_lock(&root->fs_info->free_chunk_lock); +		root->fs_info->free_chunk_space += diff; +		spin_unlock(&root->fs_info->free_chunk_lock);  		unlock_chunks(root);  		goto done;  	} @@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,  			   struct btrfs_key *key,  			   struct btrfs_chunk *chunk, int item_size)  { -	struct btrfs_super_block *super_copy = &root->fs_info->super_copy; +	struct btrfs_super_block *super_copy = root->fs_info->super_copy;  	struct btrfs_disk_key disk_key;  	u32 array_size;  	u8 *ptr; @@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,  		index++;  	} +	spin_lock(&extent_root->fs_info->free_chunk_lock); +	extent_root->fs_info->free_chunk_space 
-= (stripe_size * +						   map->num_stripes); +	spin_unlock(&extent_root->fs_info->free_chunk_lock); +  	index = 0;  	stripe = &chunk->stripe;  	while (index < map->num_stripes) { @@ -2848,7 +2880,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,  static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  			     u64 logical, u64 *length, -			     struct btrfs_multi_bio **multi_ret, +			     struct btrfs_bio **bbio_ret,  			     int mirror_num)  {  	struct extent_map *em; @@ -2866,18 +2898,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	int i;  	int num_stripes;  	int max_errors = 0; -	struct btrfs_multi_bio *multi = NULL; +	struct btrfs_bio *bbio = NULL; -	if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) +	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))  		stripes_allocated = 1;  again: -	if (multi_ret) { -		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), +	if (bbio_ret) { +		bbio = kzalloc(btrfs_bio_size(stripes_allocated),  				GFP_NOFS); -		if (!multi) +		if (!bbio)  			return -ENOMEM; -		atomic_set(&multi->error, 0); +		atomic_set(&bbio->error, 0);  	}  	read_lock(&em_tree->lock); @@ -2898,7 +2930,7 @@ again:  	if (mirror_num > map->num_stripes)  		mirror_num = 0; -	/* if our multi bio struct is too small, back off and try again */ +	/* if our btrfs_bio struct is too small, back off and try again */  	if (rw & REQ_WRITE) {  		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |  				 BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2949,11 @@ again:  			stripes_required = map->num_stripes;  		}  	} -	if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && +	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&  	    stripes_allocated < stripes_required) {  		stripes_allocated = map->num_stripes;  		free_extent_map(em); -		kfree(multi); +		kfree(bbio);  		goto again;  	}  	stripe_nr = offset; @@ -2950,7 +2982,7 @@ again:  		*length = em->len - offset;  	} -	if (!multi_ret) +	if (!bbio_ret)  		goto out;  	num_stripes = 1; @@ -2975,13 +3007,17 @@ again:  			stripe_index = find_live_mirror(map, 0,  					    map->num_stripes,  					    current->pid % map->num_stripes); +			mirror_num = stripe_index + 1;  		}  	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) { -		if (rw & (REQ_WRITE | REQ_DISCARD)) +		if (rw & (REQ_WRITE | REQ_DISCARD)) {  			num_stripes = map->num_stripes; -		else if (mirror_num) +		} else if (mirror_num) {  			stripe_index = mirror_num - 1; +		} else { +			mirror_num = 1; +		}  	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {  		int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3037,7 @@ again:  			stripe_index = find_live_mirror(map, stripe_index,  					      map->sub_stripes, stripe_index +  					      current->pid % map->sub_stripes); +			mirror_num = stripe_index + 1;  		}  	} else {  		/* @@ -3009,15 +3046,16 @@ again:  		 * stripe_index is the number of our device in the stripe array  		 */  		stripe_index = do_div(stripe_nr, map->num_stripes); +		mirror_num = stripe_index + 1;  	}  	BUG_ON(stripe_index >= map->num_stripes);  	if (rw & REQ_DISCARD) {  		for (i = 0; i < num_stripes; i++) { -			multi->stripes[i].physical = +			bbio->stripes[i].physical =  				map->stripes[stripe_index].physical +  				stripe_offset + stripe_nr * map->stripe_len; -			multi->stripes[i].dev = map->stripes[stripe_index].dev; +			bbio->stripes[i].dev = map->stripes[stripe_index].dev;  			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {  				u64 stripes; @@ -3038,16 +3076,16 @@ again:  				}  				stripes = 
stripe_nr_end - 1 - j;  				do_div(stripes, map->num_stripes); -				multi->stripes[i].length = map->stripe_len * +				bbio->stripes[i].length = map->stripe_len *  					(stripes - stripe_nr + 1);  				if (i == 0) { -					multi->stripes[i].length -= +					bbio->stripes[i].length -=  						stripe_offset;  					stripe_offset = 0;  				}  				if (stripe_index == last_stripe) -					multi->stripes[i].length -= +					bbio->stripes[i].length -=  						stripe_end_offset;  			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {  				u64 stripes; @@ -3072,11 +3110,11 @@ again:  				}  				stripes = stripe_nr_end - 1 - j;  				do_div(stripes, factor); -				multi->stripes[i].length = map->stripe_len * +				bbio->stripes[i].length = map->stripe_len *  					(stripes - stripe_nr + 1);  				if (i < map->sub_stripes) { -					multi->stripes[i].length -= +					bbio->stripes[i].length -=  						stripe_offset;  					if (i == map->sub_stripes - 1)  						stripe_offset = 0; @@ -3084,11 +3122,11 @@ again:  				if (stripe_index >= last_stripe &&  				    stripe_index <= (last_stripe +  						     map->sub_stripes - 1)) { -					multi->stripes[i].length -= +					bbio->stripes[i].length -=  						stripe_end_offset;  				}  			} else -				multi->stripes[i].length = *length; +				bbio->stripes[i].length = *length;  			stripe_index++;  			if (stripe_index == map->num_stripes) { @@ -3099,19 +3137,20 @@ again:  		}  	} else {  		for (i = 0; i < num_stripes; i++) { -			multi->stripes[i].physical = +			bbio->stripes[i].physical =  				map->stripes[stripe_index].physical +  				stripe_offset +  				stripe_nr * map->stripe_len; -			multi->stripes[i].dev = +			bbio->stripes[i].dev =  				map->stripes[stripe_index].dev;  			stripe_index++;  		}  	} -	if (multi_ret) { -		*multi_ret = multi; -		multi->num_stripes = num_stripes; -		multi->max_errors = max_errors; +	if (bbio_ret) { +		*bbio_ret = bbio; +		bbio->num_stripes = num_stripes; +		bbio->max_errors = max_errors; +		bbio->mirror_num = mirror_num;  	}  out:  	free_extent_map(em); @@ -3120,9 +3159,9 @@ out:  int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		      u64 logical, u64 *length, -		      struct btrfs_multi_bio **multi_ret, int mirror_num) +		      struct btrfs_bio **bbio_ret, int mirror_num)  { -	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, +	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,  				 mirror_num);  } @@ -3191,28 +3230,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	return 0;  } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err)  { -	struct btrfs_multi_bio *multi = bio->bi_private; +	struct btrfs_bio *bbio = bio->bi_private;  	int is_orig_bio = 0;  	if (err) -		atomic_inc(&multi->error); +		atomic_inc(&bbio->error); -	if (bio == multi->orig_bio) +	if (bio == bbio->orig_bio)  		is_orig_bio = 1; -	if (atomic_dec_and_test(&multi->stripes_pending)) { +	if (atomic_dec_and_test(&bbio->stripes_pending)) {  		if (!is_orig_bio) {  			bio_put(bio); -			bio = multi->orig_bio; +			bio = bbio->orig_bio;  		} -		bio->bi_private = multi->private; -		bio->bi_end_io = multi->end_io; +		bio->bi_private = bbio->private; +		bio->bi_end_io = bbio->end_io; +		bio->bi_bdev = (struct block_device *) +					(unsigned long)bbio->mirror_num;  		/* only send an error to the higher layers if it is  		 * beyond the tolerance of the multi-bio  		 */ -		if (atomic_read(&multi->error) > multi->max_errors) { +		if (atomic_read(&bbio->error) > 
bbio->max_errors) {  			err = -EIO;  		} else if (err) {  			/* @@ -3222,7 +3263,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)  			set_bit(BIO_UPTODATE, &bio->bi_flags);  			err = 0;  		} -		kfree(multi); +		kfree(bbio);  		bio_endio(bio, err);  	} else if (!is_orig_bio) { @@ -3302,20 +3343,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	u64 logical = (u64)bio->bi_sector << 9;  	u64 length = 0;  	u64 map_length; -	struct btrfs_multi_bio *multi = NULL;  	int ret;  	int dev_nr = 0;  	int total_devs = 1; +	struct btrfs_bio *bbio = NULL;  	length = bio->bi_size;  	map_tree = &root->fs_info->mapping_tree;  	map_length = length; -	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, +	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,  			      mirror_num);  	BUG_ON(ret); -	total_devs = multi->num_stripes; +	total_devs = bbio->num_stripes;  	if (map_length < length) {  		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "  		       "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3364,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		       (unsigned long long)map_length);  		BUG();  	} -	multi->end_io = first_bio->bi_end_io; -	multi->private = first_bio->bi_private; -	multi->orig_bio = first_bio; -	atomic_set(&multi->stripes_pending, multi->num_stripes); + +	bbio->orig_bio = first_bio; +	bbio->private = first_bio->bi_private; +	bbio->end_io = first_bio->bi_end_io; +	atomic_set(&bbio->stripes_pending, bbio->num_stripes);  	while (dev_nr < total_devs) { -		if (total_devs > 1) { -			if (dev_nr < total_devs - 1) { -				bio = bio_clone(first_bio, GFP_NOFS); -				BUG_ON(!bio); -			} else { -				bio = first_bio; -			} -			bio->bi_private = multi; -			bio->bi_end_io = end_bio_multi_stripe; +		if (dev_nr < total_devs - 1) { +			bio = bio_clone(first_bio, GFP_NOFS); +			BUG_ON(!bio); +		} else { +			bio = first_bio;  		} -		bio->bi_sector = multi->stripes[dev_nr].physical >> 9; -		dev = multi->stripes[dev_nr].dev; +		bio->bi_private = bbio; +		bio->bi_end_io = btrfs_end_bio; +		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; +		dev = bbio->stripes[dev_nr].dev;  		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { +			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " +				 "(%s id %llu), size=%u\n", rw, +				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, +				 dev->name, dev->devid, bio->bi_size);  			bio->bi_bdev = dev->bdev;  			if (async_submit)  				schedule_bio(root, dev, rw, bio); @@ -3354,8 +3398,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		}  		dev_nr++;  	} -	if (total_devs == 1) -		kfree(multi);  	return 0;  } @@ -3616,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,  	fill_device_from_item(leaf, dev_item, device);  	device->dev_root = root->fs_info->dev_root;  	device->in_fs_metadata = 1; -	if (device->writeable) +	if (device->writeable) {  		device->fs_devices->total_rw_bytes += device->total_bytes; +		spin_lock(&root->fs_info->free_chunk_lock); +		root->fs_info->free_chunk_space += device->total_bytes - +			device->bytes_used; +		spin_unlock(&root->fs_info->free_chunk_lock); +	}  	ret = 0;  	return ret;  }  int btrfs_read_sys_array(struct btrfs_root *root)  { -	struct btrfs_super_block *super_copy = &root->fs_info->super_copy; +	struct btrfs_super_block *super_copy = root->fs_info->super_copy;  	struct extent_buffer *sb;  	struct btrfs_disk_key *disk_key;  	struct btrfs_chunk *chunk; diff --git 
a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e17..ab5b1c49f35 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,14 @@ struct btrfs_device {  	struct btrfs_work work;  	struct rcu_head rcu;  	struct work_struct rcu_work; + +	/* readahead state */ +	spinlock_t reada_lock; +	atomic_t reada_in_flight; +	u64 reada_next; +	struct reada_zone *reada_curr_zone; +	struct radix_tree_root reada_zones; +	struct radix_tree_root reada_extents;  };  struct btrfs_fs_devices { @@ -136,7 +144,10 @@ struct btrfs_bio_stripe {  	u64 length; /* only used for discard mappings */  }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio {  	atomic_t stripes_pending;  	bio_end_io_t *end_io;  	struct bio *orig_bio; @@ -144,6 +155,7 @@ struct btrfs_multi_bio {  	atomic_t error;  	int max_errors;  	int num_stripes; +	int mirror_num;  	struct btrfs_bio_stripe stripes[];  }; @@ -171,7 +183,7 @@ struct map_lookup {  int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,  				   u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \  			    (sizeof(struct btrfs_bio_stripe) * (n)))  int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +192,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,  			   u64 chunk_offset, u64 start, u64 num_bytes);  int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		    u64 logical, u64 *length, -		    struct btrfs_multi_bio **multi_ret, int mirror_num); +		    struct btrfs_bio **bbio_ret, int mirror_num);  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  		     u64 chunk_start, u64 physical, u64 devid,  		     u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 426aa464f1a..3848b04e310 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,  again:  	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),  				      name, name_len, value, size); +	/* +	 * If we're setting an xattr to a new value but the new value is say +	 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting +	 * back from split_leaf.  This is because it thinks we'll be extending +	 * the existing item size, but we're asking for enough space to add the +	 * item itself.  So if we get EOVERFLOW just set ret to EEXIST and let +	 * the rest of the function figure it out. +	 */ +	if (ret == -EOVERFLOW) +		ret = -EEXIST; +  	if (ret == -EEXIST) {  		if (flags & XATTR_CREATE)  			goto out;  |
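
The -EOVERFLOW handling added to do_setxattr() in the xattr.c hunk above is easier to follow outside the diff. The sketch below is a minimal, self-contained illustration of that fallback, assuming hypothetical stand-in helpers insert_item() and replace_item() in place of the real btrfs tree-insertion code (and a local SKETCH_XATTR_CREATE flag instead of the kernel's XATTR_CREATE); it only shows how an insert that bounces off a full leaf with -EOVERFLOW is re-driven through the "already exists, replace it" path, not how btrfs actually lays out xattr items.

/*
 * Hypothetical sketch (not btrfs code) of the -EOVERFLOW -> -EEXIST
 * fallback used by do_setxattr().  insert_item()/replace_item() stand
 * in for the real tree-insertion helpers.
 */
#include <errno.h>
#include <stddef.h>

#define SKETCH_XATTR_CREATE 0x1	/* fail if the attribute already exists */

/* Stub: pretend the leaf cannot be extended, as split_leaf() might report. */
static int insert_item(const char *name, const void *value, size_t size)
{
	(void)name; (void)value; (void)size;
	return -EOVERFLOW;
}

/* Stub: replacing an existing item always succeeds in this sketch. */
static int replace_item(const char *name, const void *value, size_t size)
{
	(void)name; (void)value; (void)size;
	return 0;
}

static int set_xattr_sketch(const char *name, const void *value,
			    size_t size, int flags)
{
	int ret = insert_item(name, value, size);

	/*
	 * The tree code sized the request as if an existing item were being
	 * extended, so a maximum-sized value can come back -EOVERFLOW even
	 * though a plain insert would fit.  Fold that into the -EEXIST path
	 * and let the replace logic take over.
	 */
	if (ret == -EOVERFLOW)
		ret = -EEXIST;

	if (ret == -EEXIST) {
		if (flags & SKETCH_XATTR_CREATE)
			return ret;	/* caller insisted on a brand-new attribute */
		ret = replace_item(name, value, size);
	}
	return ret;
}

int main(void)
{
	return set_xattr_sketch("user.demo", "x", 1, 0) ? 1 : 0;
}

Remapping the error early keeps a single replace path for both the genuine "attribute already exists" case and the sizing quirk reported by the leaf-split code, instead of teaching every caller about the overflow corner case.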