Diffstat (limited to 'fs/btrfs')
36 files changed, 3572 insertions, 1260 deletions
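The centerpiece of the backref.c changes below is struct extent_inode_elem: while resolving backrefs, find_parent_nodes() now records each (inum, offset) pair that references the extent, hangs the resulting list off a ulist node's aux field, and lets iterate_extent_inodes() walk that prebuilt list instead of re-reading every leaf. What follows is a minimal userspace sketch of that flow, not kernel code: the ulist is replaced by a hypothetical leaf_entry stand-in, add_inode_elem and the sample values are illustrative only, and the kernel's iterate_leaf_refs() additionally passes the resolving root id to the callback.

/*
 * Userspace sketch of the extent_inode_elem mechanism introduced in
 * fs/btrfs/backref.c by this series.  The struct layout and the list
 * walk in iterate_leaf_refs() mirror the patch; everything else is a
 * simplified stand-in.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct extent_inode_elem {
	uint64_t inum;
	uint64_t offset;
	struct extent_inode_elem *next;
};

/* stand-in for one ulist node: a leaf bytenr plus its aux inode list */
struct leaf_entry {
	uint64_t leaf_bytenr;
	struct extent_inode_elem *inode_list;
};

/* analogous to check_extent_in_eb(): prepend one (inum, offset) hit */
static int add_inode_elem(struct extent_inode_elem **eie,
			  uint64_t inum, uint64_t offset)
{
	struct extent_inode_elem *e = malloc(sizeof(*e));

	if (!e)
		return -1;
	e->inum = inum;
	e->offset = offset;
	e->next = *eie;
	*eie = e;
	return 0;
}

typedef int (*iterate_extent_inodes_t)(uint64_t inum, uint64_t offset,
					void *ctx);

/* analogous to iterate_leaf_refs(): walk the prebuilt list */
static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
			     iterate_extent_inodes_t iterate, void *ctx)
{
	struct extent_inode_elem *eie;
	int ret = 0;

	for (eie = inode_list; eie; eie = eie->next) {
		ret = iterate(eie->inum, eie->offset, ctx);
		if (ret)
			break;	/* a non-zero return stops the iteration */
	}
	return ret;
}

static int print_ref(uint64_t inum, uint64_t offset, void *ctx)
{
	printf("inode %llu references the extent at file offset %llu\n",
	       (unsigned long long)inum, (unsigned long long)offset);
	return 0;
}

int main(void)
{
	struct leaf_entry leaf = { .leaf_bytenr = 30408704 };

	/* two EXTENT_DATA items in this leaf point at our extent */
	add_inode_elem(&leaf.inode_list, 257, 0);
	add_inode_elem(&leaf.inode_list, 258, 4096);

	iterate_leaf_refs(leaf.inode_list, print_ref, NULL);

	/* equivalent of the patch's free_leaf_list() */
	while (leaf.inode_list) {
		struct extent_inode_elem *next = leaf.inode_list->next;
		free(leaf.inode_list);
		leaf.inode_list = next;
	}
	return 0;
}

In the patch itself the list pointer is stored as an unsigned long in node->aux via ulist_add_merge(), merged when two refs resolve to the same parent, and released by free_leaf_list() once iteration finishes.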
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 89b156d85d6..761e2cd8fed 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,  		if (ret > 0) {  			/* we need an acl */  			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); +		} else { +			cache_no_acl(inode);  		} +	} else { +		cache_no_acl(inode);  	}  failed:  	posix_acl_release(acl); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bcec0675023..a383c18e74e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -24,22 +24,135 @@  #include "delayed-ref.h"  #include "locking.h" +struct extent_inode_elem { +	u64 inum; +	u64 offset; +	struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, +				struct btrfs_file_extent_item *fi, +				u64 extent_item_pos, +				struct extent_inode_elem **eie) +{ +	u64 data_offset; +	u64 data_len; +	struct extent_inode_elem *e; + +	data_offset = btrfs_file_extent_offset(eb, fi); +	data_len = btrfs_file_extent_num_bytes(eb, fi); + +	if (extent_item_pos < data_offset || +	    extent_item_pos >= data_offset + data_len) +		return 1; + +	e = kmalloc(sizeof(*e), GFP_NOFS); +	if (!e) +		return -ENOMEM; + +	e->next = *eie; +	e->inum = key->objectid; +	e->offset = key->offset + (extent_item_pos - data_offset); +	*eie = e; + +	return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, +				u64 extent_item_pos, +				struct extent_inode_elem **eie) +{ +	u64 disk_byte; +	struct btrfs_key key; +	struct btrfs_file_extent_item *fi; +	int slot; +	int nritems; +	int extent_type; +	int ret; + +	/* +	 * from the shared data ref, we only have the leaf but we need +	 * the key. thus, we must look into all items and see that we +	 * find one (some) with a reference to our extent item. 
+	 */ +	nritems = btrfs_header_nritems(eb); +	for (slot = 0; slot < nritems; ++slot) { +		btrfs_item_key_to_cpu(eb, &key, slot); +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			continue; +		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +		extent_type = btrfs_file_extent_type(eb, fi); +		if (extent_type == BTRFS_FILE_EXTENT_INLINE) +			continue; +		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ +		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); +		if (disk_byte != wanted_disk_byte) +			continue; + +		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); +		if (ret < 0) +			return ret; +	} + +	return 0; +} +  /*   * this structure records all encountered refs on the way up to the root   */  struct __prelim_ref {  	struct list_head list;  	u64 root_id; -	struct btrfs_key key; +	struct btrfs_key key_for_search;  	int level;  	int count; +	struct extent_inode_elem *inode_list;  	u64 parent;  	u64 wanted_disk_byte;  }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + *   block later to set a correct key + * + * delayed refs + * ============ + *        backref type | shared | indirect | shared | indirect + * information         |   tree |     tree |   data |     data + * --------------------+--------+----------+--------+---------- + *      parent logical |    y   |     -    |    -   |     - + *      key to resolve |    -   |     y    |    y   |     y + *  tree block logical |    -   |     -    |    -   |     - + *  root for resolving |    y   |     y    |    y   |     y + * + * - column 1:       we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + *        backref type | shared | indirect | shared | indirect + * information         |   tree |     tree |   data |     data + * --------------------+--------+----------+--------+---------- + *      parent logical |    y   |     -    |    y   |     - + *      key to resolve |    -   |     -    |    -   |     y + *  tree block logical |    y   |     y    |    y   |     y + *  root for resolving |    -   |     y    |    y   |     y + * + * - column 1, 3: we've the parent -> done + * - column 2:    we take the first key from the block to find the parent + *                (see __add_missing_keys) + * - column 4:    we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. 
+ */ +  static int __add_prelim_ref(struct list_head *head, u64 root_id, -			    struct btrfs_key *key, int level, u64 parent, -			    u64 wanted_disk_byte, int count) +			    struct btrfs_key *key, int level, +			    u64 parent, u64 wanted_disk_byte, int count)  {  	struct __prelim_ref *ref; @@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,  	ref->root_id = root_id;  	if (key) -		ref->key = *key; +		ref->key_for_search = *key;  	else -		memset(&ref->key, 0, sizeof(ref->key)); +		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); +	ref->inode_list = NULL;  	ref->level = level;  	ref->count = count;  	ref->parent = parent; @@ -64,52 +178,75 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,  }  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, -				struct ulist *parents, -				struct extent_buffer *eb, int level, -				u64 wanted_objectid, u64 wanted_disk_byte) +				struct ulist *parents, int level, +				struct btrfs_key *key_for_search, u64 time_seq, +				u64 wanted_disk_byte, +				const u64 *extent_item_pos)  { -	int ret; +	int ret = 0;  	int slot; -	struct btrfs_file_extent_item *fi; +	struct extent_buffer *eb;  	struct btrfs_key key; +	struct btrfs_file_extent_item *fi; +	struct extent_inode_elem *eie = NULL;  	u64 disk_byte; -add_parent: -	ret = ulist_add(parents, eb->start, 0, GFP_NOFS); -	if (ret < 0) -		return ret; - -	if (level != 0) +	if (level != 0) { +		eb = path->nodes[level]; +		ret = ulist_add(parents, eb->start, 0, GFP_NOFS); +		if (ret < 0) +			return ret;  		return 0; +	}  	/* -	 * if the current leaf is full with EXTENT_DATA items, we must -	 * check the next one if that holds a reference as well. -	 * ref->count cannot be used to skip this check. -	 * repeat this until we don't find any additional EXTENT_DATA items. +	 * We normally enter this function with the path already pointing to +	 * the first item to check. But sometimes, we may enter it with +	 * slot==nritems. In that case, go to the next leaf before we continue.  	 
*/ -	while (1) { -		ret = btrfs_next_leaf(root, path); -		if (ret < 0) -			return ret; -		if (ret) -			return 0; +	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) +		ret = btrfs_next_old_leaf(root, path, time_seq); +	while (!ret) {  		eb = path->nodes[0]; -		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { -			btrfs_item_key_to_cpu(eb, &key, slot); -			if (key.objectid != wanted_objectid || -			    key.type != BTRFS_EXTENT_DATA_KEY) -				return 0; -			fi = btrfs_item_ptr(eb, slot, -						struct btrfs_file_extent_item); -			disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); -			if (disk_byte == wanted_disk_byte) -				goto add_parent; +		slot = path->slots[0]; + +		btrfs_item_key_to_cpu(eb, &key, slot); + +		if (key.objectid != key_for_search->objectid || +		    key.type != BTRFS_EXTENT_DATA_KEY) +			break; + +		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + +		if (disk_byte == wanted_disk_byte) { +			eie = NULL; +			if (extent_item_pos) { +				ret = check_extent_in_eb(&key, eb, fi, +						*extent_item_pos, +						&eie); +				if (ret < 0) +					break; +			} +			if (!ret) { +				ret = ulist_add(parents, eb->start, +						(unsigned long)eie, GFP_NOFS); +				if (ret < 0) +					break; +				if (!extent_item_pos) { +					ret = btrfs_next_old_leaf(root, path, +							time_seq); +					continue; +				} +			}  		} +		ret = btrfs_next_old_item(root, path, time_seq);  	} -	return 0; +	if (ret > 0) +		ret = 0; +	return ret;  }  /* @@ -118,13 +255,14 @@ add_parent:   */  static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,  					int search_commit_root, +					u64 time_seq,  					struct __prelim_ref *ref, -					struct ulist *parents) +					struct ulist *parents, +					const u64 *extent_item_pos)  {  	struct btrfs_path *path;  	struct btrfs_root *root;  	struct btrfs_key root_key; -	struct btrfs_key key = {0};  	struct extent_buffer *eb;  	int ret = 0;  	int root_level; @@ -152,36 +290,30 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,  		goto out;  	path->lowest_level = level; -	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); +	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);  	pr_debug("search slot in root %llu (level %d, ref count %d) returned "  		 "%d for key (%llu %u %llu)\n",  		 (unsigned long long)ref->root_id, level, ref->count, ret, -		 (unsigned long long)ref->key.objectid, ref->key.type, -		 (unsigned long long)ref->key.offset); +		 (unsigned long long)ref->key_for_search.objectid, +		 ref->key_for_search.type, +		 (unsigned long long)ref->key_for_search.offset);  	if (ret < 0)  		goto out;  	eb = path->nodes[level]; -	if (!eb) { -		WARN_ON(1); -		ret = 1; -		goto out; -	} - -	if (level == 0) { -		if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { -			ret = btrfs_next_leaf(root, path); -			if (ret) -				goto out; -			eb = path->nodes[0]; +	while (!eb) { +		if (!level) { +			WARN_ON(1); +			ret = 1; +			goto out;  		} - -		btrfs_item_key_to_cpu(eb, &key, path->slots[0]); +		level--; +		eb = path->nodes[level];  	} -	/* the last two parameters will only be used for level == 0 */ -	ret = add_all_parents(root, path, parents, eb, level, key.objectid, -				ref->wanted_disk_byte); +	ret = add_all_parents(root, path, parents, level, &ref->key_for_search, +				time_seq, ref->wanted_disk_byte, +				extent_item_pos);  out:  	btrfs_free_path(path);  	return ret; @@ -191,8 +323,9 @@ out:   * resolve all indirect backrefs from the list   */  
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, -				   int search_commit_root, -				   struct list_head *head) +				   int search_commit_root, u64 time_seq, +				   struct list_head *head, +				   const u64 *extent_item_pos)  {  	int err;  	int ret = 0; @@ -201,6 +334,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  	struct __prelim_ref *new_ref;  	struct ulist *parents;  	struct ulist_node *node; +	struct ulist_iterator uiter;  	parents = ulist_alloc(GFP_NOFS);  	if (!parents) @@ -217,7 +351,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		if (ref->count == 0)  			continue;  		err = __resolve_indirect_ref(fs_info, search_commit_root, -					     ref, parents); +					     time_seq, ref, parents, +					     extent_item_pos);  		if (err) {  			if (ret == 0)  				ret = err; @@ -225,11 +360,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		}  		/* we put the first parent into the ref at hand */ -		node = ulist_next(parents, NULL); +		ULIST_ITER_INIT(&uiter); +		node = ulist_next(parents, &uiter);  		ref->parent = node ? node->val : 0; +		ref->inode_list = +			node ? (struct extent_inode_elem *)node->aux : 0;  		/* additional parents require new refs being added here */ -		while ((node = ulist_next(parents, node))) { +		while ((node = ulist_next(parents, &uiter))) {  			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);  			if (!new_ref) {  				ret = -ENOMEM; @@ -237,6 +375,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  			}  			memcpy(new_ref, ref, sizeof(*ref));  			new_ref->parent = node->val; +			new_ref->inode_list = +					(struct extent_inode_elem *)node->aux;  			list_add(&new_ref->list, &ref->list);  		}  		ulist_reinit(parents); @@ -246,10 +386,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  	return ret;  } +static inline int ref_for_same_block(struct __prelim_ref *ref1, +				     struct __prelim_ref *ref2) +{ +	if (ref1->level != ref2->level) +		return 0; +	if (ref1->root_id != ref2->root_id) +		return 0; +	if (ref1->key_for_search.type != ref2->key_for_search.type) +		return 0; +	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) +		return 0; +	if (ref1->key_for_search.offset != ref2->key_for_search.offset) +		return 0; +	if (ref1->parent != ref2->parent) +		return 0; + +	return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, +			      struct list_head *head) +{ +	struct list_head *pos; +	struct extent_buffer *eb; + +	list_for_each(pos, head) { +		struct __prelim_ref *ref; +		ref = list_entry(pos, struct __prelim_ref, list); + +		if (ref->parent) +			continue; +		if (ref->key_for_search.type) +			continue; +		BUG_ON(!ref->wanted_disk_byte); +		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, +				     fs_info->tree_root->leafsize, 0); +		BUG_ON(!eb); +		btrfs_tree_read_lock(eb); +		if (btrfs_header_level(eb) == 0) +			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); +		else +			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); +		btrfs_tree_read_unlock(eb); +		free_extent_buffer(eb); +	} +	return 0; +} +  /*   * merge two lists of backrefs and adjust counts accordingly   *   * mode = 1: merge identical keys, if key is set + *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here. 
+ *           additionally, we could even add a key range for the blocks we + *           looked into to merge even more (-> replace unresolved refs by those + *           having a parent).   * mode = 2: merge identical parents   */  static int __merge_refs(struct list_head *head, int mode) @@ -263,20 +458,21 @@ static int __merge_refs(struct list_head *head, int mode)  		ref1 = list_entry(pos1, struct __prelim_ref, list); -		if (mode == 1 && ref1->key.type == 0) -			continue;  		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;  		     pos2 = n2, n2 = pos2->next) {  			struct __prelim_ref *ref2; +			struct __prelim_ref *xchg;  			ref2 = list_entry(pos2, struct __prelim_ref, list);  			if (mode == 1) { -				if (memcmp(&ref1->key, &ref2->key, -					   sizeof(ref1->key)) || -				    ref1->level != ref2->level || -				    ref1->root_id != ref2->root_id) +				if (!ref_for_same_block(ref1, ref2))  					continue; +				if (!ref1->parent && ref2->parent) { +					xchg = ref1; +					ref1 = ref2; +					ref2 = xchg; +				}  				ref1->count += ref2->count;  			} else {  				if (ref1->parent != ref2->parent) @@ -296,16 +492,17 @@ static int __merge_refs(struct list_head *head, int mode)   * smaller or equal that seq to the list   */  static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, -			      struct btrfs_key *info_key,  			      struct list_head *prefs)  {  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;  	struct rb_node *n = &head->node.rb_node; +	struct btrfs_key key; +	struct btrfs_key op_key = {0};  	int sgn;  	int ret = 0;  	if (extent_op && extent_op->update_key) -		btrfs_disk_key_to_cpu(info_key, &extent_op->key); +		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);  	while ((n = rb_prev(n))) {  		struct btrfs_delayed_ref_node *node; @@ -337,7 +534,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  			struct btrfs_delayed_tree_ref *ref;  			ref = btrfs_delayed_node_to_tree_ref(node); -			ret = __add_prelim_ref(prefs, ref->root, info_key, +			ret = __add_prelim_ref(prefs, ref->root, &op_key,  					       ref->level + 1, 0, node->bytenr,  					       node->ref_mod * sgn);  			break; @@ -346,7 +543,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  			struct btrfs_delayed_tree_ref *ref;  			ref = btrfs_delayed_node_to_tree_ref(node); -			ret = __add_prelim_ref(prefs, ref->root, info_key, +			ret = __add_prelim_ref(prefs, ref->root, NULL,  					       ref->level + 1, ref->parent,  					       node->bytenr,  					       node->ref_mod * sgn); @@ -354,8 +551,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  		}  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_delayed_data_ref *ref; -			struct btrfs_key key; -  			ref = btrfs_delayed_node_to_data_ref(node);  			key.objectid = ref->objectid; @@ -368,7 +563,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  		}  		case BTRFS_SHARED_DATA_REF_KEY: {  			struct btrfs_delayed_data_ref *ref; -			struct btrfs_key key;  			ref = btrfs_delayed_node_to_data_ref(node); @@ -394,8 +588,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,   */  static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			     struct btrfs_path *path, u64 bytenr, -			     struct btrfs_key *info_key, int *info_level, -			     struct list_head *prefs) +			     int *info_level, struct list_head *prefs)  {  	int ret = 0;  	int slot; @@ -411,7 +604,7 @@ static int __add_inline_refs(struct 
btrfs_fs_info *fs_info,  	 * enumerate all inline refs  	 */  	leaf = path->nodes[0]; -	slot = path->slots[0] - 1; +	slot = path->slots[0];  	item_size = btrfs_item_size_nr(leaf, slot);  	BUG_ON(item_size < sizeof(*ei)); @@ -424,12 +617,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		struct btrfs_tree_block_info *info; -		struct btrfs_disk_key disk_key;  		info = (struct btrfs_tree_block_info *)ptr;  		*info_level = btrfs_tree_block_level(leaf, info); -		btrfs_tree_block_key(leaf, info, &disk_key); -		btrfs_disk_key_to_cpu(info_key, &disk_key);  		ptr += sizeof(struct btrfs_tree_block_info);  		BUG_ON(ptr > end);  	} else { @@ -447,7 +637,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  		switch (type) {  		case BTRFS_SHARED_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, 0, info_key, +			ret = __add_prelim_ref(prefs, 0, NULL,  						*info_level + 1, offset,  						bytenr, 1);  			break; @@ -462,8 +652,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			break;  		}  		case BTRFS_TREE_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, offset, info_key, -					       *info_level + 1, 0, bytenr, 1); +			ret = __add_prelim_ref(prefs, offset, NULL, +					       *info_level + 1, 0, +					       bytenr, 1);  			break;  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_extent_data_ref *dref; @@ -477,8 +668,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			key.type = BTRFS_EXTENT_DATA_KEY;  			key.offset = btrfs_extent_data_ref_offset(leaf, dref);  			root = btrfs_extent_data_ref_root(leaf, dref); -			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, -						count); +			ret = __add_prelim_ref(prefs, root, &key, 0, 0, +					       bytenr, count);  			break;  		}  		default: @@ -496,8 +687,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,   */  static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			    struct btrfs_path *path, u64 bytenr, -			    struct btrfs_key *info_key, int info_level, -			    struct list_head *prefs) +			    int info_level, struct list_head *prefs)  {  	struct btrfs_root *extent_root = fs_info->extent_root;  	int ret; @@ -527,7 +717,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  		switch (key.type) {  		case BTRFS_SHARED_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, 0, info_key, +			ret = __add_prelim_ref(prefs, 0, NULL,  						info_level + 1, key.offset,  						bytenr, 1);  			break; @@ -543,8 +733,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			break;  		}  		case BTRFS_TREE_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, key.offset, info_key, -						info_level + 1, 0, bytenr, 1); +			ret = __add_prelim_ref(prefs, key.offset, NULL, +					       info_level + 1, 0, +					       bytenr, 1);  			break;  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_extent_data_ref *dref; @@ -560,7 +751,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			key.offset = btrfs_extent_data_ref_offset(leaf, dref);  			root = btrfs_extent_data_ref_root(leaf, dref);  			ret = __add_prelim_ref(prefs, root, &key, 0, 0, -						bytenr, count); +					       bytenr, count);  			break;  		}  		default: @@ -582,11 +773,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,   */  static int find_parent_nodes(struct btrfs_trans_handle *trans,  			     struct btrfs_fs_info *fs_info, u64 bytenr, -			     u64 seq, struct ulist *refs, struct ulist *roots) +			     u64 delayed_ref_seq, u64 time_seq, +		
	     struct ulist *refs, struct ulist *roots, +			     const u64 *extent_item_pos)  {  	struct btrfs_key key;  	struct btrfs_path *path; -	struct btrfs_key info_key = { 0 };  	struct btrfs_delayed_ref_root *delayed_refs = NULL;  	struct btrfs_delayed_ref_head *head;  	int info_level = 0; @@ -645,8 +837,9 @@ again:  				btrfs_put_delayed_ref(&head->node);  				goto again;  			} -			ret = __add_delayed_refs(head, seq, &info_key, +			ret = __add_delayed_refs(head, delayed_ref_seq,  						 &prefs_delayed); +			mutex_unlock(&head->mutex);  			if (ret) {  				spin_unlock(&delayed_refs->lock);  				goto out; @@ -659,16 +852,17 @@ again:  		struct extent_buffer *leaf;  		int slot; +		path->slots[0]--;  		leaf = path->nodes[0]; -		slot = path->slots[0] - 1; +		slot = path->slots[0];  		btrfs_item_key_to_cpu(leaf, &key, slot);  		if (key.objectid == bytenr &&  		    key.type == BTRFS_EXTENT_ITEM_KEY) {  			ret = __add_inline_refs(fs_info, path, bytenr, -						&info_key, &info_level, &prefs); +						&info_level, &prefs);  			if (ret)  				goto out; -			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, +			ret = __add_keyed_refs(fs_info, path, bytenr,  					       info_level, &prefs);  			if (ret)  				goto out; @@ -676,21 +870,18 @@ again:  	}  	btrfs_release_path(path); -	/* -	 * when adding the delayed refs above, the info_key might not have -	 * been known yet. Go over the list and replace the missing keys -	 */ -	list_for_each_entry(ref, &prefs_delayed, list) { -		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) -			memcpy(&ref->key, &info_key, sizeof(ref->key)); -	}  	list_splice_init(&prefs_delayed, &prefs); +	ret = __add_missing_keys(fs_info, &prefs); +	if (ret) +		goto out; +  	ret = __merge_refs(&prefs, 1);  	if (ret)  		goto out; -	ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); +	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, +				      &prefs, extent_item_pos);  	if (ret)  		goto out; @@ -709,15 +900,39 @@ again:  			BUG_ON(ret < 0);  		}  		if (ref->count && ref->parent) { -			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); +			struct extent_inode_elem *eie = NULL; +			if (extent_item_pos && !ref->inode_list) { +				u32 bsz; +				struct extent_buffer *eb; +				bsz = btrfs_level_size(fs_info->extent_root, +							info_level); +				eb = read_tree_block(fs_info->extent_root, +							   ref->parent, bsz, 0); +				BUG_ON(!eb); +				ret = find_extent_in_eb(eb, bytenr, +							*extent_item_pos, &eie); +				ref->inode_list = eie; +				free_extent_buffer(eb); +			} +			ret = ulist_add_merge(refs, ref->parent, +					      (unsigned long)ref->inode_list, +					      (unsigned long *)&eie, GFP_NOFS); +			if (!ret && extent_item_pos) { +				/* +				 * we've recorded that parent, so we must extend +				 * its inode list here +				 */ +				BUG_ON(!eie); +				while (eie->next) +					eie = eie->next; +				eie->next = ref->inode_list; +			}  			BUG_ON(ret < 0);  		}  		kfree(ref);  	}  out: -	if (head) -		mutex_unlock(&head->mutex);  	btrfs_free_path(path);  	while (!list_empty(&prefs)) {  		ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -734,6 +949,28 @@ out:  	return ret;  } +static void free_leaf_list(struct ulist *blocks) +{ +	struct ulist_node *node = NULL; +	struct extent_inode_elem *eie; +	struct extent_inode_elem *eie_next; +	struct ulist_iterator uiter; + +	ULIST_ITER_INIT(&uiter); +	while ((node = ulist_next(blocks, &uiter))) { +		if (!node->aux) +			continue; +		eie = (struct extent_inode_elem *)node->aux; +		for 
(; eie; eie = eie_next) { +			eie_next = eie->next; +			kfree(eie); +		} +		node->aux = 0; +	} + +	ulist_free(blocks); +} +  /*   * Finds all leafs with a reference to the specified combination of bytenr and   * offset. key_list_head will point to a list of corresponding keys (caller must @@ -744,7 +981,9 @@ out:   */  static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **leafs) +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **leafs, +				const u64 *extent_item_pos)  {  	struct ulist *tmp;  	int ret; @@ -758,11 +997,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	} -	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); +	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, +				time_seq, *leafs, tmp, extent_item_pos);  	ulist_free(tmp);  	if (ret < 0 && ret != -ENOENT) { -		ulist_free(*leafs); +		free_leaf_list(*leafs);  		return ret;  	} @@ -784,10 +1024,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,   */  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **roots) +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **roots)  {  	struct ulist *tmp;  	struct ulist_node *node = NULL; +	struct ulist_iterator uiter;  	int ret;  	tmp = ulist_alloc(GFP_NOFS); @@ -799,15 +1041,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	} +	ULIST_ITER_INIT(&uiter);  	while (1) { -		ret = find_parent_nodes(trans, fs_info, bytenr, seq, -					tmp, *roots); +		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, +					time_seq, tmp, *roots, NULL);  		if (ret < 0 && ret != -ENOENT) {  			ulist_free(tmp);  			ulist_free(*roots);  			return ret;  		} -		node = ulist_next(tmp, node); +		node = ulist_next(tmp, &uiter);  		if (!node)  			break;  		bytenr = node->val; @@ -1093,67 +1336,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,  	return 0;  } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, -				u64 orig_extent_item_objectid, -				u64 extent_item_pos, u64 root, +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, +				u64 root, u64 extent_item_objectid,  				iterate_extent_inodes_t *iterate, void *ctx)  { -	u64 disk_byte; -	struct btrfs_key key; -	struct btrfs_file_extent_item *fi; -	struct extent_buffer *eb; -	int slot; -	int nritems; +	struct extent_inode_elem *eie;  	int ret = 0; -	int extent_type; -	u64 data_offset; -	u64 data_len; - -	eb = read_tree_block(fs_info->tree_root, logical, -				fs_info->tree_root->leafsize, 0); -	if (!eb) -		return -EIO; - -	/* -	 * from the shared data ref, we only have the leaf but we need -	 * the key. thus, we must look into all items and see that we -	 * find one (some) with a reference to our extent item. 
-	 */ -	nritems = btrfs_header_nritems(eb); -	for (slot = 0; slot < nritems; ++slot) { -		btrfs_item_key_to_cpu(eb, &key, slot); -		if (key.type != BTRFS_EXTENT_DATA_KEY) -			continue; -		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); -		extent_type = btrfs_file_extent_type(eb, fi); -		if (extent_type == BTRFS_FILE_EXTENT_INLINE) -			continue; -		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ -		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); -		if (disk_byte != orig_extent_item_objectid) -			continue; - -		data_offset = btrfs_file_extent_offset(eb, fi); -		data_len = btrfs_file_extent_num_bytes(eb, fi); - -		if (extent_item_pos < data_offset || -		    extent_item_pos >= data_offset + data_len) -			continue; +	for (eie = inode_list; eie; eie = eie->next) {  		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " -				"root %llu\n", orig_extent_item_objectid, -				key.objectid, key.offset, root); -		ret = iterate(key.objectid, -				key.offset + (extent_item_pos - data_offset), -				root, ctx); +			 "root %llu\n", extent_item_objectid, +			 eie->inum, eie->offset, root); +		ret = iterate(eie->inum, eie->offset, root, ctx);  		if (ret) { -			pr_debug("stopping iteration because ret=%d\n", ret); +			pr_debug("stopping iteration for %llu due to ret=%d\n", +				 extent_item_objectid, ret);  			break;  		}  	} -	free_extent_buffer(eb); -  	return ret;  } @@ -1175,7 +1376,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  	struct ulist *roots = NULL;  	struct ulist_node *ref_node = NULL;  	struct ulist_node *root_node = NULL; -	struct seq_list seq_elem; +	struct seq_list seq_elem = {}; +	struct seq_list tree_mod_seq_elem = {}; +	struct ulist_iterator ref_uiter; +	struct ulist_iterator root_uiter;  	struct btrfs_delayed_ref_root *delayed_refs = NULL;  	pr_debug("resolving all inodes for extent %llu\n", @@ -1192,34 +1396,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  		spin_lock(&delayed_refs->lock);  		btrfs_get_delayed_seq(delayed_refs, &seq_elem);  		spin_unlock(&delayed_refs->lock); +		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);  	}  	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, -				   extent_item_pos, seq_elem.seq, -				   &refs); - +				   seq_elem.seq, tree_mod_seq_elem.seq, &refs, +				   &extent_item_pos);  	if (ret)  		goto out; -	while (!ret && (ref_node = ulist_next(refs, ref_node))) { -		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, -						seq_elem.seq, &roots); +	ULIST_ITER_INIT(&ref_uiter); +	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { +		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, +						seq_elem.seq, +						tree_mod_seq_elem.seq, &roots);  		if (ret)  			break; -		while (!ret && (root_node = ulist_next(roots, root_node))) { -			pr_debug("root %llu references leaf %llu\n", -					root_node->val, ref_node->val); -			ret = iterate_leaf_refs(fs_info, ref_node->val, -						extent_item_objectid, -						extent_item_pos, root_node->val, -						iterate, ctx); +		ULIST_ITER_INIT(&root_uiter); +		while (!ret && (root_node = ulist_next(roots, &root_uiter))) { +			pr_debug("root %llu references leaf %llu, data list " +				 "%#lx\n", root_node->val, ref_node->val, +				 ref_node->aux); +			ret = iterate_leaf_refs( +				(struct extent_inode_elem *)ref_node->aux, +				root_node->val, extent_item_objectid, +				iterate, ctx);  		} +		ulist_free(roots); +		roots = NULL;  	} -	ulist_free(refs); +	free_leaf_list(refs);  	ulist_free(roots);  out:  	if 
(!search_commit_root) { +		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);  		btrfs_put_delayed_seq(delayed_refs, &seq_elem);  		btrfs_end_transaction(trans, fs_info->extent_root);  	} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 57ea2e959e4..c18d8ac7b79 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **roots); +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **roots);  struct btrfs_data_container *init_data_container(u32 total_bytes);  struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9b9b15fd520..12394a90d60 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -24,6 +24,21 @@  #include "ordered-data.h"  #include "delayed-inode.h" +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero.  When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE		0 +#define BTRFS_INODE_ORPHAN_META_RESERVED	1 +#define BTRFS_INODE_DUMMY			2 +#define BTRFS_INODE_IN_DEFRAG			3 +#define BTRFS_INODE_DELALLOC_META_RESERVED	4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM		5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT		6 +  /* in memory btrfs inode */  struct btrfs_inode {  	/* which subvolume this inode belongs to */ @@ -57,9 +72,6 @@ struct btrfs_inode {  	/* used to order data wrt metadata */  	struct btrfs_ordered_inode_tree ordered_tree; -	/* for keeping track of orphaned inodes */ -	struct list_head i_orphan; -  	/* list of all the delalloc inodes in the FS.  There are times we need  	 * to write all the delalloc pages to disk, and this list is used  	 * to walk them all. @@ -78,14 +90,13 @@ struct btrfs_inode {  	/* the space_info for where this inode's data allocations are done */  	struct btrfs_space_info *space_info; +	unsigned long runtime_flags; +  	/* full 64 bit generation number, struct vfs_inode doesn't have a big  	 * enough field for this.  	 */  	u64 generation; -	/* sequence number for NFS changes */ -	u64 sequence; -  	/*  	 * transid of the trans_handle that last modified this inode  	 */ @@ -145,22 +156,9 @@ struct btrfs_inode {  	unsigned reserved_extents;  	/* -	 * ordered_data_close is set by truncate when a file that used -	 * to have good data has been truncated to zero.  When it is set -	 * the btrfs file release call will add this inode to the -	 * ordered operations list so that we make sure to flush out any -	 * new data the application may have written before commit. 
-	 */ -	unsigned ordered_data_close:1; -	unsigned orphan_meta_reserved:1; -	unsigned dummy_inode:1; -	unsigned in_defrag:1; -	unsigned delalloc_meta_reserved:1; - -	/*  	 * always compress this one file  	 */ -	unsigned force_compress:4; +	unsigned force_compress;  	struct btrfs_delayed_node *delayed_node; @@ -202,4 +200,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,  	return false;  } +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret = 0; + +	mutex_lock(&root->log_mutex); +	if (BTRFS_I(inode)->logged_trans == generation && +	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) +		ret = 1; +	mutex_unlock(&root->log_mutex); +	return ret; +} +  #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index c053e90f200..da6e9364a5e 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@  #include "print-tree.h"  #include "locking.h"  #include "check-integrity.h" +#include "rcu-string.h"  #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000  #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -103,8 +104,6 @@  #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300  #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,  							 * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE -  #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)  /* @@ -210,8 +209,9 @@ struct btrfsic_block_data_ctx {  	u64 dev_bytenr;		/* physical bytenr on device */  	u32 len;  	struct btrfsic_dev_state *dev; -	char *data; -	struct buffer_head *bh;	/* do not use if set to NULL */ +	char **datav; +	struct page **pagev; +	void *mem_to_free;  };  /* This structure is used to implement recursion without occupying @@ -243,6 +243,8 @@ struct btrfsic_state {  	struct btrfs_root *root;  	u64 max_superblock_generation;  	struct btrfsic_block *latest_superblock; +	u32 metablock_size; +	u32 datablock_size;  };  static void btrfsic_block_init(struct btrfsic_block *b); @@ -290,8 +292,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  static int btrfsic_process_metablock(struct btrfsic_state *state,  				     struct btrfsic_block *block,  				     struct btrfsic_block_data_ctx *block_ctx, -				     struct btrfs_header *hdr,  				     int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( +	struct btrfsic_block_data_ctx *block_ctx, +	void *dst, u32 offset, size_t len);  static int btrfsic_create_link_to_next_block(  		struct btrfsic_state *state,  		struct btrfsic_block *block, @@ -318,12 +322,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);  static int btrfsic_read_block(struct btrfsic_state *state,  			      struct btrfsic_block_data_ctx *block_ctx);  static void btrfsic_dump_database(struct btrfsic_state *state); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err);  static int btrfsic_test_for_metadata(struct btrfsic_state *state, -				     const u8 *data, unsigned int size); +				     char **datav, unsigned int num_pages);  static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, -					  u64 dev_bytenr, u8 *mapped_data, -					  unsigned int len, struct bio *bio, -					  int *bio_is_patched, +					  u64 dev_bytenr, char **mapped_datav, +					  unsigned int num_pages, +					  struct bio *bio, int *bio_is_patched,  					  struct buffer_head *bh,  					  int submit_bio_bh_rw);  static int 
btrfsic_process_written_superblock( @@ -375,7 +380,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(  static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,  					   u64 bytenr,  					   struct btrfsic_dev_state *dev_state, -					   u64 dev_bytenr, char *data); +					   u64 dev_bytenr);  static struct mutex btrfsic_mutex;  static int btrfsic_is_initialized; @@ -651,7 +656,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  	int pass;  	BUG_ON(NULL == state); -	selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); +	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);  	if (NULL == selected_super) {  		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");  		return -1; @@ -718,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  		num_copies =  		    btrfs_num_copies(&state->root->fs_info->mapping_tree, -				     next_bytenr, PAGE_SIZE); +				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",  			       (unsigned long long)next_bytenr, num_copies); @@ -727,9 +732,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  			struct btrfsic_block *next_block;  			struct btrfsic_block_data_ctx tmp_next_block_ctx;  			struct btrfsic_block_link *l; -			struct btrfs_header *hdr; -			ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, +			ret = btrfsic_map_block(state, next_bytenr, +						state->metablock_size,  						&tmp_next_block_ctx,  						mirror_num);  			if (ret) { @@ -758,7 +763,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  			BUG_ON(NULL == l);  			ret = btrfsic_read_block(state, &tmp_next_block_ctx); -			if (ret < (int)BTRFSIC_BLOCK_SIZE) { +			if (ret < (int)PAGE_CACHE_SIZE) {  				printk(KERN_INFO  				       "btrfsic: read @logical %llu failed!\n",  				       (unsigned long long) @@ -768,11 +773,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  				return -1;  			} -			hdr = (struct btrfs_header *)tmp_next_block_ctx.data;  			ret = btrfsic_process_metablock(state,  							next_block,  							&tmp_next_block_ctx, -							hdr,  							BTRFS_MAX_LEVEL + 3, 1);  			btrfsic_release_block_ctx(&tmp_next_block_ctx);  		} @@ -799,7 +802,10 @@ static int btrfsic_process_superblock_dev_mirror(  	/* super block bytenr is always the unmapped device bytenr */  	dev_bytenr = btrfs_sb_offset(superblock_mirror_num); -	bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); +	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) +		return -1; +	bh = __bread(superblock_bdev, dev_bytenr / 4096, +		     BTRFS_SUPER_INFO_SIZE);  	if (NULL == bh)  		return -1;  	super_tmp = (struct btrfs_super_block *) @@ -808,7 +814,10 @@ static int btrfsic_process_superblock_dev_mirror(  	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||  	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,  		    sizeof(super_tmp->magic)) || -	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { +	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || +	    btrfs_super_nodesize(super_tmp) != state->metablock_size || +	    btrfs_super_leafsize(super_tmp) != state->metablock_size || +	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {  		brelse(bh);  		return 0;  	} @@ -835,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror(  		superblock_tmp->never_written = 0;  		superblock_tmp->mirror_num = 1 + 
superblock_mirror_num;  		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) -			printk(KERN_INFO "New initial S-block (bdev %p, %s)" -			       " @%llu (%s/%llu/%d)\n", -			       superblock_bdev, device->name, -			       (unsigned long long)dev_bytenr, -			       dev_state->name, -			       (unsigned long long)dev_bytenr, -			       superblock_mirror_num); +			printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" +				     " @%llu (%s/%llu/%d)\n", +				     superblock_bdev, +				     rcu_str_deref(device->name), +				     (unsigned long long)dev_bytenr, +				     dev_state->name, +				     (unsigned long long)dev_bytenr, +				     superblock_mirror_num);  		list_add(&superblock_tmp->all_blocks_node,  			 &state->all_blocks_list);  		btrfsic_block_hashtable_add(superblock_tmp, @@ -893,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(  		num_copies =  		    btrfs_num_copies(&state->root->fs_info->mapping_tree, -				     next_bytenr, PAGE_SIZE); +				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",  			       (unsigned long long)next_bytenr, num_copies); @@ -902,7 +912,8 @@ static int btrfsic_process_superblock_dev_mirror(  			struct btrfsic_block_data_ctx tmp_next_block_ctx;  			struct btrfsic_block_link *l; -			if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, +			if (btrfsic_map_block(state, next_bytenr, +					      state->metablock_size,  					      &tmp_next_block_ctx,  					      mirror_num)) {  				printk(KERN_INFO "btrfsic: btrfsic_map_block(" @@ -966,13 +977,15 @@ static int btrfsic_process_metablock(  		struct btrfsic_state *state,  		struct btrfsic_block *const first_block,  		struct btrfsic_block_data_ctx *const first_block_ctx, -		struct btrfs_header *const first_hdr,  		int first_limit_nesting, int force_iodone_flag)  {  	struct btrfsic_stack_frame initial_stack_frame = { 0 };  	struct btrfsic_stack_frame *sf;  	struct btrfsic_stack_frame *next_stack; +	struct btrfs_header *const first_hdr = +		(struct btrfs_header *)first_block_ctx->datav[0]; +	BUG_ON(!first_hdr);  	sf = &initial_stack_frame;  	sf->error = 0;  	sf->i = -1; @@ -1012,21 +1025,47 @@ continue_with_current_leaf_stack_frame:  		}  		if (sf->i < sf->nr) { -			struct btrfs_item *disk_item = leafhdr->items + sf->i; -			struct btrfs_disk_key *disk_key = &disk_item->key; +			struct btrfs_item disk_item; +			u32 disk_item_offset = +				(uintptr_t)(leafhdr->items + sf->i) - +				(uintptr_t)leafhdr; +			struct btrfs_disk_key *disk_key;  			u8 type; -			const u32 item_offset = le32_to_cpu(disk_item->offset); +			u32 item_offset; +			if (disk_item_offset + sizeof(struct btrfs_item) > +			    sf->block_ctx->len) { +leaf_item_out_of_bounce_error: +				printk(KERN_INFO +				       "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", +				       sf->block_ctx->start, +				       sf->block_ctx->dev->name); +				goto one_stack_frame_backwards; +			} +			btrfsic_read_from_block_data(sf->block_ctx, +						     &disk_item, +						     disk_item_offset, +						     sizeof(struct btrfs_item)); +			item_offset = le32_to_cpu(disk_item.offset); +			disk_key = &disk_item.key;  			type = disk_key->type;  			if (BTRFS_ROOT_ITEM_KEY == type) { -				const struct btrfs_root_item *const root_item = -				    (struct btrfs_root_item *) -				    (sf->block_ctx->data + -				     offsetof(struct btrfs_leaf, items) + -				     item_offset); -				const u64 next_bytenr = -				    
le64_to_cpu(root_item->bytenr); +				struct btrfs_root_item root_item; +				u32 root_item_offset; +				u64 next_bytenr; + +				root_item_offset = item_offset + +					offsetof(struct btrfs_leaf, items); +				if (root_item_offset + +				    sizeof(struct btrfs_root_item) > +				    sf->block_ctx->len) +					goto leaf_item_out_of_bounce_error; +				btrfsic_read_from_block_data( +					sf->block_ctx, &root_item, +					root_item_offset, +					sizeof(struct btrfs_root_item)); +				next_bytenr = le64_to_cpu(root_item.bytenr);  				sf->error =  				    btrfsic_create_link_to_next_block( @@ -1041,7 +1080,7 @@ continue_with_current_leaf_stack_frame:  						&sf->num_copies,  						&sf->mirror_num,  						disk_key, -						le64_to_cpu(root_item-> +						le64_to_cpu(root_item.  						generation));  				if (sf->error)  					goto one_stack_frame_backwards; @@ -1049,7 +1088,7 @@ continue_with_current_leaf_stack_frame:  				if (NULL != sf->next_block) {  					struct btrfs_header *const next_hdr =  					    (struct btrfs_header *) -					    sf->next_block_ctx.data; +					    sf->next_block_ctx.datav[0];  					next_stack =  					    btrfsic_stack_frame_alloc(); @@ -1111,10 +1150,24 @@ continue_with_current_node_stack_frame:  		}  		if (sf->i < sf->nr) { -			struct btrfs_key_ptr *disk_key_ptr = -			    nodehdr->ptrs + sf->i; -			const u64 next_bytenr = -			    le64_to_cpu(disk_key_ptr->blockptr); +			struct btrfs_key_ptr key_ptr; +			u32 key_ptr_offset; +			u64 next_bytenr; + +			key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - +					  (uintptr_t)nodehdr; +			if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > +			    sf->block_ctx->len) { +				printk(KERN_INFO +				       "btrfsic: node item out of bounce at logical %llu, dev %s\n", +				       sf->block_ctx->start, +				       sf->block_ctx->dev->name); +				goto one_stack_frame_backwards; +			} +			btrfsic_read_from_block_data( +				sf->block_ctx, &key_ptr, key_ptr_offset, +				sizeof(struct btrfs_key_ptr)); +			next_bytenr = le64_to_cpu(key_ptr.blockptr);  			sf->error = btrfsic_create_link_to_next_block(  					state, @@ -1127,15 +1180,15 @@ continue_with_current_node_stack_frame:  					force_iodone_flag,  					&sf->num_copies,  					&sf->mirror_num, -					&disk_key_ptr->key, -					le64_to_cpu(disk_key_ptr->generation)); +					&key_ptr.key, +					le64_to_cpu(key_ptr.generation));  			if (sf->error)  				goto one_stack_frame_backwards;  			if (NULL != sf->next_block) {  				struct btrfs_header *const next_hdr =  				    (struct btrfs_header *) -				    sf->next_block_ctx.data; +				    sf->next_block_ctx.datav[0];  				next_stack = btrfsic_stack_frame_alloc();  				if (NULL == next_stack) @@ -1181,6 +1234,35 @@ one_stack_frame_backwards:  	return sf->error;  } +static void btrfsic_read_from_block_data( +	struct btrfsic_block_data_ctx *block_ctx, +	void *dstv, u32 offset, size_t len) +{ +	size_t cur; +	size_t offset_in_page; +	char *kaddr; +	char *dst = (char *)dstv; +	size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); +	unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + +	WARN_ON(offset + len > block_ctx->len); +	offset_in_page = (start_offset + offset) & +			 ((unsigned long)PAGE_CACHE_SIZE - 1); + +	while (len > 0) { +		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); +		BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> +			    PAGE_CACHE_SHIFT); +		kaddr = block_ctx->datav[i]; +		memcpy(dst, kaddr + offset_in_page, cur); + +		dst += cur; +		len -= cur; +		offset_in_page = 0; +		i++; +	} +} +  static int 
btrfsic_create_link_to_next_block(  		struct btrfsic_state *state,  		struct btrfsic_block *block, @@ -1204,7 +1286,7 @@ static int btrfsic_create_link_to_next_block(  	if (0 == *num_copiesp) {  		*num_copiesp =  		    btrfs_num_copies(&state->root->fs_info->mapping_tree, -				     next_bytenr, PAGE_SIZE); +				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",  			       (unsigned long long)next_bytenr, *num_copiesp); @@ -1219,7 +1301,7 @@ static int btrfsic_create_link_to_next_block(  		       "btrfsic_create_link_to_next_block(mirror_num=%d)\n",  		       *mirror_nump);  	ret = btrfsic_map_block(state, next_bytenr, -				BTRFSIC_BLOCK_SIZE, +				state->metablock_size,  				next_block_ctx, *mirror_nump);  	if (ret) {  		printk(KERN_INFO @@ -1314,7 +1396,7 @@ static int btrfsic_create_link_to_next_block(  	if (limit_nesting > 0 && did_alloc_block_link) {  		ret = btrfsic_read_block(state, next_block_ctx); -		if (ret < (int)BTRFSIC_BLOCK_SIZE) { +		if (ret < (int)next_block_ctx->len) {  			printk(KERN_INFO  			       "btrfsic: read block @logical %llu failed!\n",  			       (unsigned long long)next_bytenr); @@ -1339,43 +1421,74 @@ static int btrfsic_handle_extent_data(  		u32 item_offset, int force_iodone_flag)  {  	int ret; -	struct btrfs_file_extent_item *file_extent_item = -	    (struct btrfs_file_extent_item *)(block_ctx->data + -					      offsetof(struct btrfs_leaf, -						       items) + item_offset); -	u64 next_bytenr = -	    le64_to_cpu(file_extent_item->disk_bytenr) + -	    le64_to_cpu(file_extent_item->offset); -	u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); -	u64 generation = le64_to_cpu(file_extent_item->generation); +	struct btrfs_file_extent_item file_extent_item; +	u64 file_extent_item_offset; +	u64 next_bytenr; +	u64 num_bytes; +	u64 generation;  	struct btrfsic_block_link *l; +	file_extent_item_offset = offsetof(struct btrfs_leaf, items) + +				  item_offset; +	if (file_extent_item_offset + +	    offsetof(struct btrfs_file_extent_item, disk_num_bytes) > +	    block_ctx->len) { +		printk(KERN_INFO +		       "btrfsic: file item out of bounce at logical %llu, dev %s\n", +		       block_ctx->start, block_ctx->dev->name); +		return -1; +	} + +	btrfsic_read_from_block_data(block_ctx, &file_extent_item, +		file_extent_item_offset, +		offsetof(struct btrfs_file_extent_item, disk_num_bytes)); +	if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || +	    ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { +		if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) +			printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", +			       file_extent_item.type, +			       (unsigned long long) +			       le64_to_cpu(file_extent_item.disk_bytenr)); +		return 0; +	} + +	if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > +	    block_ctx->len) { +		printk(KERN_INFO +		       "btrfsic: file item out of bounce at logical %llu, dev %s\n", +		       block_ctx->start, block_ctx->dev->name); +		return -1; +	} +	btrfsic_read_from_block_data(block_ctx, &file_extent_item, +				     file_extent_item_offset, +				     sizeof(struct btrfs_file_extent_item)); +	next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + +		      le64_to_cpu(file_extent_item.offset); +	generation = le64_to_cpu(file_extent_item.generation); +	num_bytes = le64_to_cpu(file_extent_item.num_bytes); +	generation = le64_to_cpu(file_extent_item.generation); +  	if (state->print_mask & 
BTRFSIC_PRINT_MASK_VERY_VERBOSE)  		printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"  		       " offset = %llu, num_bytes = %llu\n", -		       file_extent_item->type, -		       (unsigned long long) -		       le64_to_cpu(file_extent_item->disk_bytenr), -		       (unsigned long long) -		       le64_to_cpu(file_extent_item->offset), +		       file_extent_item.type,  		       (unsigned long long) -		       le64_to_cpu(file_extent_item->num_bytes)); -	if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || -	    ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) -		return 0; +		       le64_to_cpu(file_extent_item.disk_bytenr), +		       (unsigned long long)le64_to_cpu(file_extent_item.offset), +		       (unsigned long long)num_bytes);  	while (num_bytes > 0) {  		u32 chunk_len;  		int num_copies;  		int mirror_num; -		if (num_bytes > BTRFSIC_BLOCK_SIZE) -			chunk_len = BTRFSIC_BLOCK_SIZE; +		if (num_bytes > state->datablock_size) +			chunk_len = state->datablock_size;  		else  			chunk_len = num_bytes;  		num_copies =  		    btrfs_num_copies(&state->root->fs_info->mapping_tree, -				     next_bytenr, PAGE_SIZE); +				     next_bytenr, state->datablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",  			       (unsigned long long)next_bytenr, num_copies); @@ -1475,8 +1588,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,  	block_ctx_out->dev_bytenr = multi->stripes[0].physical;  	block_ctx_out->start = bytenr;  	block_ctx_out->len = len; -	block_ctx_out->data = NULL; -	block_ctx_out->bh = NULL; +	block_ctx_out->datav = NULL; +	block_ctx_out->pagev = NULL; +	block_ctx_out->mem_to_free = NULL;  	if (0 == ret)  		kfree(multi); @@ -1496,8 +1610,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,  	block_ctx_out->dev_bytenr = bytenr;  	block_ctx_out->start = bytenr;  	block_ctx_out->len = len; -	block_ctx_out->data = NULL; -	block_ctx_out->bh = NULL; +	block_ctx_out->datav = NULL; +	block_ctx_out->pagev = NULL; +	block_ctx_out->mem_to_free = NULL;  	if (NULL != block_ctx_out->dev) {  		return 0;  	} else { @@ -1508,38 +1623,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,  static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)  { -	if (NULL != block_ctx->bh) { -		brelse(block_ctx->bh); -		block_ctx->bh = NULL; +	if (block_ctx->mem_to_free) { +		unsigned int num_pages; + +		BUG_ON(!block_ctx->datav); +		BUG_ON(!block_ctx->pagev); +		num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> +			    PAGE_CACHE_SHIFT; +		while (num_pages > 0) { +			num_pages--; +			if (block_ctx->datav[num_pages]) { +				kunmap(block_ctx->pagev[num_pages]); +				block_ctx->datav[num_pages] = NULL; +			} +			if (block_ctx->pagev[num_pages]) { +				__free_page(block_ctx->pagev[num_pages]); +				block_ctx->pagev[num_pages] = NULL; +			} +		} + +		kfree(block_ctx->mem_to_free); +		block_ctx->mem_to_free = NULL; +		block_ctx->pagev = NULL; +		block_ctx->datav = NULL;  	}  }  static int btrfsic_read_block(struct btrfsic_state *state,  			      struct btrfsic_block_data_ctx *block_ctx)  { -	block_ctx->bh = NULL; -	if (block_ctx->dev_bytenr & 4095) { +	unsigned int num_pages; +	unsigned int i; +	u64 dev_bytenr; +	int ret; + +	BUG_ON(block_ctx->datav); +	BUG_ON(block_ctx->pagev); +	BUG_ON(block_ctx->mem_to_free); +	if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {  		printk(KERN_INFO  		       "btrfsic: 
read_block() with unaligned bytenr %llu\n",  		       (unsigned long long)block_ctx->dev_bytenr);  		return -1;  	} -	if (block_ctx->len > 4096) { -		printk(KERN_INFO -		       "btrfsic: read_block() with too huge size %d\n", -		       block_ctx->len); + +	num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> +		    PAGE_CACHE_SHIFT; +	block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + +					  sizeof(*block_ctx->pagev)) * +					 num_pages, GFP_NOFS); +	if (!block_ctx->mem_to_free)  		return -1; +	block_ctx->datav = block_ctx->mem_to_free; +	block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); +	for (i = 0; i < num_pages; i++) { +		block_ctx->pagev[i] = alloc_page(GFP_NOFS); +		if (!block_ctx->pagev[i]) +			return -1;  	} -	block_ctx->bh = __bread(block_ctx->dev->bdev, -				block_ctx->dev_bytenr >> 12, 4096); -	if (NULL == block_ctx->bh) -		return -1; -	block_ctx->data = block_ctx->bh->b_data; +	dev_bytenr = block_ctx->dev_bytenr; +	for (i = 0; i < num_pages;) { +		struct bio *bio; +		unsigned int j; +		DECLARE_COMPLETION_ONSTACK(complete); + +		bio = bio_alloc(GFP_NOFS, num_pages - i); +		if (!bio) { +			printk(KERN_INFO +			       "btrfsic: bio_alloc() for %u pages failed!\n", +			       num_pages - i); +			return -1; +		} +		bio->bi_bdev = block_ctx->dev->bdev; +		bio->bi_sector = dev_bytenr >> 9; +		bio->bi_end_io = btrfsic_complete_bio_end_io; +		bio->bi_private = &complete; + +		for (j = i; j < num_pages; j++) { +			ret = bio_add_page(bio, block_ctx->pagev[j], +					   PAGE_CACHE_SIZE, 0); +			if (PAGE_CACHE_SIZE != ret) +				break; +		} +		if (j == i) { +			printk(KERN_INFO +			       "btrfsic: error, failed to add a single page!\n"); +			return -1; +		} +		submit_bio(READ, bio); + +		/* this will also unplug the queue */ +		wait_for_completion(&complete); + +		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { +			printk(KERN_INFO +			       "btrfsic: read error at logical %llu dev %s!\n", +			       block_ctx->start, block_ctx->dev->name); +			bio_put(bio); +			return -1; +		} +		bio_put(bio); +		dev_bytenr += (j - i) * PAGE_CACHE_SIZE; +		i = j; +	} +	for (i = 0; i < num_pages; i++) { +		block_ctx->datav[i] = kmap(block_ctx->pagev[i]); +		if (!block_ctx->datav[i]) { +			printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", +			       block_ctx->dev->name); +			return -1; +		} +	}  	return block_ctx->len;  } +static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +{ +	complete((struct completion *)bio->bi_private); +} +  static void btrfsic_dump_database(struct btrfsic_state *state)  {  	struct list_head *elem_all; @@ -1617,32 +1821,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)   * (note that this test fails for the super block)   */  static int btrfsic_test_for_metadata(struct btrfsic_state *state, -				     const u8 *data, unsigned int size) +				     char **datav, unsigned int num_pages)  {  	struct btrfs_header *h;  	u8 csum[BTRFS_CSUM_SIZE];  	u32 crc = ~(u32)0; -	int fail = 0; -	int crc_fail = 0; +	unsigned int i; -	h = (struct btrfs_header *)data; +	if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) +		return 1; /* not metadata */ +	num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; +	h = (struct btrfs_header *)datav[0];  	if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) -		fail++; +		return 1; -	crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); +	for (i = 0; i < num_pages; i++) { +		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); +		size_t sublen = i ? 
PAGE_CACHE_SIZE : +				    (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + +		crc = crc32c(crc, data, sublen); +	}  	btrfs_csum_final(crc, csum);  	if (memcmp(csum, h->csum, state->csum_size)) -		crc_fail++; +		return 1; -	return fail || crc_fail; +	return 0; /* is metadata */  }  static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, -					  u64 dev_bytenr, -					  u8 *mapped_data, unsigned int len, -					  struct bio *bio, -					  int *bio_is_patched, +					  u64 dev_bytenr, char **mapped_datav, +					  unsigned int num_pages, +					  struct bio *bio, int *bio_is_patched,  					  struct buffer_head *bh,  					  int submit_bio_bh_rw)  { @@ -1652,12 +1863,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  	int ret;  	struct btrfsic_state *state = dev_state->state;  	struct block_device *bdev = dev_state->bdev; +	unsigned int processed_len; -	WARN_ON(len > PAGE_SIZE); -	is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));  	if (NULL != bio_is_patched)  		*bio_is_patched = 0; +again: +	if (num_pages == 0) +		return; + +	processed_len = 0; +	is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, +						      num_pages)); +  	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,  					       &state->block_hashtable);  	if (NULL != block) { @@ -1667,8 +1885,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		if (block->is_superblock) {  			bytenr = le64_to_cpu(((struct btrfs_super_block *) -					      mapped_data)->bytenr); +					      mapped_datav[0])->bytenr); +			if (num_pages * PAGE_CACHE_SIZE < +			    BTRFS_SUPER_INFO_SIZE) { +				printk(KERN_INFO +				       "btrfsic: cannot work with too short bios!\n"); +				return; +			}  			is_metadata = 1; +			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); +			processed_len = BTRFS_SUPER_INFO_SIZE;  			if (state->print_mask &  			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {  				printk(KERN_INFO @@ -1678,12 +1904,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		}  		if (is_metadata) {  			if (!block->is_superblock) { +				if (num_pages * PAGE_CACHE_SIZE < +				    state->metablock_size) { +					printk(KERN_INFO +					       "btrfsic: cannot work with too short bios!\n"); +					return; +				} +				processed_len = state->metablock_size;  				bytenr = le64_to_cpu(((struct btrfs_header *) -						      mapped_data)->bytenr); +						      mapped_datav[0])->bytenr);  				btrfsic_cmp_log_and_dev_bytenr(state, bytenr,  							       dev_state, -							       dev_bytenr, -							       mapped_data); +							       dev_bytenr);  			}  			if (block->logical_bytenr != bytenr) {  				printk(KERN_INFO @@ -1710,6 +1942,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  				       block->mirror_num,  				       btrfsic_get_block_type(state, block));  		} else { +			if (num_pages * PAGE_CACHE_SIZE < +			    state->datablock_size) { +				printk(KERN_INFO +				       "btrfsic: cannot work with too short bios!\n"); +				return; +			} +			processed_len = state->datablock_size;  			bytenr = block->logical_bytenr;  			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)  				printk(KERN_INFO @@ -1747,7 +1986,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  			       le64_to_cpu(block->disk_key.offset),  			       (unsigned long long)  			       le64_to_cpu(((struct btrfs_header *) -					    mapped_data)->generation), +					    
mapped_datav[0])->generation),  			       (unsigned long long)  			       state->max_superblock_generation);  			btrfsic_dump_tree(state); @@ -1765,10 +2004,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  			       (unsigned long long)block->generation,  			       (unsigned long long)  			       le64_to_cpu(((struct btrfs_header *) -					    mapped_data)->generation)); +					    mapped_datav[0])->generation));  			/* it would not be safe to go on */  			btrfsic_dump_tree(state); -			return; +			goto continue_loop;  		}  		/* @@ -1796,18 +2035,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		}  		if (block->is_superblock) -			ret = btrfsic_map_superblock(state, bytenr, len, +			ret = btrfsic_map_superblock(state, bytenr, +						     processed_len,  						     bdev, &block_ctx);  		else -			ret = btrfsic_map_block(state, bytenr, len, +			ret = btrfsic_map_block(state, bytenr, processed_len,  						&block_ctx, 0);  		if (ret) {  			printk(KERN_INFO  			       "btrfsic: btrfsic_map_block(root @%llu)"  			       " failed!\n", (unsigned long long)bytenr); -			return; +			goto continue_loop;  		} -		block_ctx.data = mapped_data; +		block_ctx.datav = mapped_datav;  		/* the following is required in case of writes to mirrors,  		 * use the same that was used for the lookup */  		block_ctx.dev = dev_state; @@ -1863,11 +2103,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  			block->logical_bytenr = bytenr;  			block->is_metadata = 1;  			if (block->is_superblock) { +				BUG_ON(PAGE_CACHE_SIZE != +				       BTRFS_SUPER_INFO_SIZE);  				ret = btrfsic_process_written_superblock(  						state,  						block,  						(struct btrfs_super_block *) -						mapped_data); +						mapped_datav[0]);  				if (state->print_mask &  				    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {  					printk(KERN_INFO @@ -1880,8 +2122,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  						state,  						block,  						&block_ctx, -						(struct btrfs_header *) -						block_ctx.data,  						0, 0);  			}  			if (ret) @@ -1912,26 +2152,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		u64 bytenr;  		if (!is_metadata) { +			processed_len = state->datablock_size;  			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)  				printk(KERN_INFO "Written block (%s/%llu/?)"  				       " !found in hash table, D.\n",  				       dev_state->name,  				       (unsigned long long)dev_bytenr); -			if (!state->include_extent_data) -				return;	/* ignore that written D block */ +			if (!state->include_extent_data) { +				/* ignore that written D block */ +				goto continue_loop; +			}  			/* this is getting ugly for the  			 * include_extent_data case... 
*/  			bytenr = 0;	/* unknown */  			block_ctx.start = bytenr; -			block_ctx.len = len; -			block_ctx.bh = NULL; +			block_ctx.len = processed_len; +			block_ctx.mem_to_free = NULL; +			block_ctx.pagev = NULL;  		} else { +			processed_len = state->metablock_size;  			bytenr = le64_to_cpu(((struct btrfs_header *) -					      mapped_data)->bytenr); +					      mapped_datav[0])->bytenr);  			btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, -						       dev_bytenr, -						       mapped_data); +						       dev_bytenr);  			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)  				printk(KERN_INFO  				       "Written block @%llu (%s/%llu/?)" @@ -1940,17 +2184,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  				       dev_state->name,  				       (unsigned long long)dev_bytenr); -			ret = btrfsic_map_block(state, bytenr, len, &block_ctx, -						0); +			ret = btrfsic_map_block(state, bytenr, processed_len, +						&block_ctx, 0);  			if (ret) {  				printk(KERN_INFO  				       "btrfsic: btrfsic_map_block(root @%llu)"  				       " failed!\n",  				       (unsigned long long)dev_bytenr); -				return; +				goto continue_loop;  			}  		} -		block_ctx.data = mapped_data; +		block_ctx.datav = mapped_datav;  		/* the following is required in case of writes to mirrors,  		 * use the same that was used for the lookup */  		block_ctx.dev = dev_state; @@ -1960,7 +2204,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		if (NULL == block) {  			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");  			btrfsic_release_block_ctx(&block_ctx); -			return; +			goto continue_loop;  		}  		block->dev_state = dev_state;  		block->dev_bytenr = dev_bytenr; @@ -2020,9 +2264,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		if (is_metadata) {  			ret = btrfsic_process_metablock(state, block, -							&block_ctx, -							(struct btrfs_header *) -							block_ctx.data, 0, 0); +							&block_ctx, 0, 0);  			if (ret)  				printk(KERN_INFO  				       "btrfsic: process_metablock(root @%llu)" @@ -2031,6 +2273,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  		}  		btrfsic_release_block_ctx(&block_ctx);  	} + +continue_loop: +	BUG_ON(!processed_len); +	dev_bytenr += processed_len; +	mapped_datav += processed_len >> PAGE_CACHE_SHIFT; +	num_pages -= processed_len >> PAGE_CACHE_SHIFT; +	goto again;  }  static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) @@ -2213,7 +2462,7 @@ static int btrfsic_process_written_superblock(  		num_copies =  		    btrfs_num_copies(&state->root->fs_info->mapping_tree, -				     next_bytenr, PAGE_SIZE); +				     next_bytenr, BTRFS_SUPER_INFO_SIZE);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",  			       (unsigned long long)next_bytenr, num_copies); @@ -2224,7 +2473,8 @@ static int btrfsic_process_written_superblock(  				printk(KERN_INFO  				       "btrfsic_process_written_superblock("  				       "mirror_num=%d)\n", mirror_num); -			ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, +			ret = btrfsic_map_block(state, next_bytenr, +						BTRFS_SUPER_INFO_SIZE,  						&tmp_next_block_ctx,  						mirror_num);  			if (ret) { @@ -2689,7 +2939,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(  static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,  					   u64 bytenr,  					   struct btrfsic_dev_state *dev_state, -					   u64 
dev_bytenr, char *data) +					   u64 dev_bytenr)  {  	int num_copies;  	int mirror_num; @@ -2698,10 +2948,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,  	int match = 0;  	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, -				      bytenr, PAGE_SIZE); +				      bytenr, state->metablock_size);  	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { -		ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, +		ret = btrfsic_map_block(state, bytenr, state->metablock_size,  					&block_ctx, mirror_num);  		if (ret) {  			printk(KERN_INFO "btrfsic:" @@ -2727,7 +2977,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,  		       (unsigned long long)bytenr, dev_state->name,  		       (unsigned long long)dev_bytenr);  		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { -			ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, +			ret = btrfsic_map_block(state, bytenr, +						state->metablock_size,  						&block_ctx, mirror_num);  			if (ret)  				continue; @@ -2781,13 +3032,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)  			       (unsigned long)bh->b_size, bh->b_data,  			       bh->b_bdev);  		btrfsic_process_written_block(dev_state, dev_bytenr, -					      bh->b_data, bh->b_size, NULL, +					      &bh->b_data, 1, NULL,  					      NULL, bh, rw);  	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {  		if (dev_state->state->print_mask &  		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)  			printk(KERN_INFO -			       "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", +			       "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",  			       rw, bh->b_bdev);  		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {  			if ((dev_state->state->print_mask & @@ -2836,6 +3087,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)  		unsigned int i;  		u64 dev_bytenr;  		int bio_is_patched; +		char **mapped_datav;  		dev_bytenr = 512 * bio->bi_sector;  		bio_is_patched = 0; @@ -2848,35 +3100,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)  			       (unsigned long long)dev_bytenr,  			       bio->bi_bdev); +		mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, +				       GFP_NOFS); +		if (!mapped_datav) +			goto leave;  		for (i = 0; i < bio->bi_vcnt; i++) { -			u8 *mapped_data; - -			mapped_data = kmap(bio->bi_io_vec[i].bv_page); +			BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); +			mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); +			if (!mapped_datav[i]) { +				while (i > 0) { +					i--; +					kunmap(bio->bi_io_vec[i].bv_page); +				} +				kfree(mapped_datav); +				goto leave; +			}  			if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |  			     BTRFSIC_PRINT_MASK_VERBOSE) ==  			    (dev_state->state->print_mask &  			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |  			      BTRFSIC_PRINT_MASK_VERBOSE)))  				printk(KERN_INFO -				       "#%u: page=%p, mapped=%p, len=%u," -				       " offset=%u\n", +				       "#%u: page=%p, len=%u, offset=%u\n",  				       i, bio->bi_io_vec[i].bv_page, -				       mapped_data,  				       bio->bi_io_vec[i].bv_len,  				       bio->bi_io_vec[i].bv_offset); -			btrfsic_process_written_block(dev_state, dev_bytenr, -						      mapped_data, -						      bio->bi_io_vec[i].bv_len, -						      bio, &bio_is_patched, -						      NULL, rw); +		} +		btrfsic_process_written_block(dev_state, dev_bytenr, +					      mapped_datav, bio->bi_vcnt, +					      bio, &bio_is_patched, +					      NULL, rw); +		while (i > 0) { +			i--;  			kunmap(bio->bi_io_vec[i].bv_page); -			dev_bytenr += 
bio->bi_io_vec[i].bv_len;  		} +		kfree(mapped_datav);  	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {  		if (dev_state->state->print_mask &  		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)  			printk(KERN_INFO -			       "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", +			       "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",  			       rw, bio->bi_bdev);  		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {  			if ((dev_state->state->print_mask & @@ -2903,6 +3166,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)  			bio->bi_end_io = btrfsic_bio_end_io;  		}  	} +leave:  	mutex_unlock(&btrfsic_mutex);  	submit_bio(rw, bio); @@ -2917,6 +3181,30 @@ int btrfsic_mount(struct btrfs_root *root,  	struct list_head *dev_head = &fs_devices->devices;  	struct btrfs_device *device; +	if (root->nodesize != root->leafsize) { +		printk(KERN_INFO +		       "btrfsic: cannot handle nodesize %d != leafsize %d!\n", +		       root->nodesize, root->leafsize); +		return -1; +	} +	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { +		printk(KERN_INFO +		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", +		       root->nodesize, (unsigned long)PAGE_CACHE_SIZE); +		return -1; +	} +	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { +		printk(KERN_INFO +		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", +		       root->leafsize, (unsigned long)PAGE_CACHE_SIZE); +		return -1; +	} +	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { +		printk(KERN_INFO +		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", +		       root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); +		return -1; +	}  	state = kzalloc(sizeof(*state), GFP_NOFS);  	if (NULL == state) {  		printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); @@ -2933,6 +3221,8 @@ int btrfsic_mount(struct btrfs_root *root,  	state->print_mask = print_mask;  	state->include_extent_data = including_extent_data;  	state->csum_size = 0; +	state->metablock_size = root->nodesize; +	state->datablock_size = root->sectorsize;  	INIT_LIST_HEAD(&state->all_blocks_list);  	btrfsic_block_hashtable_init(&state->block_hashtable);  	btrfsic_block_link_hashtable_init(&state->block_link_hashtable); @@ -3049,7 +3339,7 @@ void btrfsic_unmount(struct btrfs_root *root,  				btrfsic_block_link_free(l);  		} -		if (b_all->is_iodone) +		if (b_all->is_iodone || b_all->never_written)  			btrfsic_block_free(b_all);  		else  			printk(KERN_INFO "btrfs: attempt to free %c-block" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264fbc6..8206b390058 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@  #include <linux/sched.h>  #include <linux/slab.h> +#include <linux/rbtree.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h" @@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *dst_buf,  			      struct extent_buffer *src_buf);  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		   struct btrfs_path *path, int level, int slot); +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, +					  u32 blocksize, u64 parent_transid, +					  u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, +						u64 
bytenr, u32 blocksize, +						u64 time_seq);  struct btrfs_path *btrfs_alloc_path(void)  { @@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,  				     new_root_objectid, &disk_key, level, -				     buf->start, 0, 1); +				     buf->start, 0);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -288,6 +298,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	return 0;  } +enum mod_log_op { +	MOD_LOG_KEY_REPLACE, +	MOD_LOG_KEY_ADD, +	MOD_LOG_KEY_REMOVE, +	MOD_LOG_KEY_REMOVE_WHILE_FREEING, +	MOD_LOG_KEY_REMOVE_WHILE_MOVING, +	MOD_LOG_MOVE_KEYS, +	MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { +	int dst_slot; +	int nr_items; +}; + +struct tree_mod_root { +	u64 logical; +	u8 level; +}; + +struct tree_mod_elem { +	struct rb_node node; +	u64 index;		/* shifted logical */ +	struct seq_list elem; +	enum mod_log_op op; + +	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ +	int slot; + +	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ +	u64 generation; + +	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ +	struct btrfs_disk_key key; +	u64 blockptr; + +	/* this is used for op == MOD_LOG_MOVE_KEYS */ +	struct tree_mod_move move; + +	/* this is used for op == MOD_LOG_ROOT_REPLACE */ +	struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ +	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); +	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	elem->flags = 1; +	spin_lock(&fs_info->tree_mod_seq_lock); +	__get_tree_mod_seq(fs_info, elem); +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct rb_node *next; +	struct seq_list *cur_elem; +	struct tree_mod_elem *tm; +	u64 min_seq = (u64)-1; +	u64 seq_putting = elem->seq; + +	if (!seq_putting) +		return; + +	BUG_ON(!(elem->flags & 1)); +	spin_lock(&fs_info->tree_mod_seq_lock); +	list_del(&elem->list); + +	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { +		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { +			if (seq_putting > cur_elem->seq) { +				/* +				 * blocker with lower sequence number exists, we +				 * cannot remove anything from the log +				 */ +				goto out; +			} +			min_seq = cur_elem->seq; +		} +	} + +	/* +	 * anything that's lower than the lowest existing (read: blocked) +	 * sequence number can be removed from the tree. +	 */ +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	for (node = rb_first(tm_root); node; node = next) { +		next = rb_next(node); +		tm = container_of(node, struct tree_mod_elem, node); +		if (tm->elem.seq > min_seq) +			continue; +		rb_erase(node, tm_root); +		list_del(&tm->elem.list); +		kfree(tm); +	} +	write_unlock(&fs_info->tree_mod_log_lock); +out: +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + *       index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. 
+ */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ +	struct rb_root *tm_root; +	struct rb_node **new; +	struct rb_node *parent = NULL; +	struct tree_mod_elem *cur; +	int ret = 0; + +	BUG_ON(!tm || !tm->elem.seq); + +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	new = &tm_root->rb_node; +	while (*new) { +		cur = container_of(*new, struct tree_mod_elem, node); +		parent = *new; +		if (cur->index < tm->index) +			new = &((*new)->rb_left); +		else if (cur->index > tm->index) +			new = &((*new)->rb_right); +		else if (cur->elem.seq < tm->elem.seq) +			new = &((*new)->rb_left); +		else if (cur->elem.seq > tm->elem.seq) +			new = &((*new)->rb_right); +		else { +			kfree(tm); +			ret = -EEXIST; +			goto unlock; +		} +	} + +	rb_link_node(&tm->node, parent, new); +	rb_insert_color(&tm->node, tm_root); +unlock: +	write_unlock(&fs_info->tree_mod_log_lock); +	return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, +				    struct extent_buffer *eb) { +	smp_mb(); +	if (list_empty(&(fs_info)->tree_mod_seq_list)) +		return 1; +	if (!eb) +		return 0; +	if (btrfs_header_level(eb) == 0) +		return 1; +	return 0; +} + +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. + */ +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, +				 struct tree_mod_elem **tm_ret) +{ +	struct tree_mod_elem *tm; +	int seq; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return 0; + +	tm = *tm_ret = kzalloc(sizeof(*tm), flags); +	if (!tm) +		return -ENOMEM; + +	tm->elem.flags = 0; +	spin_lock(&fs_info->tree_mod_seq_lock); +	if (list_empty(&fs_info->tree_mod_seq_list)) { +		/* +		 * someone emptied the list while we were waiting for the lock. +		 * we must not add to the list, because no blocker exists. items +		 * are removed from the list only when the existing blocker is +		 * removed from the list. 
+		 */ +		kfree(tm); +		seq = 0; +		spin_unlock(&fs_info->tree_mod_seq_lock); +	} else { +		__get_tree_mod_seq(fs_info, &tm->elem); +		seq = tm->elem.seq; +	} + +	return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, +			     struct extent_buffer *eb, int slot, +			     enum mod_log_op op, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	if (op != MOD_LOG_KEY_ADD) { +		btrfs_node_key(eb, &tm->key, slot); +		tm->blockptr = btrfs_node_blockptr(eb, slot); +	} +	tm->op = op; +	tm->slot = slot; +	tm->generation = btrfs_node_ptr_generation(eb, slot); + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +			int slot, enum mod_log_op op) +{ +	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *eb, int dst_slot, int src_slot, +			 int nr_items, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, eb)) +		return 0; + +	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, +					      MOD_LOG_KEY_REMOVE_WHILE_MOVING); +		BUG_ON(ret < 0); +	} + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	tm->slot = src_slot; +	tm->move.dst_slot = dst_slot; +	tm->move.nr_items = nr_items; +	tm->op = MOD_LOG_MOVE_KEYS; + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *old_root, +			 struct extent_buffer *new_root, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = new_root->start >> PAGE_CACHE_SHIFT; +	tm->old_root.logical = old_root->start; +	tm->old_root.level = btrfs_header_level(old_root); +	tm->generation = btrfs_header_generation(old_root); +	tm->op = MOD_LOG_ROOT_REPLACE; + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, +		      int smallest) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct tree_mod_elem *cur = NULL; +	struct tree_mod_elem *found = NULL; +	u64 index = start >> PAGE_CACHE_SHIFT; + +	read_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	node = tm_root->rb_node; +	while (node) { +		cur = container_of(node, struct tree_mod_elem, node); +		if (cur->index < index) { +			node = node->rb_left; +		} else if (cur->index > index) { +			node = node->rb_right; +		} else if (cur->elem.seq < min_seq) { +			node = node->rb_left; +		} else if (!smallest) { +			/* we want the node with the highest seq */ +			if (found) +				BUG_ON(found->elem.seq > cur->elem.seq); +			found = cur; +			node = node->rb_left; +		} else if (cur->elem.seq > min_seq) { +			/* we want the node with the smallest seq */ +			if (found) +				BUG_ON(found->elem.seq < cur->elem.seq); +			found = cur; +			node = node->rb_right; +		} else { 
+			found = cur; +			break; +		} +	} +	read_unlock(&fs_info->tree_mod_log_lock); + +	return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, +			   u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     struct extent_buffer *src, unsigned long dst_offset, +		     unsigned long src_offset, int nr_items) +{ +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return; + +	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) +		return; + +	/* speed this up by single seq for all operations? */ +	for (i = 0; i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0); +		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     int dst_offset, int src_offset, int nr_items) +{ +	int ret; +	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, +				       nr_items, GFP_NOFS); +	BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, +			  struct extent_buffer *eb, +			  struct btrfs_disk_key *disk_key, int slot, int atomic) +{ +	int ret; + +	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, +					   MOD_LOG_KEY_REPLACE, +					   atomic ? 
GFP_ATOMIC : GFP_NOFS); +	BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb) +{ +	int i; +	int ret; +	u32 nritems; + +	if (tree_mod_dont_log(fs_info, eb)) +		return; + +	nritems = btrfs_header_nritems(eb); +	for (i = nritems - 1; i >= 0; i--) { +		ret = tree_mod_log_insert_key(fs_info, eb, i, +					      MOD_LOG_KEY_REMOVE_WHILE_FREEING); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, +			      struct extent_buffer *new_root_node) +{ +	int ret; +	tree_mod_log_free_eb(root->fs_info, root->node); +	ret = tree_mod_log_insert_root(root->fs_info, root->node, +				       new_root_node, GFP_NOFS); +	BUG_ON(ret < 0); +} +  /*   * check if the tree block can be shared by multiple trees   */ @@ -409,6 +862,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,  			ret = btrfs_dec_ref(trans, root, buf, 1, 1);  			BUG_ON(ret); /* -ENOMEM */  		} +		/* +		 * don't log freeing in case we're freeing the root node, this +		 * is done by tree_mod_log_set_root_pointer later +		 */ +		if (buf != root->node && btrfs_header_level(buf) != 0) +			tree_mod_log_free_eb(root->fs_info, buf);  		clean_tree_block(trans, root, buf);  		*last_ref = 1;  	} @@ -467,7 +926,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,  				     root->root_key.objectid, &disk_key, -				     level, search_start, empty_size, 1); +				     level, search_start, empty_size);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -506,10 +965,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		extent_buffer_get(cow); +		tree_mod_log_set_root_pointer(root, cow);  		rcu_assign_pointer(root->node, cow);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  		free_extent_buffer(buf);  		add_root_to_dirty_list(root);  	} else { @@ -519,13 +979,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		WARN_ON(trans->transid != btrfs_header_generation(parent)); +		tree_mod_log_insert_key(root->fs_info, parent, parent_slot, +					MOD_LOG_KEY_REPLACE);  		btrfs_set_node_blockptr(parent, parent_slot,  					cow->start);  		btrfs_set_node_ptr_generation(parent, parent_slot,  					      trans->transid);  		btrfs_mark_buffer_dirty(parent);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  	}  	if (unlock_orig)  		btrfs_tree_unlock(buf); @@ -535,6 +997,229 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	return 0;  } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, +			   struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct tree_mod_elem *found = NULL; +	u64 root_logical = root->node->start; +	int looped = 0; + +	if (!time_seq) +		return 0; + +	/* +	 * the very last operation that's logged for a root is the replacement +	 * operation (if it is replaced at all). this has the index of the *new* +	 * root, making it the very first operation that's logged for this root. 
+	 */ +	while (1) { +		tm = tree_mod_log_search_oldest(fs_info, root_logical, +						time_seq); +		if (!looped && !tm) +			return 0; +		/* +		 * if there are no tree operation for the oldest root, we simply +		 * return it. this should only happen if that (old) root is at +		 * level 0. +		 */ +		if (!tm) +			break; + +		/* +		 * if there's an operation that's not a root replacement, we +		 * found the oldest version of our root. normally, we'll find a +		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. +		 */ +		if (tm->op != MOD_LOG_ROOT_REPLACE) +			break; + +		found = tm; +		root_logical = tm->old_root.logical; +		BUG_ON(root_logical == root->node->start); +		looped = 1; +	} + +	/* if there's no old root to return, return what we found instead */ +	if (!found) +		found = tm; + +	return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, +		      struct tree_mod_elem *first_tm) +{ +	u32 n; +	struct rb_node *next; +	struct tree_mod_elem *tm = first_tm; +	unsigned long o_dst; +	unsigned long o_src; +	unsigned long p_size = sizeof(struct btrfs_key_ptr); + +	n = btrfs_header_nritems(eb); +	while (tm && tm->elem.seq >= time_seq) { +		/* +		 * all the operations are recorded with the operator used for +		 * the modification. as we're going backwards, we do the +		 * opposite of each operation here. +		 */ +		switch (tm->op) { +		case MOD_LOG_KEY_REMOVE_WHILE_FREEING: +			BUG_ON(tm->slot < n); +		case MOD_LOG_KEY_REMOVE_WHILE_MOVING: +		case MOD_LOG_KEY_REMOVE: +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			n++; +			break; +		case MOD_LOG_KEY_REPLACE: +			BUG_ON(tm->slot >= n); +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			break; +		case MOD_LOG_KEY_ADD: +			/* if a move operation is needed it's in the log */ +			n--; +			break; +		case MOD_LOG_MOVE_KEYS: +			o_dst = btrfs_node_key_ptr_offset(tm->slot); +			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); +			memmove_extent_buffer(eb, o_dst, o_src, +					      tm->move.nr_items * p_size); +			break; +		case MOD_LOG_ROOT_REPLACE: +			/* +			 * this operation is special. for roots, this must be +			 * handled explicitly before rewinding. +			 * for non-roots, this operation may exist if the node +			 * was a root: root A -> child B; then A gets empty and +			 * B is promoted to the new root. in the mod log, we'll +			 * have a root-replace operation for B, a tree block +			 * that is no root. we simply ignore that operation. 
+			 */ +			break; +		} +		next = rb_next(&tm->node); +		if (!next) +			break; +		tm = container_of(next, struct tree_mod_elem, node); +		if (tm->index != first_tm->index) +			break; +	} +	btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +		    u64 time_seq) +{ +	struct extent_buffer *eb_rewin; +	struct tree_mod_elem *tm; + +	if (!time_seq) +		return eb; + +	if (btrfs_header_level(eb) == 0) +		return eb; + +	tm = tree_mod_log_search(fs_info, eb->start, time_seq); +	if (!tm) +		return eb; + +	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { +		BUG_ON(tm->slot != 0); +		eb_rewin = alloc_dummy_extent_buffer(eb->start, +						fs_info->tree_root->nodesize); +		BUG_ON(!eb_rewin); +		btrfs_set_header_bytenr(eb_rewin, eb->start); +		btrfs_set_header_backref_rev(eb_rewin, +					     btrfs_header_backref_rev(eb)); +		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); +		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); +	} else { +		eb_rewin = btrfs_clone_extent_buffer(eb); +		BUG_ON(!eb_rewin); +	} + +	extent_buffer_get(eb_rewin); +	free_extent_buffer(eb); + +	__tree_mod_log_rewind(eb_rewin, time_seq, tm); + +	return eb_rewin; +} + +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct extent_buffer *eb; +	struct tree_mod_root *old_root = NULL; +	u64 old_generation = 0; +	u64 logical; + +	eb = btrfs_read_lock_root_node(root); +	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); +	if (!tm) +		return root->node; + +	if (tm->op == MOD_LOG_ROOT_REPLACE) { +		old_root = &tm->old_root; +		old_generation = tm->generation; +		logical = old_root->logical; +	} else { +		logical = root->node->start; +	} + +	tm = tree_mod_log_search(root->fs_info, logical, time_seq); +	if (old_root) +		eb = alloc_dummy_extent_buffer(logical, root->nodesize); +	else +		eb = btrfs_clone_extent_buffer(root->node); +	btrfs_tree_read_unlock(root->node); +	free_extent_buffer(root->node); +	if (!eb) +		return NULL; +	btrfs_tree_read_lock(eb); +	if (old_root) { +		btrfs_set_header_bytenr(eb, eb->start); +		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); +		btrfs_set_header_owner(eb, root->root_key.objectid); +		btrfs_set_header_level(eb, old_root->level); +		btrfs_set_header_generation(eb, old_generation); +	} +	if (tm) +		__tree_mod_log_rewind(eb, time_seq, tm); +	else +		WARN_ON(btrfs_header_level(eb) != 0); +	extent_buffer_get(eb); + +	return eb; +} +  static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf) @@ -739,7 +1424,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  				if (!cur)  					return -EIO;  			} else if (!uptodate) { -				btrfs_read_buffer(cur, gen); +				err = btrfs_read_buffer(cur, gen); +				if (err) { +					free_extent_buffer(cur); +					return err; +				}  			}  		}  		if (search_start == 0) @@ -854,20 +1543,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,  static int bin_search(struct extent_buffer *eb, struct 
btrfs_key *key,  		      int level, int *slot)  { -	if (level == 0) { +	if (level == 0)  		return generic_bin_search(eb,  					  offsetof(struct btrfs_leaf, items),  					  sizeof(struct btrfs_item),  					  key, btrfs_header_nritems(eb),  					  slot); -	} else { +	else  		return generic_bin_search(eb,  					  offsetof(struct btrfs_node, ptrs),  					  sizeof(struct btrfs_key_ptr),  					  key, btrfs_header_nritems(eb),  					  slot); -	} -	return -1;  }  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, @@ -974,6 +1661,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  			goto enospc;  		} +		tree_mod_log_set_root_pointer(root, child);  		rcu_assign_pointer(root->node, child);  		add_root_to_dirty_list(root); @@ -987,7 +1675,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		free_extent_buffer(mid);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		/* once for the root ptr */  		free_extent_buffer_stale(mid);  		return 0; @@ -996,8 +1684,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)  		return 0; -	btrfs_header_nritems(mid); -  	left = read_node_slot(root, parent, pslot - 1);  	if (left) {  		btrfs_tree_lock(left); @@ -1027,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		wret = push_node_left(trans, root, left, mid, 1);  		if (wret < 0)  			ret = wret; -		btrfs_header_nritems(mid);  	}  	/* @@ -1040,14 +1725,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		if (btrfs_header_nritems(right) == 0) {  			clean_tree_block(trans, root, right);  			btrfs_tree_unlock(right); -			del_ptr(trans, root, path, level + 1, pslot + 1); +			del_ptr(trans, root, path, level + 1, pslot + 1, 1);  			root_sub_used(root, right->len); -			btrfs_free_tree_block(trans, root, right, 0, 1, 0); +			btrfs_free_tree_block(trans, root, right, 0, 1);  			free_extent_buffer_stale(right);  			right = NULL;  		} else {  			struct btrfs_disk_key right_key;  			btrfs_node_key(right, &right_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &right_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &right_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent);  		} @@ -1082,15 +1769,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	if (btrfs_header_nritems(mid) == 0) {  		clean_tree_block(trans, root, mid);  		btrfs_tree_unlock(mid); -		del_ptr(trans, root, path, level + 1, pslot); +		del_ptr(trans, root, path, level + 1, pslot, 1);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		free_extent_buffer_stale(mid);  		mid = NULL;  	} else {  		/* update the parent key to reflect our changes */  		struct btrfs_disk_key mid_key;  		btrfs_node_key(mid, &mid_key, 0); +		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, +					  pslot, 0);  		btrfs_set_node_key(parent, &mid_key, pslot);  		btrfs_mark_buffer_dirty(parent);  	} @@ -1188,6 +1877,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			orig_slot += left_nr;  			btrfs_node_key(mid, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot, 0);  			btrfs_set_node_key(parent, &disk_key, pslot);  			btrfs_mark_buffer_dirty(parent);  			if 
(btrfs_header_nritems(left) > orig_slot) { @@ -1239,6 +1930,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			btrfs_node_key(right, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &disk_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent); @@ -1496,7 +2189,7 @@ static int  read_block_for_search(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct btrfs_path *p,  		       struct extent_buffer **eb_ret, int level, int slot, -		       struct btrfs_key *key) +		       struct btrfs_key *key, u64 time_seq)  {  	u64 blocknr;  	u64 gen; @@ -1850,7 +2543,7 @@ cow_done:  			}  			err = read_block_for_search(trans, root, p, -						    &b, level, slot, key); +						    &b, level, slot, key, 0);  			if (err == -EAGAIN)  				goto again;  			if (err) { @@ -1922,6 +2615,113 @@ done:  }  /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +			  struct btrfs_path *p, u64 time_seq) +{ +	struct extent_buffer *b; +	int slot; +	int ret; +	int err; +	int level; +	int lowest_unlock = 1; +	u8 lowest_level = 0; + +	lowest_level = p->lowest_level; +	WARN_ON(p->nodes[0] != NULL); + +	if (p->search_commit_root) { +		BUG_ON(time_seq); +		return btrfs_search_slot(NULL, root, key, p, 0, 0); +	} + +again: +	b = get_old_root(root, time_seq); +	level = btrfs_header_level(b); +	p->locks[level] = BTRFS_READ_LOCK; + +	while (b) { +		level = btrfs_header_level(b); +		p->nodes[level] = b; +		btrfs_clear_path_blocking(p, NULL, 0); + +		/* +		 * we have a lock on b and as long as we aren't changing +		 * the tree, there is no way to for the items in b to change. +		 * It is safe to drop the lock on our parent before we +		 * go through the expensive btree search on b. 
+		 */ +		btrfs_unlock_up_safe(p, level + 1); + +		ret = bin_search(b, key, level, &slot); + +		if (level != 0) { +			int dec = 0; +			if (ret && slot > 0) { +				dec = 1; +				slot -= 1; +			} +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); + +			if (level == lowest_level) { +				if (dec) +					p->slots[level]++; +				goto done; +			} + +			err = read_block_for_search(NULL, root, p, &b, level, +						    slot, key, time_seq); +			if (err == -EAGAIN) +				goto again; +			if (err) { +				ret = err; +				goto done; +			} + +			level = btrfs_header_level(b); +			err = btrfs_try_tree_read_lock(b); +			if (!err) { +				btrfs_set_path_blocking(p); +				btrfs_tree_read_lock(b); +				btrfs_clear_path_blocking(p, b, +							  BTRFS_READ_LOCK); +			} +			p->locks[level] = BTRFS_READ_LOCK; +			p->nodes[level] = b; +			b = tree_mod_log_rewind(root->fs_info, b, time_seq); +			if (b != p->nodes[level]) { +				btrfs_tree_unlock_rw(p->nodes[level], +						     p->locks[level]); +				p->locks[level] = 0; +				p->nodes[level] = b; +			} +		} else { +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); +			goto done; +		} +	} +	ret = 1; +done: +	if (!p->leave_spinning) +		btrfs_set_path_blocking(p); +	if (ret < 0) +		btrfs_release_path(p); + +	return ret; +} + +/*   * adjust the pointers going up the tree, starting at level   * making sure the right key of each node is points to 'key'.   * This is used after shifting pointers to the left, so it stops @@ -1941,6 +2741,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,  		if (!path->nodes[i])  			break;  		t = path->nodes[i]; +		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);  		btrfs_set_node_key(t, key, tslot);  		btrfs_mark_buffer_dirty(path->nodes[i]);  		if (tslot != 0) @@ -2023,12 +2824,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,  	} else  		push_items = min(src_nritems - 8, push_items); +	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, +			     push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(dst_nritems),  			   btrfs_node_key_ptr_offset(0),  			   push_items * sizeof(struct btrfs_key_ptr));  	if (push_items < src_nritems) { +		tree_mod_log_eb_move(root->fs_info, src, 0, push_items, +				     src_nritems - push_items);  		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),  				      btrfs_node_key_ptr_offset(push_items),  				      (src_nritems - push_items) * @@ -2082,11 +2887,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  	if (max_push < push_items)  		push_items = max_push; +	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);  	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),  				      btrfs_node_key_ptr_offset(0),  				      (dst_nritems) *  				      sizeof(struct btrfs_key_ptr)); +	tree_mod_log_eb_copy(root->fs_info, dst, src, 0, +			     src_nritems - push_items, push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2129,7 +2937,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  				   root->root_key.objectid, &lower_key, -				   level, root->node->start, 0, 0); +				   level, root->node->start, 0);  	if (IS_ERR(c))  		return PTR_ERR(c); @@ -2161,6 +2969,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(c);  	old = 
root->node; +	tree_mod_log_set_root_pointer(root, c);  	rcu_assign_pointer(root->node, c);  	/* the super has an extra ref to root->node */ @@ -2188,6 +2997,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  {  	struct extent_buffer *lower;  	int nritems; +	int ret;  	BUG_ON(!path->nodes[level]);  	btrfs_assert_tree_locked(path->nodes[level]); @@ -2196,11 +3006,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  	BUG_ON(slot > nritems);  	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));  	if (slot != nritems) { +		if (level) +			tree_mod_log_eb_move(root->fs_info, lower, slot + 1, +					     slot, nritems - slot);  		memmove_extent_buffer(lower,  			      btrfs_node_key_ptr_offset(slot + 1),  			      btrfs_node_key_ptr_offset(slot),  			      (nritems - slot) * sizeof(struct btrfs_key_ptr));  	} +	if (level) { +		ret = tree_mod_log_insert_key(root->fs_info, lower, slot, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	}  	btrfs_set_node_key(lower, key, slot);  	btrfs_set_node_blockptr(lower, slot, bytenr);  	WARN_ON(trans->transid == 0); @@ -2252,7 +3070,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  					root->root_key.objectid, -					&disk_key, level, c->start, 0, 0); +					&disk_key, level, c->start, 0);  	if (IS_ERR(split))  		return PTR_ERR(split); @@ -2271,7 +3089,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  			    (unsigned long)btrfs_header_chunk_tree_uuid(split),  			    BTRFS_UUID_SIZE); - +	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);  	copy_extent_buffer(split, c,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(mid), @@ -3004,7 +3822,7 @@ again:  	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  					root->root_key.objectid, -					&disk_key, 0, l->start, 0, 0); +					&disk_key, 0, l->start, 0);  	if (IS_ERR(right))  		return PTR_ERR(right); @@ -3749,19 +4567,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root   * empty a node.   
*/  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct btrfs_path *path, int level, int slot) +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log)  {  	struct extent_buffer *parent = path->nodes[level];  	u32 nritems; +	int ret;  	nritems = btrfs_header_nritems(parent);  	if (slot != nritems - 1) { +		if (tree_mod_log && level) +			tree_mod_log_eb_move(root->fs_info, parent, slot, +					     slot + 1, nritems - slot - 1);  		memmove_extent_buffer(parent,  			      btrfs_node_key_ptr_offset(slot),  			      btrfs_node_key_ptr_offset(slot + 1),  			      sizeof(struct btrfs_key_ptr) *  			      (nritems - slot - 1)); +	} else if (tree_mod_log && level) { +		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0);  	} +  	nritems--;  	btrfs_set_header_nritems(parent, nritems);  	if (nritems == 0 && parent == root->node) { @@ -3793,7 +4621,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  				    struct extent_buffer *leaf)  {  	WARN_ON(btrfs_header_generation(leaf) != trans->transid); -	del_ptr(trans, root, path, 1, path->slots[1]); +	del_ptr(trans, root, path, 1, path->slots[1], 1);  	/*  	 * btrfs_free_extent is expensive, we want to make sure we @@ -3804,7 +4632,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  	root_sub_used(root, leaf->len);  	extent_buffer_get(leaf); -	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); +	btrfs_free_tree_block(trans, root, leaf, 0, 1);  	free_extent_buffer_stale(leaf);  }  /* @@ -4202,6 +5030,12 @@ next:   */  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  { +	return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, +			u64 time_seq) +{  	int slot;  	int level;  	struct extent_buffer *c; @@ -4226,7 +5060,10 @@ again:  	path->keep_locks = 1;  	path->leave_spinning = 1; -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (time_seq) +		ret = btrfs_search_old_slot(root, &key, path, time_seq); +	else +		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	path->keep_locks = 0;  	if (ret < 0) @@ -4271,7 +5108,7 @@ again:  		next = c;  		next_rw_lock = path->locks[level];  		ret = read_block_for_search(NULL, root, path, &next, level, -					    slot, &key); +					    slot, &key, 0);  		if (ret == -EAGAIN)  			goto again; @@ -4282,6 +5119,18 @@ again:  		if (!path->skip_locking) {  			ret = btrfs_try_tree_read_lock(next); +			if (!ret && time_seq) { +				/* +				 * If we don't get the lock, we may be racing +				 * with push_leaf_left, holding that lock while +				 * itself waiting for the leaf we've currently +				 * locked. To solve this situation, we give up +				 * on our lock and cycle. 
+				 */ +				btrfs_release_path(path); +				cond_resched(); +				goto again; +			}  			if (!ret) {  				btrfs_set_path_blocking(path);  				btrfs_tree_read_lock(next); @@ -4308,7 +5157,7 @@ again:  			break;  		ret = read_block_for_search(NULL, root, path, &next, level, -					    0, &key); +					    0, &key, 0);  		if (ret == -EAGAIN)  			goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd72331d60..fa5c45b3907 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };  #define BTRFS_FT_XATTR		8  #define BTRFS_FT_MAX		9 +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +  /*   * The key defines the order in the tree, and so it also defines (optimal)   * block layout. @@ -823,6 +826,14 @@ struct btrfs_csum_item {  	u8 csum;  } __attribute__ ((__packed__)); +struct btrfs_dev_stats_item { +	/* +	 * grow this item struct at the end for future enhancements and keep +	 * the existing values unchanged +	 */ +	__le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); +  /* different types of block groups (and chunks) */  #define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)  #define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1) @@ -1129,6 +1140,15 @@ struct btrfs_fs_info {  	spinlock_t delayed_iput_lock;  	struct list_head delayed_iputs; +	/* this protects tree_mod_seq_list */ +	spinlock_t tree_mod_seq_lock; +	atomic_t tree_mod_seq; +	struct list_head tree_mod_seq_list; + +	/* this protects tree_mod_log */ +	rwlock_t tree_mod_log_lock; +	struct rb_root tree_mod_log; +  	atomic_t nr_async_submits;  	atomic_t async_submit_draining;  	atomic_t nr_async_bios; @@ -1375,7 +1395,7 @@ struct btrfs_root {  	struct list_head root_list;  	spinlock_t orphan_lock; -	struct list_head orphan_list; +	atomic_t orphan_inodes;  	struct btrfs_block_rsv *orphan_block_rsv;  	int orphan_item_inserted;  	int orphan_cleanup_state; @@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {  #define BTRFS_BALANCE_ITEM_KEY	248  /* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY	249 + +/*   * string items are for debugging.  
They just store a short string of   * data in the FS   */ @@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,  	return btrfs_item_size(eb, e) - offset;  } +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, +					struct btrfs_dev_stats_item *ptr, +					int index) +{ +	u64 val; + +	read_extent_buffer(eb, &val, +			   offsetof(struct btrfs_dev_stats_item, values) + +			    ((unsigned long)ptr) + (index * sizeof(u64)), +			   sizeof(val)); +	return val; +} + +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, +					     struct btrfs_dev_stats_item *ptr, +					     int index, u64 val) +{ +	write_extent_buffer(eb, &val, +			    offsetof(struct btrfs_dev_stats_item, values) + +			     ((unsigned long)ptr) + (index * sizeof(u64)), +			    sizeof(val)); +} +  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)  {  	return sb->s_fs_info; @@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					struct btrfs_root *root, u32 blocksize,  					u64 parent, u64 root_objectid,  					struct btrfs_disk_key *key, int level, -					u64 hint, u64 empty_size, int for_cow); +					u64 hint, u64 empty_size);  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   u64 parent, int last_ref, int for_cow); +			   u64 parent, int last_ref);  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,  					    struct btrfs_root *root,  					    u64 bytenr, u32 blocksize, @@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root  		      *root, struct btrfs_key *key, struct btrfs_path *p, int  		      ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +			  struct btrfs_path *p, u64 time_seq);  int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct extent_buffer *parent,  		       int start_slot, int cache_only, u64 *last_ret, @@ -2701,13 +2753,20 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,  }  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, +			u64 time_seq); +static inline int btrfs_next_old_item(struct btrfs_root *root, +				      struct btrfs_path *p, u64 time_seq)  {  	++p->slots[0];  	if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) -		return btrfs_next_leaf(root, p); +		return btrfs_next_old_leaf(root, p, time_seq);  	return 0;  } +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ +	return btrfs_next_old_item(root, p, 0); +}  int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);  int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);  int __must_check btrfs_drop_snapshot(struct btrfs_root *root, @@ -2922,7 +2981,6 @@ int btrfs_readpage(struct file *file, struct page *page);  void btrfs_evict_inode(struct inode *inode);  int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);  int btrfs_dirty_inode(struct inode *inode); -int btrfs_update_time(struct file *file);  struct inode *btrfs_alloc_inode(struct super_block *sb);  void btrfs_destroy_inode(struct 
inode *inode);  int btrfs_drop_inode(struct inode *inode); @@ -3098,4 +3156,23 @@ void btrfs_reada_detach(void *handle);  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,  			 u64 start, int err); +/* delayed seq elem */ +struct seq_list { +	struct list_head list; +	u64 seq; +	u32 flags; +}; + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem); + +static inline int is_fstree(u64 rootid) +{ +	if (rootid == BTRFS_FS_TREE_OBJECTID || +	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) +		return 1; +	return 0; +}  #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 03e3748d84d..2399f408691 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(  		return ret;  	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {  		spin_lock(&BTRFS_I(inode)->lock); -		if (BTRFS_I(inode)->delalloc_meta_reserved) { -			BTRFS_I(inode)->delalloc_meta_reserved = 0; +		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +				       &BTRFS_I(inode)->runtime_flags)) {  			spin_unlock(&BTRFS_I(inode)->lock);  			release = true;  			goto migrate; @@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,  	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));  	btrfs_set_stack_inode_generation(inode_item,  					 BTRFS_I(inode)->generation); -	btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); +	btrfs_set_stack_inode_sequence(inode_item, inode->i_version);  	btrfs_set_stack_inode_transid(inode_item, trans->transid);  	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);  	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)  	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));  	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));  	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); -	BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); +	inode->i_version = btrfs_stack_inode_sequence(inode_item);  	inode->i_rdev = 0;  	*rdev = btrfs_stack_inode_rdev(inode_item);  	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); @@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)  		}  	}  } + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ +	struct btrfs_delayed_root *delayed_root; +	struct btrfs_delayed_node *curr_node, *prev_node; + +	delayed_root = btrfs_get_delayed_root(root); + +	curr_node = btrfs_first_delayed_node(delayed_root); +	while (curr_node) { +		__btrfs_kill_delayed_node(curr_node); + +		prev_node = curr_node; +		curr_node = btrfs_next_delayed_node(curr_node); +		btrfs_release_delayed_node(prev_node); +	} +} + diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7083d08b2a2..f5aa4023d3e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev);  /* Used for dropping dead roots */  void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); +/* Used for cleaning up the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); +  /* Used for readdir() */  void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,  			     struct list_head *del_list); diff --git 
a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 69f22e3ab3b..13ae7b04790 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	ref->is_head = 0;  	ref->in_tree = 1; -	if (need_ref_seq(for_cow, ref_root)) +	if (is_fstree(ref_root))  		seq = inc_delayed_seq(delayed_refs);  	ref->seq = seq; @@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	ref->is_head = 0;  	ref->in_tree = 1; -	if (need_ref_seq(for_cow, ref_root)) +	if (is_fstree(ref_root))  		seq = inc_delayed_seq(delayed_refs);  	ref->seq = seq; @@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,  				   num_bytes, parent, ref_root, level, action,  				   for_cow); -	if (!need_ref_seq(for_cow, ref_root) && +	if (!is_fstree(ref_root) &&  	    waitqueue_active(&delayed_refs->seq_wait))  		wake_up(&delayed_refs->seq_wait);  	spin_unlock(&delayed_refs->lock); +  	return 0;  } @@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,  				   num_bytes, parent, ref_root, owner, offset,  				   action, for_cow); -	if (!need_ref_seq(for_cow, ref_root) && +	if (!is_fstree(ref_root) &&  	    waitqueue_active(&delayed_refs->seq_wait))  		wake_up(&delayed_refs->seq_wait);  	spin_unlock(&delayed_refs->lock); +  	return 0;  } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d8f244d9492..413927fb995 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,  int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,  			   struct list_head *cluster, u64 search_start); -struct seq_list { -	struct list_head list; -	u64 seq; -}; -  static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)  {  	assert_spin_locked(&delayed_refs->lock); @@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,  			    u64 seq);  /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ -	if (for_cow) -		return 0; - -	if (rootid == BTRFS_FS_TREE_OBJECTID) -		return 1; - -	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) -		return 1; - -	return 0; -} - -/*   * a node might live in a head or a regular ref, this lets you   * test for the proper type to use.   
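 *
 * An illustrative use of that test (a sketch, not part of this patch;
 * btrfs_delayed_node_to_head() is used this way in btrfs_destroy_delayed_refs()
 * in disk-io.c below, and btrfs_delayed_node_to_tree_ref() is assumed to be
 * the companion converter declared in this header):
 *
 *	if (btrfs_delayed_ref_is_head(node))
 *		head = btrfs_delayed_node_to_head(node);
 *	else
 *		ref = btrfs_delayed_node_to_tree_ref(node);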
*/ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1fe74a2ce1..2936ca49b3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@  #include "free-space-cache.h"  #include "inode-map.h"  #include "check-integrity.h" +#include "rcu-string.h"  static struct extent_io_ops btree_extent_io_ops;  static void end_workqueue_fn(struct btrfs_work *work); @@ -1153,7 +1154,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	root->orphan_block_rsv = NULL;  	INIT_LIST_HEAD(&root->dirty_list); -	INIT_LIST_HEAD(&root->orphan_list);  	INIT_LIST_HEAD(&root->root_list);  	spin_lock_init(&root->orphan_lock);  	spin_lock_init(&root->inode_lock); @@ -1166,6 +1166,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	atomic_set(&root->log_commit[0], 0);  	atomic_set(&root->log_commit[1], 0);  	atomic_set(&root->log_writers, 0); +	atomic_set(&root->orphan_inodes, 0);  	root->log_batch = 0;  	root->log_transid = 0;  	root->last_log_commit = 0; @@ -1252,7 +1253,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,  	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  				      BTRFS_TREE_LOG_OBJECTID, NULL, -				      0, 0, 0, 0); +				      0, 0, 0);  	if (IS_ERR(leaf)) {  		kfree(root);  		return ERR_CAST(leaf); @@ -1914,11 +1915,14 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->delayed_iput_lock);  	spin_lock_init(&fs_info->defrag_inodes_lock);  	spin_lock_init(&fs_info->free_chunk_lock); +	spin_lock_init(&fs_info->tree_mod_seq_lock); +	rwlock_init(&fs_info->tree_mod_log_lock);  	mutex_init(&fs_info->reloc_mutex);  	init_completion(&fs_info->kobj_unregister);  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);  	INIT_LIST_HEAD(&fs_info->space_info); +	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);  	btrfs_mapping_init(&fs_info->mapping_tree);  	btrfs_init_block_rsv(&fs_info->global_block_rsv);  	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); @@ -1931,12 +1935,14 @@ int open_ctree(struct super_block *sb,  	atomic_set(&fs_info->async_submit_draining, 0);  	atomic_set(&fs_info->nr_async_bios, 0);  	atomic_set(&fs_info->defrag_running, 0); +	atomic_set(&fs_info->tree_mod_seq, 0);  	fs_info->sb = sb;  	fs_info->max_inline = 8192 * 1024;  	fs_info->metadata_ratio = 0;  	fs_info->defrag_inodes = RB_ROOT;  	fs_info->trans_no_join = 0;  	fs_info->free_chunk_space = 0; +	fs_info->tree_mod_log = RB_ROOT;  	/* readahead state */  	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); @@ -2001,7 +2007,8 @@ int open_ctree(struct super_block *sb,  	BTRFS_I(fs_info->btree_inode)->root = tree_root;  	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,  	       sizeof(struct btrfs_key)); -	BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; +	set_bit(BTRFS_INODE_DUMMY, +		&BTRFS_I(fs_info->btree_inode)->runtime_flags);  	insert_inode_hash(fs_info->btree_inode);  	spin_lock_init(&fs_info->block_group_cache_lock); @@ -2112,7 +2119,7 @@ int open_ctree(struct super_block *sb,  	features = btrfs_super_incompat_flags(disk_super);  	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; -	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) +	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)  		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;  	/* @@ -2347,12 +2354,24 @@ retry_root_backup:  				  BTRFS_CSUM_TREE_OBJECTID, csum_root);  	if (ret)  		goto recovery_tree_root; -  	csum_root->track_dirty = 1;  	fs_info->generation = generation;  	fs_info->last_trans_committed = generation; +	
ret = btrfs_recover_balance(fs_info); +	if (ret) { +		printk(KERN_WARNING "btrfs: failed to recover balance\n"); +		goto fail_block_groups; +	} + +	ret = btrfs_init_dev_stats(fs_info); +	if (ret) { +		printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", +		       ret); +		goto fail_block_groups; +	} +  	ret = btrfs_init_space_info(fs_info);  	if (ret) {  		printk(KERN_ERR "Failed to initialize space info: %d\n", ret); @@ -2471,20 +2490,23 @@ retry_root_backup:  		goto fail_trans_kthread;  	} -	if (!(sb->s_flags & MS_RDONLY)) { -		down_read(&fs_info->cleanup_work_sem); -		err = btrfs_orphan_cleanup(fs_info->fs_root); -		if (!err) -			err = btrfs_orphan_cleanup(fs_info->tree_root); -		up_read(&fs_info->cleanup_work_sem); +	if (sb->s_flags & MS_RDONLY) +		return 0; -		if (!err) -			err = btrfs_recover_balance(fs_info->tree_root); +	down_read(&fs_info->cleanup_work_sem); +	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || +	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { +		up_read(&fs_info->cleanup_work_sem); +		close_ctree(tree_root); +		return ret; +	} +	up_read(&fs_info->cleanup_work_sem); -		if (err) { -			close_ctree(tree_root); -			return err; -		} +	ret = btrfs_resume_balance_async(fs_info); +	if (ret) { +		printk(KERN_WARNING "btrfs: failed to resume balance\n"); +		close_ctree(tree_root); +		return ret;  	}  	return 0; @@ -2556,18 +2578,20 @@ recovery_tree_root:  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)  { -	char b[BDEVNAME_SIZE]; -  	if (uptodate) {  		set_buffer_uptodate(bh);  	} else { -		printk_ratelimited(KERN_WARNING "lost page write due to " -					"I/O error on %s\n", -				       bdevname(bh->b_bdev, b)); +		struct btrfs_device *device = (struct btrfs_device *) +			bh->b_private; + +		printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " +					  "I/O error on %s\n", +					  rcu_str_deref(device->name));  		/* note, we don't set_buffer_write_io_error because we have  		 * our own ways of dealing with the IO errors  		 */  		clear_buffer_uptodate(bh); +		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);  	}  	unlock_buffer(bh);  	put_bh(bh); @@ -2682,6 +2706,7 @@ static int write_dev_supers(struct btrfs_device *device,  			set_buffer_uptodate(bh);  			lock_buffer(bh);  			bh->b_end_io = btrfs_end_buffer_write_sync; +			bh->b_private = device;  		}  		/* @@ -2734,12 +2759,15 @@ static int write_dev_flush(struct btrfs_device *device, int wait)  		wait_for_completion(&device->flush_wait);  		if (bio_flagged(bio, BIO_EOPNOTSUPP)) { -			printk("btrfs: disabling barriers on dev %s\n", -			       device->name); +			printk_in_rcu("btrfs: disabling barriers on dev %s\n", +				      rcu_str_deref(device->name));  			device->nobarriers = 1;  		}  		if (!bio_flagged(bio, BIO_UPTODATE)) {  			ret = -EIO; +			if (!bio_flagged(bio, BIO_EOPNOTSUPP)) +				btrfs_dev_stat_inc_and_print(device, +					BTRFS_DEV_STAT_FLUSH_ERRS);  		}  		/* drop the reference from the wait == 0 run */ @@ -2902,19 +2930,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,  	return ret;  } -/* Kill all outstanding I/O */ -void btrfs_abort_devices(struct btrfs_root *root) -{ -	struct list_head *head; -	struct btrfs_device *dev; -	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); -	head = &root->fs_info->fs_devices->devices; -	list_for_each_entry_rcu(dev, head, dev_list) { -		blk_abort_queue(dev->bdev->bd_disk->queue); -	} -	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -} -  void btrfs_free_fs_root(struct 
btrfs_fs_info *fs_info, struct btrfs_root *root)  {  	spin_lock(&fs_info->fs_roots_radix_lock); @@ -3395,7 +3410,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  	delayed_refs = &trans->delayed_refs; -again:  	spin_lock(&delayed_refs->lock);  	if (delayed_refs->num_entries == 0) {  		spin_unlock(&delayed_refs->lock); @@ -3403,31 +3417,37 @@ again:  		return ret;  	} -	node = rb_first(&delayed_refs->root); -	while (node) { +	while ((node = rb_first(&delayed_refs->root)) != NULL) {  		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); -		node = rb_next(node); - -		ref->in_tree = 0; -		rb_erase(&ref->rb_node, &delayed_refs->root); -		delayed_refs->num_entries--;  		atomic_set(&ref->refs, 1);  		if (btrfs_delayed_ref_is_head(ref)) {  			struct btrfs_delayed_ref_head *head;  			head = btrfs_delayed_node_to_head(ref); -			spin_unlock(&delayed_refs->lock); -			mutex_lock(&head->mutex); +			if (!mutex_trylock(&head->mutex)) { +				atomic_inc(&ref->refs); +				spin_unlock(&delayed_refs->lock); + +				/* Need to wait for the delayed ref to run */ +				mutex_lock(&head->mutex); +				mutex_unlock(&head->mutex); +				btrfs_put_delayed_ref(ref); + +				spin_lock(&delayed_refs->lock); +				continue; +			} +  			kfree(head->extent_op);  			delayed_refs->num_heads--;  			if (list_empty(&head->cluster))  				delayed_refs->num_heads_ready--;  			list_del_init(&head->cluster); -			mutex_unlock(&head->mutex); -			btrfs_put_delayed_ref(ref); -			goto again;  		} +		ref->in_tree = 0; +		rb_erase(&ref->rb_node, &delayed_refs->root); +		delayed_refs->num_entries--; +  		spin_unlock(&delayed_refs->lock);  		btrfs_put_delayed_ref(ref); @@ -3515,11 +3535,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,  			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,  					       offset >> PAGE_CACHE_SHIFT);  			spin_unlock(&dirty_pages->buffer_lock); -			if (eb) { +			if (eb)  				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,  							 &eb->bflags); -				atomic_set(&eb->refs, 1); -			}  			if (PageWriteback(page))  				end_page_writeback(page); @@ -3533,8 +3551,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,  				spin_unlock_irq(&page->mapping->tree_lock);  			} -			page->mapping->a_ops->invalidatepage(page, 0);  			unlock_page(page); +			page_cache_release(page);  		}  	} @@ -3548,8 +3566,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,  	u64 start;  	u64 end;  	int ret; +	bool loop = true;  	unpin = pinned_extents; +again:  	while (1) {  		ret = find_first_extent_bit(unpin, 0, &start, &end,  					    EXTENT_DIRTY); @@ -3567,6 +3587,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,  		cond_resched();  	} +	if (loop) { +		if (unpin == &root->fs_info->freed_extents[0]) +			unpin = &root->fs_info->freed_extents[1]; +		else +			unpin = &root->fs_info->freed_extents[0]; +		loop = false; +		goto again; +	} +  	return 0;  } @@ -3580,21 +3609,23 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,  	/* FIXME: cleanup wait for commit */  	cur_trans->in_commit = 1;  	cur_trans->blocked = 1; -	if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) -		wake_up(&root->fs_info->transaction_blocked_wait); +	wake_up(&root->fs_info->transaction_blocked_wait);  	cur_trans->blocked = 0; -	if (waitqueue_active(&root->fs_info->transaction_wait)) -		wake_up(&root->fs_info->transaction_wait); +	wake_up(&root->fs_info->transaction_wait);  	cur_trans->commit_done = 1; -	if 
(waitqueue_active(&cur_trans->commit_wait)) -		wake_up(&cur_trans->commit_wait); +	wake_up(&cur_trans->commit_wait); + +	btrfs_destroy_delayed_inodes(root); +	btrfs_assert_delayed_root_empty(root);  	btrfs_destroy_pending_snapshots(cur_trans);  	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,  				     EXTENT_DIRTY); +	btrfs_destroy_pinned_extent(root, +				    root->fs_info->pinned_extents);  	/*  	memset(cur_trans, 0, sizeof(*cur_trans)); @@ -3643,6 +3674,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  		if (waitqueue_active(&t->commit_wait))  			wake_up(&t->commit_wait); +		btrfs_destroy_delayed_inodes(root); +		btrfs_assert_delayed_root_empty(root); +  		btrfs_destroy_pending_snapshots(t);  		btrfs_destroy_delalloc_inodes(root); @@ -3671,17 +3705,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  	return 0;  } -static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, -					  u64 start, u64 end, -					  struct extent_state *state) -{ -	struct super_block *sb = page->mapping->host->i_sb; -	struct btrfs_fs_info *fs_info = btrfs_sb(sb); -	btrfs_error(fs_info, -EIO, -		    "Error occured while writing out btree at %llu", start); -	return -EIO; -} -  static struct extent_io_ops btree_extent_io_ops = {  	.write_cache_pages_lock_hook = btree_lock_page_hook,  	.readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3689,5 +3712,4 @@ static struct extent_io_ops btree_extent_io_ops = {  	.submit_bio_hook = btree_submit_bio_hook,  	/* note we're sharing with inode.c for the merge bio hook */  	.merge_bio_hook = btrfs_merge_bio_hook, -	.writepage_io_failed_hook = btree_writepage_io_failed_hook,  }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ab1830aaf0e..05b3fab39f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,  int btrfs_cleanup_transaction(struct btrfs_root *root);  void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,  				  struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root);  #ifdef CONFIG_DEBUG_LOCK_ALLOC  void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e887ee62b6d..614f34a899c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -13,15 +13,14 @@  					     parent_root_objectid) / 4)  #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) -static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, -			   int connectable) +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, +			   struct inode *parent)  {  	struct btrfs_fid *fid = (struct btrfs_fid *)fh; -	struct inode *inode = dentry->d_inode;  	int len = *max_len;  	int type; -	if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { +	if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {  		*max_len = BTRFS_FID_SIZE_CONNECTABLE;  		return 255;  	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { @@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,  	fid->root_objectid = BTRFS_I(inode)->root->objectid;  	fid->gen = inode->i_generation; -	if (connectable && !S_ISDIR(inode->i_mode)) { -		struct inode *parent; +	if (parent) {  		u64 parent_root_id; -		spin_lock(&dentry->d_lock); - -		parent = dentry->d_parent->d_inode;  		fid->parent_objectid = BTRFS_I(parent)->location.objectid;  		fid->parent_gen = parent->i_generation;  		parent_root_id = BTRFS_I(parent)->root->objectid; -		spin_unlock(&dentry->d_lock); -  		if 
(parent_root_id != fid->root_objectid) {  			fid->parent_root_objectid = parent_root_id;  			len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49fd7b66d57..6e1d36702ff 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2347,12 +2347,10 @@ next:  	return count;  } -  static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, -			unsigned long num_refs) +			       unsigned long num_refs, +			       struct list_head *first_seq)  { -	struct list_head *first_seq = delayed_refs->seq_head.next; -  	spin_unlock(&delayed_refs->lock);  	pr_debug("waiting for more refs (num %ld, first %p)\n",  		 num_refs, first_seq); @@ -2381,6 +2379,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	struct btrfs_delayed_ref_root *delayed_refs;  	struct btrfs_delayed_ref_node *ref;  	struct list_head cluster; +	struct list_head *first_seq = NULL;  	int ret;  	u64 delayed_start;  	int run_all = count == (unsigned long)-1; @@ -2436,8 +2435,10 @@ again:  				 */  				consider_waiting = 1;  				num_refs = delayed_refs->num_entries; +				first_seq = root->fs_info->tree_mod_seq_list.next;  			} else { -				wait_for_more_refs(delayed_refs, num_refs); +				wait_for_more_refs(delayed_refs, +						   num_refs, first_seq);  				/*  				 * after waiting, things have changed. we  				 * dropped the lock and someone else might have @@ -3578,7 +3579,7 @@ again:  	space_info->chunk_alloc = 0;  	spin_unlock(&space_info->lock); out: -	mutex_unlock(&extent_root->fs_info->chunk_mutex); +	mutex_unlock(&fs_info->chunk_mutex);  	return ret;  } @@ -4355,10 +4356,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)  	BTRFS_I(inode)->outstanding_extents--;  	if (BTRFS_I(inode)->outstanding_extents == 0 && -	    BTRFS_I(inode)->delalloc_meta_reserved) { +	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +			       &BTRFS_I(inode)->runtime_flags))  		drop_inode_space = 1; -		BTRFS_I(inode)->delalloc_meta_reserved = 0; -	}  	/*  	 * If we have more or the same amount of outstanding extents than we have @@ -4465,7 +4465,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	 * Add an item to reserve for updating the inode when we complete the  	 * delalloc io.  	
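	 *
	 * A sketch of the check this refers to, with the reservation flag now
	 * kept as an atomic bit in runtime_flags (names as in the hunk below):
	 *
	 *	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
	 *		      &BTRFS_I(inode)->runtime_flags))
	 *		nr_extents++;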
*/ -	if (!BTRFS_I(inode)->delalloc_meta_reserved) { +	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +		      &BTRFS_I(inode)->runtime_flags)) {  		nr_extents++;  		extra_reserve = 1;  	} @@ -4511,7 +4512,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	spin_lock(&BTRFS_I(inode)->lock);  	if (extra_reserve) { -		BTRFS_I(inode)->delalloc_meta_reserved = 1; +		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +			&BTRFS_I(inode)->runtime_flags);  		nr_extents--;  	}  	BTRFS_I(inode)->reserved_extents += nr_extents; @@ -5217,7 +5219,7 @@ out:  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   u64 parent, int last_ref, int for_cow) +			   u64 parent, int last_ref)  {  	struct btrfs_block_group_cache *cache = NULL;  	int ret; @@ -5227,7 +5229,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  					buf->start, buf->len,  					parent, root->root_key.objectid,  					btrfs_header_level(buf), -					BTRFS_DROP_DELAYED_REF, NULL, for_cow); +					BTRFS_DROP_DELAYED_REF, NULL, 0);  		BUG_ON(ret); /* -ENOMEM */  	} @@ -6249,7 +6251,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					struct btrfs_root *root, u32 blocksize,  					u64 parent, u64 root_objectid,  					struct btrfs_disk_key *key, int level, -					u64 hint, u64 empty_size, int for_cow) +					u64 hint, u64 empty_size)  {  	struct btrfs_key ins;  	struct btrfs_block_rsv *block_rsv; @@ -6297,7 +6299,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					ins.objectid,  					ins.offset, parent, root_objectid,  					level, BTRFS_ADD_DELAYED_EXTENT, -					extent_op, for_cow); +					extent_op, 0);  		BUG_ON(ret); /* -ENOMEM */  	}  	return buf; @@ -6715,7 +6717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,  			       btrfs_header_owner(path->nodes[level + 1]));  	} -	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); +	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);  out:  	wc->refs[level] = 0;  	wc->flags[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 97f6703fd49..deafe19c34b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@  #include "volumes.h"  #include "check-integrity.h"  #include "locking.h" +#include "rcu-string.h"  static struct kmem_cache *extent_state_cache;  static struct kmem_cache *extent_buffer_cache; @@ -186,7 +187,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,  			return parent;  	} -	entry = rb_entry(node, struct tree_entry, rb_node);  	rb_link_node(node, parent, p);  	rb_insert_color(node, root);  	return NULL; @@ -413,7 +413,7 @@ static struct extent_state *next_state(struct extent_state *state)  /*   * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1) + * it will optionally wake up any one waiting on this state (wake == 1).   
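 *
 * With this patch it also returns the next state in the tree (see
 * next_state() above), so callers can keep walking without a fresh tree
 * lookup; an illustrative caller, mirroring the hunks below:
 *
 *	state = clear_state_bit(tree, state, &bits, wake);
 *	goto next;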
*   * If no bits are set on the state struct after clearing things, the   * struct is freed and removed from the tree @@ -570,10 +570,8 @@ hit_next:  		if (err)  			goto out;  		if (state->end <= end) { -			clear_state_bit(tree, state, &bits, wake); -			if (last_end == (u64)-1) -				goto out; -			start = last_end + 1; +			state = clear_state_bit(tree, state, &bits, wake); +			goto next;  		}  		goto search_again;  	} @@ -781,7 +779,6 @@ hit_next:  	 * Just lock what we found and keep going  	 */  	if (state->start == start && state->end <= end) { -		struct rb_node *next_node;  		if (state->state & exclusive_bits) {  			*failed_start = state->start;  			err = -EEXIST; @@ -789,20 +786,15 @@ hit_next:  		}  		set_state_bits(tree, state, &bits); -  		cache_state(state, cached_state);  		merge_state(tree, state);  		if (last_end == (u64)-1)  			goto out; -  		start = last_end + 1; -		next_node = rb_next(&state->rb_node); -		if (next_node && start < end && prealloc && !need_resched()) { -			state = rb_entry(next_node, struct extent_state, -					 rb_node); -			if (state->start == start) -				goto hit_next; -		} +		state = next_state(state); +		if (start < end && state && state->start == start && +		    !need_resched()) +			goto hit_next;  		goto search_again;  	} @@ -845,6 +837,10 @@ hit_next:  			if (last_end == (u64)-1)  				goto out;  			start = last_end + 1; +			state = next_state(state); +			if (start < end && state && state->start == start && +			    !need_resched()) +				goto hit_next;  		}  		goto search_again;  	} @@ -995,21 +991,14 @@ hit_next:  	 * Just lock what we found and keep going  	 */  	if (state->start == start && state->end <= end) { -		struct rb_node *next_node; -  		set_state_bits(tree, state, &bits); -		clear_state_bit(tree, state, &clear_bits, 0); +		state = clear_state_bit(tree, state, &clear_bits, 0);  		if (last_end == (u64)-1)  			goto out; -  		start = last_end + 1; -		next_node = rb_next(&state->rb_node); -		if (next_node && start < end && prealloc && !need_resched()) { -			state = rb_entry(next_node, struct extent_state, -					 rb_node); -			if (state->start == start) -				goto hit_next; -		} +		if (start < end && state && state->start == start && +		    !need_resched()) +			goto hit_next;  		goto search_again;  	} @@ -1043,10 +1032,13 @@ hit_next:  			goto out;  		if (state->end <= end) {  			set_state_bits(tree, state, &bits); -			clear_state_bit(tree, state, &clear_bits, 0); +			state = clear_state_bit(tree, state, &clear_bits, 0);  			if (last_end == (u64)-1)  				goto out;  			start = last_end + 1; +			if (start < end && state && state->start == start && +			    !need_resched()) +				goto hit_next;  		}  		goto search_again;  	} @@ -1174,9 +1166,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,  			      cached_state, mask);  } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, -				 u64 end, struct extent_state **cached_state, -				 gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +			  struct extent_state **cached_state, gfp_t mask)  {  	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,  				cached_state, mask); @@ -1294,7 +1285,7 @@ out:   * returned if we find something, and *start_ret and *end_ret are   * set to reflect the state struct that was found.   * - * If nothing was found, 1 is returned, < 0 on error + * If nothing was found, 1 is returned. If found something, return 0.   
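 *
 * An illustrative caller (this mirrors the loop in
 * btrfs_destroy_pinned_extent() in disk-io.c):
 *
 *	ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY);
 *	if (ret)
 *		break;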
*/  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,  			  u64 *start_ret, u64 *end_ret, int bits) @@ -1924,12 +1915,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,  	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {  		/* try to remap that extent elsewhere? */  		bio_put(bio); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);  		return -EIO;  	} -	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " -			"sector %llu)\n", page->mapping->host->i_ino, start, -			dev->name, sector); +	printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " +		      "(dev %s sector %llu)\n", page->mapping->host->i_ino, +		      start, rcu_str_deref(dev->name), sector);  	bio_put(bio);  	return 0; @@ -2223,17 +2215,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)  			uptodate = 0;  	} -	if (!uptodate && tree->ops && -	    tree->ops->writepage_io_failed_hook) { -		ret = tree->ops->writepage_io_failed_hook(NULL, page, -						 start, end, NULL); -		/* Writeback already completed */ -		if (ret == 0) -			return 1; -	} -  	if (!uptodate) { -		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);  		ClearPageUptodate(page);  		SetPageError(page);  	} @@ -2348,10 +2330,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {  			ret = tree->ops->readpage_end_io_hook(page, start, end,  							      state, mirror); -			if (ret) +			if (ret) { +				/* no IO indicated but software detected errors +				 * in the block, either checksum errors or +				 * issues with the contents */ +				struct btrfs_root *root = +					BTRFS_I(page->mapping->host)->root; +				struct btrfs_device *device; +  				uptodate = 0; -			else +				device = btrfs_find_device_for_logical( +						root, start, mirror); +				if (device) +					btrfs_dev_stat_inc_and_print(device, +						BTRFS_DEV_STAT_CORRUPTION_ERRS); +			} else {  				clean_io_failure(start, page); +			}  		}  		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { @@ -3165,7 +3160,7 @@ static int write_one_eb(struct extent_buffer *eb,  	u64 offset = eb->start;  	unsigned long i, num_pages;  	int rw = (epd->sync_io ? WRITE_SYNC : WRITE); -	int ret; +	int ret = 0;  	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);  	num_pages = num_extent_pages(eb->start, eb->len); @@ -3330,6 +3325,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  			     writepage_t writepage, void *data,  			     void (*flush_fn)(void *))  { +	struct inode *inode = mapping->host;  	int ret = 0;  	int done = 0;  	int nr_to_write_done = 0; @@ -3340,6 +3336,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  	int scanned = 0;  	int tag; +	/* +	 * We have to hold onto the inode so that ordered extents can do their +	 * work when the IO finishes.  The alternative to this is failing to add +	 * an ordered extent if the igrab() fails there and that is a huge pain +	 * to deal with, so instead just hold onto the inode throughout the +	 * writepages operation.  If it fails here we are freeing up the inode +	 * anyway and we'd rather not waste our time writing out stuff that is +	 * going to be truncated anyway. 
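+	 *
+	 * The reference taken here is dropped at the bottom of this function
+	 * via btrfs_add_delayed_iput(), so the final iput happens in a safe
+	 * context instead of inside writeback; a sketch of the pairing:
+	 *
+	 *	if (!igrab(inode))
+	 *		return 0;
+	 *	...
+	 *	btrfs_add_delayed_iput(inode);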
+	 */ +	if (!igrab(inode)) +		return 0; +  	pagevec_init(&pvec, 0);  	if (wbc->range_cyclic) {  		index = mapping->writeback_index; /* Start from prev offset */ @@ -3434,6 +3442,7 @@ retry:  		index = 0;  		goto retry;  	} +	btrfs_add_delayed_iput(inode);  	return ret;  } @@ -3931,6 +3940,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	eb->start = start;  	eb->len = len;  	eb->tree = tree; +	eb->bflags = 0;  	rwlock_init(&eb->lock);  	atomic_set(&eb->write_locks, 0);  	atomic_set(&eb->read_locks, 0); @@ -3968,6 +3978,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	return eb;  } +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ +	unsigned long i; +	struct page *p; +	struct extent_buffer *new; +	unsigned long num_pages = num_extent_pages(src->start, src->len); + +	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); +	if (new == NULL) +		return NULL; + +	for (i = 0; i < num_pages; i++) { +		p = alloc_page(GFP_ATOMIC); +		BUG_ON(!p); +		attach_extent_buffer_page(new, p); +		WARN_ON(PageDirty(p)); +		SetPageUptodate(p); +		new->pages[i] = p; +	} + +	copy_extent_buffer(new, src, 0, 0, src->len); +	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); +	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + +	return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ +	struct extent_buffer *eb; +	unsigned long num_pages = num_extent_pages(0, len); +	unsigned long i; + +	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); +	if (!eb) +		return NULL; + +	for (i = 0; i < num_pages; i++) { +		eb->pages[i] = alloc_page(GFP_ATOMIC); +		if (!eb->pages[i]) +			goto err; +	} +	set_extent_buffer_uptodate(eb); +	btrfs_set_header_nritems(eb, 0); +	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + +	return eb; +err: +	for (i--; i > 0; i--) +		__free_page(eb->pages[i]); +	__free_extent_buffer(eb); +	return NULL; +} +  static int extent_buffer_under_io(struct extent_buffer *eb)  {  	return (atomic_read(&eb->io_pages) || @@ -3982,18 +4046,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,  						unsigned long start_idx)  {  	unsigned long index; +	unsigned long num_pages;  	struct page *page; +	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);  	BUG_ON(extent_buffer_under_io(eb)); -	index = num_extent_pages(eb->start, eb->len); +	num_pages = num_extent_pages(eb->start, eb->len); +	index = start_idx + num_pages;  	if (start_idx >= index)  		return;  	do {  		index--;  		page = extent_buffer_page(eb, index); -		if (page) { +		if (page && mapped) {  			spin_lock(&page->mapping->private_lock);  			/*  			 * We do this since we'll remove the pages after we've @@ -4018,6 +4085,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,  			}  			spin_unlock(&page->mapping->private_lock); +		} +		if (page) {  			/* One for when we alloced the page */  			page_cache_release(page);  		} @@ -4236,14 +4305,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)  {  	WARN_ON(atomic_read(&eb->refs) == 0);  	if (atomic_dec_and_test(&eb->refs)) { -		struct extent_io_tree *tree = eb->tree; +		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { +			spin_unlock(&eb->refs_lock); +		} else { +			struct extent_io_tree *tree = eb->tree; -		spin_unlock(&eb->refs_lock); +			spin_unlock(&eb->refs_lock); -		spin_lock(&tree->buffer_lock); -		radix_tree_delete(&tree->buffer, -				  eb->start >> PAGE_CACHE_SHIFT); -		
spin_unlock(&tree->buffer_lock); +			spin_lock(&tree->buffer_lock); +			radix_tree_delete(&tree->buffer, +					  eb->start >> PAGE_CACHE_SHIFT); +			spin_unlock(&tree->buffer_lock); +		}  		/* Should be safe to release our pages at this point */  		btrfs_release_extent_buffer_page(eb, 0); @@ -4261,6 +4334,10 @@ void free_extent_buffer(struct extent_buffer *eb)  	spin_lock(&eb->refs_lock);  	if (atomic_read(&eb->refs) == 2 && +	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) +		atomic_dec(&eb->refs); + +	if (atomic_read(&eb->refs) == 2 &&  	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&  	    !extent_buffer_under_io(eb) &&  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b516c3b8dec..25900af5b15 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,6 +39,7 @@  #define EXTENT_BUFFER_STALE 6  #define EXTENT_BUFFER_WRITEBACK 7  #define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9  /* these are flags for extent_clear_unlock_delalloc */  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -75,9 +76,6 @@ struct extent_io_ops {  			      unsigned long bio_flags);  	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);  	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); -	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, -					u64 start, u64 end, -				       struct extent_state *state);  	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,  				    struct extent_state *state, int mirror);  	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,  		   struct extent_state **cached_state, gfp_t mask);  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,  			struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +			  struct extent_state **cached_state, gfp_t mask);  int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,  		   gfp_t mask);  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);  struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  					  u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,  					 u64 start, unsigned long len);  void free_extent_buffer(struct extent_buffer *eb); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53bf2d764bb..9aa01ec2138 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -65,6 +65,21 @@ struct inode_defrag {  	int cycled;  }; +static int __compare_inode_defrag(struct inode_defrag *defrag1, +				  struct inode_defrag *defrag2) +{ +	if (defrag1->root > defrag2->root) +		return 1; +	else if (defrag1->root < defrag2->root) +		return -1; +	else if (defrag1->ino > defrag2->ino) +		return 1; +	else if (defrag1->ino < defrag2->ino) +		return -1; +	else +		return 0; +} +  /* pop a record for an inode into the defrag tree.  
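 * Entries are ordered by (root, ino) via __compare_inode_defrag() above;
 * the insertion walk below follows the usual rb-tree pattern (sketch):
 *
 *	ret = __compare_inode_defrag(defrag, entry);
 *	if (ret < 0)
 *		p = &parent->rb_left;
 *	else if (ret > 0)
 *		p = &parent->rb_right;
 *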
The lock   * must be held already   * @@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,  	struct inode_defrag *entry;  	struct rb_node **p;  	struct rb_node *parent = NULL; +	int ret;  	p = &root->fs_info->defrag_inodes.rb_node;  	while (*p) {  		parent = *p;  		entry = rb_entry(parent, struct inode_defrag, rb_node); -		if (defrag->ino < entry->ino) +		ret = __compare_inode_defrag(defrag, entry); +		if (ret < 0)  			p = &parent->rb_left; -		else if (defrag->ino > entry->ino) +		else if (ret > 0)  			p = &parent->rb_right;  		else {  			/* if we're reinserting an entry for @@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,  			goto exists;  		}  	} -	BTRFS_I(inode)->in_defrag = 1; +	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);  	rb_link_node(&defrag->rb_node, parent, p);  	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);  	return; @@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  	if (btrfs_fs_closing(root->fs_info))  		return 0; -	if (BTRFS_I(inode)->in_defrag) +	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))  		return 0;  	if (trans) @@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  	defrag->root = root->root_key.objectid;  	spin_lock(&root->fs_info->defrag_inodes_lock); -	if (!BTRFS_I(inode)->in_defrag) +	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))  		__btrfs_add_inode_defrag(inode, defrag);  	else  		kfree(defrag); @@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  /*   * must be called with the defrag_inodes lock held   */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, +					     u64 root, u64 ino,  					     struct rb_node **next)  {  	struct inode_defrag *entry = NULL; +	struct inode_defrag tmp;  	struct rb_node *p;  	struct rb_node *parent = NULL; +	int ret; + +	tmp.ino = ino; +	tmp.root = root;  	p = info->defrag_inodes.rb_node;  	while (p) {  		parent = p;  		entry = rb_entry(parent, struct inode_defrag, rb_node); -		if (ino < entry->ino) +		ret = __compare_inode_defrag(&tmp, entry); +		if (ret < 0)  			p = parent->rb_left; -		else if (ino > entry->ino) +		else if (ret > 0)  			p = parent->rb_right;  		else  			return entry;  	}  	if (next) { -		while (parent && ino > entry->ino) { +		while (parent && __compare_inode_defrag(&tmp, entry) > 0) {  			parent = rb_next(parent);  			entry = rb_entry(parent, struct inode_defrag, rb_node);  		} @@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  	struct btrfs_key key;  	struct btrfs_ioctl_defrag_range_args range;  	u64 first_ino = 0; +	u64 root_objectid = 0;  	int num_defrag;  	int defrag_batch = 1024; @@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  		n = NULL;  		/* find an inode to defrag */ -		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); +		defrag = btrfs_find_defrag_inode(fs_info, root_objectid, +						 first_ino, &n);  		if (!defrag) { -			if (n) -				defrag = rb_entry(n, struct inode_defrag, rb_node); -			else if (first_ino) { +			if (n) { +				defrag = rb_entry(n, struct inode_defrag, +						  rb_node); +			} else if (root_objectid || first_ino) { +				root_objectid = 0;  				first_ino = 0;  				continue;  			} else { @@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  		/* remove it from 
the rbtree */  first_ino = defrag->ino + 1; +		root_objectid = defrag->root;  		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);  		if (btrfs_fs_closing(fs_info)) @@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  			goto next;  		/* do a chunk of defrag */ -		BTRFS_I(inode)->in_defrag = 0; +		clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);  		range.start = defrag->last_offset;  		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,  					       defrag_batch); @@ -1305,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,  				    loff_t *ppos, size_t count, size_t ocount)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = fdentry(file)->d_inode;  	struct iov_iter i;  	ssize_t written;  	ssize_t written_buffered; @@ -1315,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,  	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,  					    count, ocount); -	/* -	 * the generic O_DIRECT will update in-memory i_size after the -	 * DIOs are done.  But our endio handlers that update the on -	 * disk i_size never update past the in memory i_size.  So we -	 * need one more update here to catch any additions to the -	 * file -	 */ -	if (inode->i_size != BTRFS_I(inode)->disk_i_size) { -		btrfs_ordered_update_i_size(inode, inode->i_size, NULL); -		mark_inode_dirty(inode); -	} -  	if (written < 0 || written == count)  		return written; @@ -1404,12 +1420,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  		goto out;  	} -	err = btrfs_update_time(file); +	err = file_update_time(file);  	if (err) {  		mutex_unlock(&inode->i_mutex);  		goto out;  	} -	BTRFS_I(inode)->sequence++;  	start_pos = round_down(pos, root->sectorsize);  	if (start_pos > i_size_read(inode)) { @@ -1466,8 +1481,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)  	 * flush down new bytes that may have been written if the  	 * application were using truncate to replace a file in place.  	 */ -	if (BTRFS_I(inode)->ordered_data_close) { -		BTRFS_I(inode)->ordered_data_close = 0; +	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, +			       &BTRFS_I(inode)->runtime_flags)) {  		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);  		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)  			filemap_flush(inode->i_mapping); @@ -1498,14 +1513,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	trace_btrfs_sync_file(file, datasync); -	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); -	if (ret) -		return ret;  	mutex_lock(&inode->i_mutex); -	/* we wait first, since the writeback may change the inode */ +	/* +	 * we wait first, since the writeback may change the inode; also, the +	 * wait on the ordered range does a filemap_write_and_wait_range, which +	 * is why we don't do it above like other file systems. 
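+	 *
+	 * The resulting flow, roughly sketched from the hunks below (not new
+	 * code):
+	 *
+	 *	mutex_lock(&inode->i_mutex);
+	 *	btrfs_wait_ordered_range(inode, start, end);
+	 *	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+	 *	    BTRFS_I(inode)->last_trans <=
+	 *	    root->fs_info->last_trans_committed)
+	 *		... nothing new to sync ...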
+	 */  	root->log_batch++; -	btrfs_wait_ordered_range(inode, 0, (u64)-1); +	btrfs_wait_ordered_range(inode, start, end);  	root->log_batch++;  	/* @@ -1523,7 +1539,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 * syncing  	 */  	smp_mb(); -	if (BTRFS_I(inode)->last_trans <= +	if (btrfs_inode_in_log(inode, root->fs_info->generation) || +	    BTRFS_I(inode)->last_trans <=  	    root->fs_info->last_trans_committed) {  		BTRFS_I(inode)->last_trans = 0;  		mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 202008ec367..6c4e2baa929 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -33,6 +33,8 @@  static int link_free_space(struct btrfs_free_space_ctl *ctl,  			   struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, +			      struct btrfs_free_space *info);  static struct inode *__lookup_free_space_inode(struct btrfs_root *root,  					       struct btrfs_path *path, @@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,  		return ERR_PTR(-ENOENT);  	} -	inode->i_mapping->flags &= ~__GFP_FS; +	mapping_set_gfp_mask(inode->i_mapping, +			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);  	return inode;  } @@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,  static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)  { -	u64 *val; +	__le64 *val;  	io_ctl_map_page(io_ctl, 1); @@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)  static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)  { -	u64 *gen; +	__le64 *gen;  	/*  	 * Skip the crc area.  If we don't check crcs then we just have a 64bit @@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,  	return 0;  } +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries.  This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache.  So run through the space cache that we just + * loaded and merge contiguous entries.  This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. 
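+ *
+ * For example (a sketch): entries [0, 64k) and [64k, 128k) loaded from the
+ * cache become one [0, 128k) entry, provided neither of them is a bitmap.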
+ */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ +	struct btrfs_free_space *e, *prev = NULL; +	struct rb_node *n; + +again: +	spin_lock(&ctl->tree_lock); +	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { +		e = rb_entry(n, struct btrfs_free_space, offset_index); +		if (!prev) +			goto next; +		if (e->bitmap || prev->bitmap) +			goto next; +		if (prev->offset + prev->bytes == e->offset) { +			unlink_free_space(ctl, prev); +			unlink_free_space(ctl, e); +			prev->bytes += e->bytes; +			kmem_cache_free(btrfs_free_space_cachep, e); +			link_free_space(ctl, prev); +			prev = NULL; +			spin_unlock(&ctl->tree_lock); +			goto again; +		} +next: +		prev = e; +	} +	spin_unlock(&ctl->tree_lock); +} +  int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  			    struct btrfs_free_space_ctl *ctl,  			    struct btrfs_path *path, u64 offset) @@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  	}  	io_ctl_drop_pages(&io_ctl); +	merge_space_tree(ctl);  	ret = 1; out:  	io_ctl_free(&io_ctl); @@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		goto out; -	ret = filemap_write_and_wait(inode->i_mapping); -	if (ret) -		goto out; +	btrfs_wait_ordered_range(inode, 0, (u64)-1);  	key.objectid = BTRFS_FREE_SPACE_OBJECTID;  	key.offset = offset; @@ -1503,29 +1543,26 @@ again:  	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;  	/* -	 * XXX - this can go away after a few releases. -	 * -	 * since the only user of btrfs_remove_free_space is the tree logging -	 * stuff, and the only way to test that is under crash conditions, we -	 * want to have this debug stuff here just in case somethings not -	 * working.  Search the bitmap for the space we are trying to use to -	 * make sure its actually there.  If its not there then we need to stop -	 * because something has gone wrong. +	 * We need to search for bits in this bitmap.  We could only cover some +	 * of the extent in this bitmap thanks to how we add space, so we need +	 * to search for as much of it as we can and clear that amount, and then +	 * go searching for the next bit. 	 */  	search_start = *offset; -	search_bytes = *bytes; +	search_bytes = ctl->unit;  	search_bytes = min(search_bytes, end - search_start + 1);  	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);  	BUG_ON(ret < 0 || search_start != *offset); -	if (*offset > bitmap_info->offset && *offset + *bytes > end) { -		bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); -		*bytes -= end - *offset + 1; -		*offset = end + 1; -	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { -		bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); -		*bytes = 0; -	} +	/* We may have found more bits than what we need */ +	search_bytes = min(search_bytes, *bytes); + +	/* Cannot clear past the end of the bitmap */ +	search_bytes = min(search_bytes, end - search_start + 1); + +	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); +	*offset += search_bytes; +	*bytes -= search_bytes;  	if (*bytes) {  		struct rb_node *next = rb_next(&bitmap_info->offset_index); @@ -1556,7 +1593,7 @@ again:  		 * everything over again.  		 
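		 * (Each pass clears at most one bitmap's worth of space; a
		 * sketch of the clamping used on the first pass above:
		 *
		 *	search_bytes = ctl->unit;
		 *	search_bytes = min(search_bytes, *bytes);
		 *	search_bytes = min(search_bytes, end - search_start + 1);
		 *
		 * after which bitmap_clear_bits() advances *offset and *bytes.)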
*/  		search_start = *offset; -		search_bytes = *bytes; +		search_bytes = ctl->unit;  		ret = search_bitmap(ctl, bitmap_info, &search_start,  				    &search_bytes);  		if (ret < 0 || search_start != *offset) @@ -1839,12 +1876,14 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  {  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *info; -	struct btrfs_free_space *next_info = NULL;  	int ret = 0;  	spin_lock(&ctl->tree_lock);  again: +	if (!bytes) +		goto out_lock; +  	info = tree_search_offset(ctl, offset, 0, 0);  	if (!info) {  		/* @@ -1865,88 +1904,48 @@ again:  		}  	} -	if (info->bytes < bytes && rb_next(&info->offset_index)) { -		u64 end; -		next_info = rb_entry(rb_next(&info->offset_index), -					     struct btrfs_free_space, -					     offset_index); - -		if (next_info->bitmap) -			end = next_info->offset + -			      BITS_PER_BITMAP * ctl->unit - 1; -		else -			end = next_info->offset + next_info->bytes; - -		if (next_info->bytes < bytes || -		    next_info->offset > offset || offset > end) { -			printk(KERN_CRIT "Found free space at %llu, size %llu," -			      " trying to use %llu\n", -			      (unsigned long long)info->offset, -			      (unsigned long long)info->bytes, -			      (unsigned long long)bytes); -			WARN_ON(1); -			ret = -EINVAL; -			goto out_lock; -		} - -		info = next_info; -	} - -	if (info->bytes == bytes) { +	if (!info->bitmap) {  		unlink_free_space(ctl, info); -		if (info->bitmap) { -			kfree(info->bitmap); -			ctl->total_bitmaps--; -		} -		kmem_cache_free(btrfs_free_space_cachep, info); -		ret = 0; -		goto out_lock; -	} +		if (offset == info->offset) { +			u64 to_free = min(bytes, info->bytes); -	if (!info->bitmap && info->offset == offset) { -		unlink_free_space(ctl, info); -		info->offset += bytes; -		info->bytes -= bytes; -		ret = link_free_space(ctl, info); -		WARN_ON(ret); -		goto out_lock; -	} +			info->bytes -= to_free; +			info->offset += to_free; +			if (info->bytes) { +				ret = link_free_space(ctl, info); +				WARN_ON(ret); +			} else { +				kmem_cache_free(btrfs_free_space_cachep, info); +			} -	if (!info->bitmap && info->offset <= offset && -	    info->offset + info->bytes >= offset + bytes) { -		u64 old_start = info->offset; -		/* -		 * we're freeing space in the middle of the info, -		 * this can happen during tree log replay -		 * -		 * first unlink the old info and then -		 * insert it again after the hole we're creating -		 */ -		unlink_free_space(ctl, info); -		if (offset + bytes < info->offset + info->bytes) { -			u64 old_end = info->offset + info->bytes; +			offset += to_free; +			bytes -= to_free; +			goto again; +		} else { +			u64 old_end = info->bytes + info->offset; -			info->offset = offset + bytes; -			info->bytes = old_end - info->offset; +			info->bytes = offset - info->offset;  			ret = link_free_space(ctl, info);  			WARN_ON(ret);  			if (ret)  				goto out_lock; -		} else { -			/* the hole we're creating ends at the end -			 * of the info struct, just free the info -			 */ -			kmem_cache_free(btrfs_free_space_cachep, info); -		} -		spin_unlock(&ctl->tree_lock); -		/* step two, insert a new info struct to cover -		 * anything before the hole -		 */ -		ret = btrfs_add_free_space(block_group, old_start, -					   offset - old_start); -		WARN_ON(ret); /* -ENOMEM */ -		goto out; +			/* Not enough bytes in this entry to satisfy us */ +			if (old_end < offset + bytes) { +				bytes -= old_end - offset; +				offset = old_end; +				goto again; +			} else if (old_end == 
offset + bytes) { +				/* all done */ +				goto out_lock; +			} +			spin_unlock(&ctl->tree_lock); + +			ret = btrfs_add_free_space(block_group, offset + bytes, +						   old_end - (offset + bytes)); +			WARN_ON(ret); +			goto out; +		}  	}  	ret = remove_from_bitmap(ctl, info, &offset, &bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ceb7b9c9edc..fb8d671d00e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {  static int btrfs_setsize(struct inode *inode, loff_t newsize);  static int btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);  static noinline int cow_file_range(struct inode *inode,  				   struct page *locked_page,  				   u64 start, u64 end, int *page_started, @@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  	ret = insert_inline_extent(trans, root, inode, start,  				   inline_len, compressed_size,  				   compress_type, compressed_pages); -	if (ret) { +	if (ret && ret != -ENOSPC) {  		btrfs_abort_transaction(trans, root, ret);  		return ret; +	} else if (ret == -ENOSPC) { +		return 1;  	} +  	btrfs_delalloc_release_metadata(inode, end + 1 - start);  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);  	return 0; @@ -827,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode,  	if (IS_ERR(trans)) {  		extent_clear_unlock_delalloc(inode,  			     &BTRFS_I(inode)->io_tree, -			     start, end, NULL, +			     start, end, locked_page,  			     EXTENT_CLEAR_UNLOCK_PAGE |  			     EXTENT_CLEAR_UNLOCK |  			     EXTENT_CLEAR_DELALLOC | @@ -960,7 +963,7 @@ out:  out_unlock:  	extent_clear_unlock_delalloc(inode,  		     &BTRFS_I(inode)->io_tree, -		     start, end, NULL, +		     start, end, locked_page,  		     EXTENT_CLEAR_UNLOCK_PAGE |  		     EXTENT_CLEAR_UNLOCK |  		     EXTENT_CLEAR_DELALLOC | @@ -983,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work)  	compress_file_range(async_cow->inode, async_cow->locked_page,  			    async_cow->start, async_cow->end, async_cow,  			    &num_added); -	if (num_added == 0) +	if (num_added == 0) { +		btrfs_add_delayed_iput(async_cow->inode);  		async_cow->inode = NULL; +	}  }  /* @@ -1017,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work)  {  	struct async_cow *async_cow;  	async_cow = container_of(work, struct async_cow, work); +	if (async_cow->inode) +		btrfs_add_delayed_iput(async_cow->inode);  	kfree(async_cow);  } @@ -1035,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  	while (start < end) {  		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);  		BUG_ON(!async_cow); /* -ENOMEM */ -		async_cow->inode = inode; +		async_cow->inode = igrab(inode);  		async_cow->root = root;  		async_cow->locked_page = locked_page;  		async_cow->start = start; @@ -1133,8 +1140,18 @@ static noinline int run_delalloc_nocow(struct inode *inode,  	u64 ino = btrfs_ino(inode);  	path = btrfs_alloc_path(); -	if (!path) +	if (!path) { +		extent_clear_unlock_delalloc(inode, +			     &BTRFS_I(inode)->io_tree, +			     start, end, locked_page, +			     EXTENT_CLEAR_UNLOCK_PAGE | +			     EXTENT_CLEAR_UNLOCK | +			     EXTENT_CLEAR_DELALLOC | +			     EXTENT_CLEAR_DIRTY | +			     EXTENT_SET_WRITEBACK | +			     EXTENT_END_WRITEBACK);  		return -ENOMEM; +	}  	nolock = 
btrfs_is_free_space_inode(root, inode); @@ -1144,6 +1161,15 @@ static noinline int run_delalloc_nocow(struct inode *inode,  		trans = btrfs_join_transaction(root);  	if (IS_ERR(trans)) { +		extent_clear_unlock_delalloc(inode, +			     &BTRFS_I(inode)->io_tree, +			     start, end, locked_page, +			     EXTENT_CLEAR_UNLOCK_PAGE | +			     EXTENT_CLEAR_UNLOCK | +			     EXTENT_CLEAR_DELALLOC | +			     EXTENT_CLEAR_DIRTY | +			     EXTENT_SET_WRITEBACK | +			     EXTENT_END_WRITEBACK);  		btrfs_free_path(path);  		return PTR_ERR(trans);  	} @@ -1324,8 +1350,11 @@ out_check:  	}  	btrfs_release_path(path); -	if (cur_offset <= end && cow_start == (u64)-1) +	if (cur_offset <= end && cow_start == (u64)-1) {  		cow_start = cur_offset; +		cur_offset = end; +	} +  	if (cow_start != (u64)-1) {  		ret = cow_file_range(inode, locked_page, cow_start, end,  				     page_started, nr_written, 1); @@ -1344,6 +1373,17 @@ error:  	if (!ret)  		ret = err; +	if (ret && cur_offset < end) +		extent_clear_unlock_delalloc(inode, +			     &BTRFS_I(inode)->io_tree, +			     cur_offset, end, locked_page, +			     EXTENT_CLEAR_UNLOCK_PAGE | +			     EXTENT_CLEAR_UNLOCK | +			     EXTENT_CLEAR_DELALLOC | +			     EXTENT_CLEAR_DIRTY | +			     EXTENT_SET_WRITEBACK | +			     EXTENT_END_WRITEBACK); +  	btrfs_free_path(path);  	return ret;  } @@ -1358,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,  	int ret;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {  		ret = run_delalloc_nocow(inode, locked_page, start, end,  					 page_started, 1, nr_written); -	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) +	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {  		ret = run_delalloc_nocow(inode, locked_page, start, end,  					 page_started, 0, nr_written); -	else if (!btrfs_test_opt(root, COMPRESS) && -		 !(BTRFS_I(inode)->force_compress) && -		 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) +	} else if (!btrfs_test_opt(root, COMPRESS) && +		   !(BTRFS_I(inode)->force_compress) && +		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {  		ret = cow_file_range(inode, locked_page, start, end,  				      page_started, nr_written, 1); -	else +	} else { +		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +			&BTRFS_I(inode)->runtime_flags);  		ret = cow_file_range_async(inode, locked_page, start, end,  					   page_started, nr_written); +	}  	return ret;  } @@ -1572,11 +1615,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	if (btrfs_is_free_space_inode(root, inode))  		metadata = 2; -	ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); -	if (ret) -		return ret; -  	if (!(rw & REQ_WRITE)) { +		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); +		if (ret) +			return ret; +  		if (bio_flags & EXTENT_BIO_COMPRESSED) {  			return btrfs_submit_compressed_read(inode, bio,  						    mirror_num, bio_flags); @@ -1815,25 +1858,24 @@ out:   * an ordered extent if the range of bytes in the file it covers are   * fully written.   
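A note on the run_delalloc_nocow() error handling added earlier in this section: the same extent_clear_unlock_delalloc() flag mask is now open-coded on the path-allocation failure, the transaction-join failure, and the common error label. A hypothetical helper (the name is invented; this is not part of the patch) shows the shared cleanup in one place:

/* Hypothetical helper, not in the patch: fail a delalloc range by
 * unlocking its pages and clearing the delalloc/dirty state, exactly
 * as the three new error paths above do by hand.
 */
static void cleanup_delalloc_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end)
{
	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, end, locked_page,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);
}

Factoring the mask this way would keep the error paths from drifting apart; the patch itself repeats the mask verbatim.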
*/
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
+	struct inode *inode = ordered_extent->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans = NULL;
-	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
 	int compress_type = 0;
 	int ret;
 	bool nolock;
-	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-					     end - start + 1);
-	if (!ret)
-		return 0;
-	BUG_ON(!ordered_extent); /* Logic error */
-
 	nolock = btrfs_is_free_space_inode(root, inode);
+	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+		ret = -EIO;
+		goto out;
+	}
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1931,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				   ordered_extent->file_offset,
 				   ordered_extent->len);
 	}
-	unlock_extent_cached(io_tree, ordered_extent->file_offset,
-			     ordered_extent->file_offset +
-			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
-		goto out;
+		goto out_unlock;
 	}
 
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1945,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_update_inode_fallback(trans, root, inode);
 		if (ret) { /* -ENOMEM or corruption */
 			btrfs_abort_transaction(trans, root, ret);
-			goto out;
+			goto out_unlock;
 		}
 	}
 	ret = 0;
+out_unlock:
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
 out:
 	if (root != root->fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1963,57 @@ out:
 			btrfs_end_transaction(trans, root);
 	}
 
+	if (ret)
+		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+				      ordered_extent->file_offset +
+				      ordered_extent->len - 1, NULL, GFP_NOFS);
+
+	/*
+	 * This needs to be done to make sure anybody waiting knows we are done
+	 * updating everything for this ordered extent. 
+	 */ +	btrfs_remove_ordered_extent(inode, ordered_extent); +  	/* once for us */  	btrfs_put_ordered_extent(ordered_extent);  	/* once for the tree */  	btrfs_put_ordered_extent(ordered_extent); -	return 0; -out_unlock: -	unlock_extent_cached(io_tree, ordered_extent->file_offset, -			     ordered_extent->file_offset + -			     ordered_extent->len - 1, &cached_state, GFP_NOFS); -	goto out; +	return ret; +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ +	struct btrfs_ordered_extent *ordered_extent; +	ordered_extent = container_of(work, struct btrfs_ordered_extent, work); +	btrfs_finish_ordered_io(ordered_extent);  }  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,  				struct extent_state *state, int uptodate)  { +	struct inode *inode = page->mapping->host; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_ordered_extent *ordered_extent = NULL; +	struct btrfs_workers *workers; +  	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);  	ClearPagePrivate2(page); -	return btrfs_finish_ordered_io(page->mapping->host, start, end); +	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, +					    end - start + 1, uptodate)) +		return 0; + +	ordered_extent->work.func = finish_ordered_fn; +	ordered_extent->work.flags = 0; + +	if (btrfs_is_free_space_inode(root, inode)) +		workers = &root->fs_info->endio_freespace_worker; +	else +		workers = &root->fs_info->endio_write_workers; +	btrfs_queue_worker(workers, &ordered_extent->work); + +	return 0;  }  /* @@ -2072,12 +2147,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,  	struct btrfs_block_rsv *block_rsv;  	int ret; -	if (!list_empty(&root->orphan_list) || +	if (atomic_read(&root->orphan_inodes) ||  	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)  		return;  	spin_lock(&root->orphan_lock); -	if (!list_empty(&root->orphan_list)) { +	if (atomic_read(&root->orphan_inodes)) {  		spin_unlock(&root->orphan_lock);  		return;  	} @@ -2134,8 +2209,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  		block_rsv = NULL;  	} -	if (list_empty(&BTRFS_I(inode)->i_orphan)) { -		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			      &BTRFS_I(inode)->runtime_flags)) {  #if 0  		/*  		 * For proper ENOSPC handling, we should do orphan @@ -2148,12 +2223,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  			insert = 1;  #endif  		insert = 1; +		atomic_dec(&root->orphan_inodes);  	} -	if (!BTRFS_I(inode)->orphan_meta_reserved) { -		BTRFS_I(inode)->orphan_meta_reserved = 1; +	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +			      &BTRFS_I(inode)->runtime_flags))  		reserve = 1; -	}  	spin_unlock(&root->orphan_lock);  	/* grab metadata reservation from transaction handle */ @@ -2166,6 +2241,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  	if (insert >= 1) {  		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));  		if (ret && ret != -EEXIST) { +			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +				  &BTRFS_I(inode)->runtime_flags);  			btrfs_abort_transaction(trans, root, ret);  			return ret;  		} @@ -2196,15 +2273,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)  	int ret = 0;  	spin_lock(&root->orphan_lock); -	if (!list_empty(&BTRFS_I(inode)->i_orphan)) { -		list_del_init(&BTRFS_I(inode)->i_orphan); +	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			   
    &BTRFS_I(inode)->runtime_flags))  		delete_item = 1; -	} -	if (BTRFS_I(inode)->orphan_meta_reserved) { -		BTRFS_I(inode)->orphan_meta_reserved = 0; +	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +			       &BTRFS_I(inode)->runtime_flags))  		release_rsv = 1; -	}  	spin_unlock(&root->orphan_lock);  	if (trans && delete_item) { @@ -2212,8 +2287,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)  		BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */  	} -	if (release_rsv) +	if (release_rsv) {  		btrfs_orphan_release_metadata(inode); +		atomic_dec(&root->orphan_inodes); +	}  	return 0;  } @@ -2341,6 +2418,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  				ret = PTR_ERR(trans);  				goto out;  			} +			printk(KERN_ERR "auto deleting %Lu\n", +			       found_key.objectid);  			ret = btrfs_del_orphan_item(trans, root,  						    found_key.objectid);  			BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -2352,9 +2431,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		 * add this inode to the orphan list so btrfs_orphan_del does  		 * the proper thing when we hit it  		 */ -		spin_lock(&root->orphan_lock); -		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); -		spin_unlock(&root->orphan_lock); +		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			&BTRFS_I(inode)->runtime_flags);  		/* if we have links, this was a truncate, lets do that */  		if (inode->i_nlink) { @@ -2510,7 +2588,7 @@ static void btrfs_read_locked_inode(struct inode *inode)  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); -	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); +	inode->i_version = btrfs_inode_sequence(leaf, inode_item);  	inode->i_generation = BTRFS_I(inode)->generation;  	inode->i_rdev = 0;  	rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2594,7 +2672,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));  	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); -	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); +	btrfs_set_inode_sequence(leaf, item, inode->i_version);  	btrfs_set_inode_transid(leaf, item, trans->transid);  	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);  	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); @@ -2752,6 +2830,8 @@ err:  		goto out;  	btrfs_i_size_write(dir, dir->i_size - name_len * 2); +	inode_inc_iversion(inode); +	inode_inc_iversion(dir);  	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;  	btrfs_update_inode(trans, root, dir);  out: @@ -3089,6 +3169,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,  	}  	btrfs_i_size_write(dir, dir->i_size - name_len * 2); +	inode_inc_iversion(dir);  	dir->i_mtime = dir->i_ctime = CURRENT_TIME;  	ret = btrfs_update_inode(trans, root, dir);  	if (ret) @@ -3607,7 +3688,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)  		 * any new writes get down to disk quickly.  		 
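The orphan hunks above trade list membership under orphan_lock for atomic bit operations on the new runtime_flags word, backed by an orphan_inodes counter. Stripped of the btrfs specifics, this is the usual one-time set/teardown idiom; a minimal generic sketch (kernel context assumed; HAS_ITEM_BIT and both helpers are invented names):

/* Sketch of the test_and_set_bit()/test_and_clear_bit() idiom the
 * orphan code now relies on: the bit itself serializes who does the
 * one-time work, so no list or extra lock is needed for that decision.
 */
static void orphan_add_once(unsigned long *flags)
{
	if (!test_and_set_bit(HAS_ITEM_BIT, flags))
		insert_orphan_item();	/* only the first setter runs this */
}

static void orphan_del_once(unsigned long *flags)
{
	if (test_and_clear_bit(HAS_ITEM_BIT, flags))
		delete_orphan_item();	/* only one caller sees the bit set */
}

This is also why btrfs_orphan_commit_root() above now keys off atomic_read(&root->orphan_inodes) rather than list emptiness.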
*/  		if (newsize == 0) -			BTRFS_I(inode)->ordered_data_close = 1; +			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, +				&BTRFS_I(inode)->runtime_flags);  		/* we don't support swapfiles, so vmtruncate shouldn't fail */  		truncate_setsize(inode, newsize); @@ -3638,6 +3720,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)  	if (attr->ia_valid) {  		setattr_copy(inode, attr); +		inode_inc_iversion(inode);  		err = btrfs_dirty_inode(inode);  		if (!err && attr->ia_valid & ATTR_MODE) @@ -3671,7 +3754,8 @@ void btrfs_evict_inode(struct inode *inode)  	btrfs_wait_ordered_range(inode, 0, (u64)-1);  	if (root->fs_info->log_root_recovering) { -		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); +		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +				 &BTRFS_I(inode)->runtime_flags));  		goto no_delete;  	} @@ -4066,7 +4150,7 @@ static struct inode *new_simple_dir(struct super_block *s,  	BTRFS_I(inode)->root = root;  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); -	BTRFS_I(inode)->dummy_inode = 1; +	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);  	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;  	inode->i_op = &btrfs_dir_ro_inode_operations; @@ -4163,7 +4247,7 @@ static void btrfs_dentry_release(struct dentry *dentry)  }  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, -				   struct nameidata *nd) +				   unsigned int flags)  {  	struct dentry *ret; @@ -4370,7 +4454,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)  	int ret = 0;  	bool nolock = false; -	if (BTRFS_I(inode)->dummy_inode) +	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))  		return 0;  	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) @@ -4403,7 +4487,7 @@ int btrfs_dirty_inode(struct inode *inode)  	struct btrfs_trans_handle *trans;  	int ret; -	if (BTRFS_I(inode)->dummy_inode) +	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))  		return 0;  	trans = btrfs_join_transaction(root); @@ -4431,46 +4515,18 @@ int btrfs_dirty_inode(struct inode *inode)   * This is a copy of file_update_time.  We need this so we can return error on   * ENOSPC for updating the inode in the case of file write and mmap writes.   */ -int btrfs_update_time(struct file *file) +static int btrfs_update_time(struct inode *inode, struct timespec *now, +			     int flags)  { -	struct inode *inode = file->f_path.dentry->d_inode; -	struct timespec now; -	int ret; -	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; - -	/* First try to exhaust all avenues to not sync */ -	if (IS_NOCMTIME(inode)) -		return 0; - -	now = current_fs_time(inode->i_sb); -	if (!timespec_equal(&inode->i_mtime, &now)) -		sync_it = S_MTIME; - -	if (!timespec_equal(&inode->i_ctime, &now)) -		sync_it |= S_CTIME; - -	if (IS_I_VERSION(inode)) -		sync_it |= S_VERSION; - -	if (!sync_it) -		return 0; - -	/* Finally allowed to write? Takes lock. 
*/ -	if (mnt_want_write_file(file)) -		return 0; - -	/* Only change inode inside the lock region */ -	if (sync_it & S_VERSION) +	if (flags & S_VERSION)  		inode_inc_iversion(inode); -	if (sync_it & S_CTIME) -		inode->i_ctime = now; -	if (sync_it & S_MTIME) -		inode->i_mtime = now; -	ret = btrfs_dirty_inode(inode); -	if (!ret) -		mark_inode_dirty_sync(inode); -	mnt_drop_write(file->f_path.mnt); -	return ret; +	if (flags & S_CTIME) +		inode->i_ctime = *now; +	if (flags & S_MTIME) +		inode->i_mtime = *now; +	if (flags & S_ATIME) +		inode->i_atime = *now; +	return btrfs_dirty_inode(inode);  }  /* @@ -4730,6 +4786,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,  	btrfs_i_size_write(parent_inode, parent_inode->i_size +  			   name_len * 2); +	inode_inc_iversion(parent_inode);  	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;  	ret = btrfs_update_inode(trans, root, parent_inode);  	if (ret) @@ -4836,7 +4893,7 @@ out_unlock:  }  static int btrfs_create(struct inode *dir, struct dentry *dentry, -			umode_t mode, struct nameidata *nd) +			umode_t mode, bool excl)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4937,6 +4994,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  	}  	btrfs_inc_nlink(inode); +	inode_inc_iversion(inode);  	inode->i_ctime = CURRENT_TIME;  	ihold(inode); @@ -5818,8 +5876,17 @@ map:  	bh_result->b_size = len;  	bh_result->b_bdev = em->bdev;  	set_buffer_mapped(bh_result); -	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) -		set_buffer_new(bh_result); +	if (create) { +		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +			set_buffer_new(bh_result); + +		/* +		 * Need to update the i_size under the extent lock so buffered +		 * readers will get the updated i_size when we unlock. 
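For context on the btrfs_update_time() rewrite above: the open-coded copy of file_update_time() can go away because the VFS this series targets dispatches through an optional ->update_time() inode operation, which is also why the inode_operations tables later in this file gain .update_time entries. Roughly, and this is a paraphrase of the VFS side rather than verbatim kernel code:

/* Paraphrased VFS-side dispatch: a filesystem that implements
 * ->update_time() gets the final say (btrfs uses this to propagate
 * -ENOSPC from btrfs_dirty_inode()); otherwise the generic update runs.
 */
static int update_time(struct inode *inode, struct timespec *now, int flags)
{
	if (inode->i_op->update_time)
		return inode->i_op->update_time(inode, now, flags);

	if (flags & S_ATIME)
		inode->i_atime = *now;
	if (flags & S_VERSION)
		inode_inc_iversion(inode);
	if (flags & S_CTIME)
		inode->i_ctime = *now;
	if (flags & S_MTIME)
		inode->i_mtime = *now;
	mark_inode_dirty_sync(inode);
	return 0;
}

This is also why btrfs_page_mkwrite() later in this file switches from btrfs_update_time() to plain file_update_time(): the VFS helper now reaches the btrfs callback anyway.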
+		 */ +		if (start + len > i_size_read(inode)) +			i_size_write(inode, start + len); +	}  	free_extent_map(em); @@ -5903,9 +5970,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)  	struct btrfs_dio_private *dip = bio->bi_private;  	struct inode *inode = dip->inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	struct btrfs_ordered_extent *ordered = NULL; -	struct extent_state *cached_state = NULL;  	u64 ordered_offset = dip->logical_offset;  	u64 ordered_bytes = dip->bytes;  	int ret; @@ -5915,73 +5980,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)  again:  	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,  						   &ordered_offset, -						   ordered_bytes); +						   ordered_bytes, !err);  	if (!ret)  		goto out_test; -	BUG_ON(!ordered); - -	trans = btrfs_join_transaction(root); -	if (IS_ERR(trans)) { -		err = -ENOMEM; -		goto out; -	} -	trans->block_rsv = &root->fs_info->delalloc_block_rsv; - -	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { -		ret = btrfs_ordered_update_i_size(inode, 0, ordered); -		if (!ret) -			err = btrfs_update_inode_fallback(trans, root, inode); -		goto out; -	} - -	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, -			 ordered->file_offset + ordered->len - 1, 0, -			 &cached_state); - -	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { -		ret = btrfs_mark_extent_written(trans, inode, -						ordered->file_offset, -						ordered->file_offset + -						ordered->len); -		if (ret) { -			err = ret; -			goto out_unlock; -		} -	} else { -		ret = insert_reserved_file_extent(trans, inode, -						  ordered->file_offset, -						  ordered->start, -						  ordered->disk_len, -						  ordered->len, -						  ordered->len, -						  0, 0, 0, -						  BTRFS_FILE_EXTENT_REG); -		unpin_extent_cache(&BTRFS_I(inode)->extent_tree, -				   ordered->file_offset, ordered->len); -		if (ret) { -			err = ret; -			WARN_ON(1); -			goto out_unlock; -		} -	} - -	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); -	ret = btrfs_ordered_update_i_size(inode, 0, ordered); -	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) -		btrfs_update_inode_fallback(trans, root, inode); -	ret = 0; -out_unlock: -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, -			     ordered->file_offset + ordered->len - 1, -			     &cached_state, GFP_NOFS); -out: -	btrfs_delalloc_release_metadata(inode, ordered->len); -	btrfs_end_transaction(trans, root); -	ordered_offset = ordered->file_offset + ordered->len; -	btrfs_put_ordered_extent(ordered); -	btrfs_put_ordered_extent(ordered); - +	ordered->work.func = finish_ordered_fn; +	ordered->work.flags = 0; +	btrfs_queue_worker(&root->fs_info->endio_write_workers, +			   &ordered->work);  out_test:  	/*  	 * our bio might span multiple ordered extents.  
If we haven't @@ -5990,12 +5996,12 @@ out_test:  	if (ordered_offset < dip->logical_offset + dip->bytes) {  		ordered_bytes = dip->logical_offset + dip->bytes -  			ordered_offset; +		ordered = NULL;  		goto again;  	}  out_done:  	bio->bi_private = dip->private; -	kfree(dip->csums);  	kfree(dip);  	/* If we had an error make sure to clear the uptodate flag */ @@ -6063,9 +6069,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,  	int ret;  	bio_get(bio); -	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); -	if (ret) -		goto err; + +	if (!write) { +		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); +		if (ret) +			goto err; +	}  	if (skip_sum)  		goto map; @@ -6360,12 +6369,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,  		 */  		ordered = btrfs_lookup_ordered_range(inode, lockstart,  						     lockend - lockstart + 1); -		if (!ordered) + +		/* +		 * We need to make sure there are no buffered pages in this +		 * range either, we could have raced between the invalidate in +		 * generic_file_direct_write and locking the extent.  The +		 * invalidate needs to happen so that reads after a write do not +		 * get stale data. +		 */ +		if (!ordered && (!writing || +		    !test_range_bit(&BTRFS_I(inode)->io_tree, +				    lockstart, lockend, EXTENT_UPTODATE, 0, +				    cached_state)))  			break; +  		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,  				     &cached_state, GFP_NOFS); -		btrfs_start_ordered_extent(inode, ordered, 1); -		btrfs_put_ordered_extent(ordered); + +		if (ordered) { +			btrfs_start_ordered_extent(inode, ordered, 1); +			btrfs_put_ordered_extent(ordered); +		} else { +			/* Screw you mmap */ +			ret = filemap_write_and_wait_range(file->f_mapping, +							   lockstart, +							   lockend); +			if (ret) +				goto out; + +			/* +			 * If we found a page that couldn't be invalidated just +			 * fall back to buffered. +			 */ +			ret = invalidate_inode_pages2_range(file->f_mapping, +					lockstart >> PAGE_CACHE_SHIFT, +					lockend >> PAGE_CACHE_SHIFT); +			if (ret) { +				if (ret == -EBUSY) +					ret = 0; +				goto out; +			} +		} +  		cond_resched();  	} @@ -6485,13 +6530,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)  static void btrfs_invalidatepage(struct page *page, unsigned long offset)  { +	struct inode *inode = page->mapping->host;  	struct extent_io_tree *tree;  	struct btrfs_ordered_extent *ordered;  	struct extent_state *cached_state = NULL;  	u64 page_start = page_offset(page);  	u64 page_end = page_start + PAGE_CACHE_SIZE - 1; -  	/*  	 * we have the page locked, so new writeback can't start,  	 * and the dirty bit won't be cleared while we are here. 
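Backing up to the btrfs_direct_IO() loop above: the extra test_range_bit() check and the write-and-invalidate fallback close a race in which pages become uptodate between generic_file_direct_write()'s invalidation and the extent lock being taken. Condensed (error paths trimmed; the behavior follows the hunk above):

/* Condensed restatement of the fallback above: flush any racing
 * buffered pages over the locked range, then drop them from the page
 * cache; -EBUSY means a page could not be invalidated, and returning
 * 0 lets the generic write path fall back to buffered IO.
 */
ret = filemap_write_and_wait_range(file->f_mapping, lockstart, lockend);
if (!ret) {
	ret = invalidate_inode_pages2_range(file->f_mapping,
					    lockstart >> PAGE_CACHE_SHIFT,
					    lockend >> PAGE_CACHE_SHIFT);
	if (ret == -EBUSY)
		ret = 0;
}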
@@ -6501,13 +6546,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  	 */  	wait_on_page_writeback(page); -	tree = &BTRFS_I(page->mapping->host)->io_tree; +	tree = &BTRFS_I(inode)->io_tree;  	if (offset) {  		btrfs_releasepage(page, GFP_NOFS);  		return;  	}  	lock_extent_bits(tree, page_start, page_end, 0, &cached_state); -	ordered = btrfs_lookup_ordered_extent(page->mapping->host, +	ordered = btrfs_lookup_ordered_extent(inode,  					   page_offset(page));  	if (ordered) {  		/* @@ -6522,9 +6567,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  		 * whoever cleared the private bit is responsible  		 * for the finish_ordered_io  		 */ -		if (TestClearPagePrivate2(page)) { -			btrfs_finish_ordered_io(page->mapping->host, -						page_start, page_end); +		if (TestClearPagePrivate2(page) && +		    btrfs_dec_test_ordered_pending(inode, &ordered, page_start, +						   PAGE_CACHE_SIZE, 1)) { +			btrfs_finish_ordered_io(ordered);  		}  		btrfs_put_ordered_extent(ordered);  		cached_state = NULL; @@ -6576,7 +6622,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);  	if (!ret) { -		ret = btrfs_update_time(vma->vm_file); +		ret = file_update_time(vma->vm_file);  		reserved = 1;  	}  	if (ret) { @@ -6771,7 +6817,8 @@ static int btrfs_truncate(struct inode *inode)  	 * using truncate to replace the contents of the file will  	 * end up with a zero length file after a crash.  	 */ -	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) +	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, +					   &BTRFS_I(inode)->runtime_flags))  		btrfs_add_ordered_operation(trans, root, inode);  	while (1) { @@ -6894,7 +6941,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->root = NULL;  	ei->space_info = NULL;  	ei->generation = 0; -	ei->sequence = 0;  	ei->last_trans = 0;  	ei->last_sub_trans = 0;  	ei->logged_trans = 0; @@ -6909,11 +6955,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->outstanding_extents = 0;  	ei->reserved_extents = 0; -	ei->ordered_data_close = 0; -	ei->orphan_meta_reserved = 0; -	ei->dummy_inode = 0; -	ei->in_defrag = 0; -	ei->delalloc_meta_reserved = 0; +	ei->runtime_flags = 0;  	ei->force_compress = BTRFS_COMPRESS_NONE;  	ei->delayed_node = NULL; @@ -6927,7 +6969,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	mutex_init(&ei->log_mutex);  	mutex_init(&ei->delalloc_mutex);  	btrfs_ordered_inode_tree_init(&ei->ordered_tree); -	INIT_LIST_HEAD(&ei->i_orphan);  	INIT_LIST_HEAD(&ei->delalloc_inodes);  	INIT_LIST_HEAD(&ei->ordered_operations);  	RB_CLEAR_NODE(&ei->rb_node); @@ -6946,7 +6987,7 @@ void btrfs_destroy_inode(struct inode *inode)  	struct btrfs_ordered_extent *ordered;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	WARN_ON(!list_empty(&inode->i_dentry)); +	WARN_ON(!hlist_empty(&inode->i_dentry));  	WARN_ON(inode->i_data.nrpages);  	WARN_ON(BTRFS_I(inode)->outstanding_extents);  	WARN_ON(BTRFS_I(inode)->reserved_extents); @@ -6972,13 +7013,12 @@ void btrfs_destroy_inode(struct inode *inode)  		spin_unlock(&root->fs_info->ordered_extent_lock);  	} -	spin_lock(&root->orphan_lock); -	if (!list_empty(&BTRFS_I(inode)->i_orphan)) { +	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +		     &BTRFS_I(inode)->runtime_flags)) {  		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",  		       (unsigned long long)btrfs_ino(inode)); -		
list_del_init(&BTRFS_I(inode)->i_orphan); +		atomic_dec(&root->orphan_inodes);  	} -	spin_unlock(&root->orphan_lock);  	while (1) {  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -7099,10 +7139,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode)  	else  		b_inode->flags &= ~BTRFS_INODE_NODATACOW; -	if (b_dir->flags & BTRFS_INODE_COMPRESS) +	if (b_dir->flags & BTRFS_INODE_COMPRESS) {  		b_inode->flags |= BTRFS_INODE_COMPRESS; -	else -		b_inode->flags &= ~BTRFS_INODE_COMPRESS; +		b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; +	} else { +		b_inode->flags &= ~(BTRFS_INODE_COMPRESS | +				    BTRFS_INODE_NOCOMPRESS); +	}  }  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -7193,6 +7236,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))  		btrfs_add_ordered_operation(trans, root, old_inode); +	inode_inc_iversion(old_dir); +	inode_inc_iversion(new_dir); +	inode_inc_iversion(old_inode);  	old_dir->i_ctime = old_dir->i_mtime = ctime;  	new_dir->i_ctime = new_dir->i_mtime = ctime;  	old_inode->i_ctime = ctime; @@ -7219,6 +7265,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	}  	if (new_inode) { +		inode_inc_iversion(new_inode);  		new_inode->i_ctime = CURRENT_TIME;  		if (unlikely(btrfs_ino(new_inode) ==  			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -7490,6 +7537,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  		cur_offset += ins.offset;  		*alloc_hint = ins.objectid + ins.offset; +		inode_inc_iversion(inode);  		inode->i_ctime = CURRENT_TIME;  		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;  		if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -7647,6 +7695,7 @@ static const struct inode_operations btrfs_file_inode_operations = {  	.permission	= btrfs_permission,  	.fiemap		= btrfs_fiemap,  	.get_acl	= btrfs_get_acl, +	.update_time	= btrfs_update_time,  };  static const struct inode_operations btrfs_special_inode_operations = {  	.getattr	= btrfs_getattr, @@ -7657,6 +7706,7 @@ static const struct inode_operations btrfs_special_inode_operations = {  	.listxattr	= btrfs_listxattr,  	.removexattr	= btrfs_removexattr,  	.get_acl	= btrfs_get_acl, +	.update_time	= btrfs_update_time,  };  static const struct inode_operations btrfs_symlink_inode_operations = {  	.readlink	= generic_readlink, @@ -7670,6 +7720,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {  	.listxattr	= btrfs_listxattr,  	.removexattr	= btrfs_removexattr,  	.get_acl	= btrfs_get_acl, +	.update_time	= btrfs_update_time,  };  const struct dentry_operations btrfs_dentry_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14f8e1faa46..1e9f6c019ad 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@  #include "locking.h"  #include "inode-map.h"  #include "backref.h" +#include "rcu-string.h"  /* Mask out flags that are inappropriate for the given type of inode. 
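A note on the inode_inc_iversion() calls threaded through the unlink, setattr, link, rename, prealloc and ioctl hunks in this section: with the sequence now stored in the generic inode->i_version (see the btrfs_read_locked_inode and fill_inode_item hunks earlier), any metadata change becomes observable by comparing counter snapshots. A hypothetical observer (kernel context assumed; revalidate_metadata_cache() is an invented stand-in for a consumer such as an NFS server's attribute cache):

/* Hypothetical change-detection sketch: snapshot i_version, compare
 * later; any of the operations patched above will have bumped it.
 */
static u64 saved_version;

static void check_for_metadata_change(struct inode *inode)
{
	if (inode->i_version != saved_version) {
		revalidate_metadata_cache(inode);	/* invented name */
		saved_version = inode->i_version;
	}
}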
*/  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -261,6 +262,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  	}  	btrfs_update_iflags(inode); +	inode_inc_iversion(inode);  	inode->i_ctime = CURRENT_TIME;  	ret = btrfs_update_inode(trans, root, inode); @@ -367,7 +369,7 @@ static noinline int create_subvol(struct btrfs_root *root,  		return PTR_ERR(trans);  	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, -				      0, objectid, NULL, 0, 0, 0, 0); +				      0, objectid, NULL, 0, 0, 0);  	if (IS_ERR(leaf)) {  		ret = PTR_ERR(leaf);  		goto fail; @@ -784,39 +786,57 @@ none:  	return -ENOENT;  } -/* - * Validaty check of prev em and next em: - * 1) no prev/next em - * 2) prev/next em is an hole/inline extent - */ -static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)  {  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	struct extent_map *prev = NULL, *next = NULL; -	int ret = 0; +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +	struct extent_map *em; +	u64 len = PAGE_CACHE_SIZE; +	/* +	 * hopefully we have this extent in the tree already, try without +	 * the full extent lock +	 */  	read_lock(&em_tree->lock); -	prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); -	next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); +	em = lookup_extent_mapping(em_tree, start, len);  	read_unlock(&em_tree->lock); -	if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && -	    (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) -		ret = 1; -	free_extent_map(prev); -	free_extent_map(next); +	if (!em) { +		/* get the big lock and read metadata off disk */ +		lock_extent(io_tree, start, start + len - 1); +		em = btrfs_get_extent(inode, NULL, 0, start, len, 0); +		unlock_extent(io_tree, start, start + len - 1); +		if (IS_ERR(em)) +			return NULL; +	} + +	return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ +	struct extent_map *next; +	bool ret = true; + +	/* this is the last extent */ +	if (em->start + em->len >= i_size_read(inode)) +		return false; + +	next = defrag_lookup_extent(inode, em->start + em->len); +	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) +		ret = false; + +	free_extent_map(next);  	return ret;  } -static int should_defrag_range(struct inode *inode, u64 start, u64 len, -			       int thresh, u64 *last_len, u64 *skip, -			       u64 *defrag_end) +static int should_defrag_range(struct inode *inode, u64 start, int thresh, +			       u64 *last_len, u64 *skip, u64 *defrag_end)  { -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -	struct extent_map *em = NULL; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct extent_map *em;  	int ret = 1; +	bool next_mergeable = true;  	/*  	 * make sure that once we start defragging an extent, we keep on @@ -827,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,  	*skip = 0; -	/* -	 * hopefully we have this extent in the tree already, try without -	 * the full extent lock -	 */ -	read_lock(&em_tree->lock); -	em = lookup_extent_mapping(em_tree, start, len); -	read_unlock(&em_tree->lock); - -	if (!em) { -		/* get the big lock and read metadata off disk */ -		lock_extent(io_tree, start, start + len - 1); -		em = btrfs_get_extent(inode, NULL, 0, start, len, 0); -		unlock_extent(io_tree, start, start + len - 1); - -		if 
(IS_ERR(em)) -			return 0; -	} +	em = defrag_lookup_extent(inode, start); +	if (!em) +		return 0;  	/* this will cover holes, and inline extents */  	if (em->block_start >= EXTENT_MAP_LAST_BYTE) { @@ -851,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,  		goto out;  	} -	/* If we have nothing to merge with us, just skip. */ -	if (check_adjacent_extents(inode, em)) { -		ret = 0; -		goto out; -	} +	next_mergeable = defrag_check_next_extent(inode, em);  	/* -	 * we hit a real extent, if it is big don't bother defragging it again +	 * we hit a real extent, if it is big or the next extent is not a +	 * real extent, don't bother defragging it  	 */ -	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) +	if ((*last_len == 0 || *last_len >= thresh) && +	    (em->len >= thresh || !next_mergeable))  		ret = 0; -  out:  	/*  	 * last_len ends up being a counter of how many bytes we've defragged. @@ -1141,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  			break;  		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, -					 PAGE_CACHE_SIZE, extent_thresh, -					 &last_len, &skip, &defrag_end)) { +					 extent_thresh, &last_len, &skip, +					 &defrag_end)) {  			unsigned long next;  			/*  			 * the should_defrag function tells us how much to skip @@ -1303,6 +1306,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  		ret = -EINVAL;  		goto out_free;  	} +	if (device->fs_devices && device->fs_devices->seeding) { +		printk(KERN_INFO "btrfs: resizer unable to apply on " +		       "seeding device %llu\n", +		       (unsigned long long)devid); +		ret = -EINVAL; +		goto out_free; +	} +  	if (!strcmp(sizestr, "max"))  		new_size = device->bdev->bd_inode->i_size;  	else { @@ -1344,8 +1355,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  	do_div(new_size, root->sectorsize);  	new_size *= root->sectorsize; -	printk(KERN_INFO "btrfs: new size for %s is %llu\n", -		device->name, (unsigned long long)new_size); +	printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", +		      rcu_str_deref(device->name), +		      (unsigned long long)new_size);  	if (new_size > old_size) {  		trans = btrfs_start_transaction(root, 0); @@ -2262,10 +2274,17 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)  	di_args->bytes_used = dev->bytes_used;  	di_args->total_bytes = dev->total_bytes;  	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); -	if (dev->name) -		strncpy(di_args->path, dev->name, sizeof(di_args->path)); -	else +	if (dev->name) { +		struct rcu_string *name; + +		rcu_read_lock(); +		name = rcu_dereference(dev->name); +		strncpy(di_args->path, name->str, sizeof(di_args->path)); +		rcu_read_unlock(); +		di_args->path[sizeof(di_args->path) - 1] = 0; +	} else {  		di_args->path[0] = '\0'; +	}  out:  	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) @@ -2622,6 +2641,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  			btrfs_mark_buffer_dirty(leaf);  			btrfs_release_path(path); +			inode_inc_iversion(inode);  			inode->i_mtime = inode->i_ctime = CURRENT_TIME;  			/* @@ -2914,7 +2934,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)  		up_read(&info->groups_sem);  	} -	user_dest = (struct btrfs_ioctl_space_info *) +	user_dest = (struct btrfs_ioctl_space_info __user *)  		(arg + sizeof(struct btrfs_ioctl_space_args));  	if (copy_to_user(user_dest, dest_orig, alloc_size)) @@ -3042,6 +3062,28 @@ 
static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,  	return ret;  } +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, +				      void __user *arg, int reset_after_read) +{ +	struct btrfs_ioctl_get_dev_stats *sa; +	int ret; + +	if (reset_after_read && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	sa = memdup_user(arg, sizeof(*sa)); +	if (IS_ERR(sa)) +		return PTR_ERR(sa); + +	ret = btrfs_get_dev_stats(root, sa, reset_after_read); + +	if (copy_to_user(arg, sa, sizeof(*sa))) +		ret = -EFAULT; + +	kfree(sa); +	return ret; +} +  static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)  {  	int ret = 0; @@ -3212,8 +3254,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,  	}  } -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_balance(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_ioctl_balance_args *bargs;  	struct btrfs_balance_control *bctl; @@ -3225,6 +3268,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)  	if (fs_info->sb->s_flags & MS_RDONLY)  		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; +  	mutex_lock(&fs_info->volume_mutex);  	mutex_lock(&fs_info->balance_mutex); @@ -3291,6 +3338,7 @@ out_bargs:  out:  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex); +	mnt_drop_write_file(file);  	return ret;  } @@ -3386,7 +3434,7 @@ long btrfs_ioctl(struct file *file, unsigned int  	case BTRFS_IOC_DEV_INFO:  		return btrfs_ioctl_dev_info(root, argp);  	case BTRFS_IOC_BALANCE: -		return btrfs_ioctl_balance(root, NULL); +		return btrfs_ioctl_balance(file, NULL);  	case BTRFS_IOC_CLONE:  		return btrfs_ioctl_clone(file, arg, 0, 0, 0);  	case BTRFS_IOC_CLONE_RANGE: @@ -3419,11 +3467,15 @@ long btrfs_ioctl(struct file *file, unsigned int  	case BTRFS_IOC_SCRUB_PROGRESS:  		return btrfs_ioctl_scrub_progress(root, argp);  	case BTRFS_IOC_BALANCE_V2: -		return btrfs_ioctl_balance(root, argp); +		return btrfs_ioctl_balance(file, argp);  	case BTRFS_IOC_BALANCE_CTL:  		return btrfs_ioctl_balance_ctl(root, arg);  	case BTRFS_IOC_BALANCE_PROGRESS:  		return btrfs_ioctl_balance_progress(root, argp); +	case BTRFS_IOC_GET_DEV_STATS: +		return btrfs_ioctl_get_dev_stats(root, argp, 0); +	case BTRFS_IOC_GET_AND_RESET_DEV_STATS: +		return btrfs_ioctl_get_dev_stats(root, argp, 1);  	}  	return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 086e6bdae1c..e440aa653c3 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {  	__u64				inodes;  }; +enum btrfs_dev_stat_values { +	/* disk I/O failure stats */ +	BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ +	BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ +	BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ + +	/* stats for indirect indications for I/O failures */ +	BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or +					 * contents is illegal: this is an +					 * indication that the block was damaged +					 * during read or write, or written to +					 * wrong location or read from wrong +					 * location */ +	BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not +					 * been written */ + +	BTRFS_DEV_STAT_VALUES_MAX +}; + +struct btrfs_ioctl_get_dev_stats { +	__u64 devid;				/* 
in */ +	__u64 nr_items;				/* in/out */ + +	/* out values: */ +	__u64 values[BTRFS_DEV_STAT_VALUES_MAX]; + +	__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ +}; +  #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \  				   struct btrfs_ioctl_vol_args)  #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -310,7 +339,7 @@ struct btrfs_ioctl_logical_ino_args {  #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)  #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \  				   struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)  #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)  #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \  			      struct btrfs_ioctl_scrub_args) @@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {  					struct btrfs_ioctl_ino_path_args)  #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \  					struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ +				      struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ +					struct btrfs_ioctl_get_dev_stats)  #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bbf6d0d9aeb..643335a4fe3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	entry->len = len;  	entry->disk_len = disk_len;  	entry->bytes_left = len; -	entry->inode = inode; +	entry->inode = igrab(inode);  	entry->compress_type = compress_type;  	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)  		set_bit(type, &entry->flags); @@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	trace_btrfs_ordered_extent_add(inode, entry); -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_insert(&tree->tree, file_offset,  			   &entry->rb_node);  	if (node)  		ordered_data_tree_panic(inode, -EEXIST, file_offset); -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);  	list_add_tail(&entry->root_extent_list, @@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,  	struct btrfs_ordered_inode_tree *tree;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	list_add_tail(&sum->list, &entry->list); -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  }  /* @@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,   */  int btrfs_dec_test_first_ordered_pending(struct inode *inode,  				   struct btrfs_ordered_extent **cached, -				   u64 *file_offset, u64 io_size) +				   u64 *file_offset, u64 io_size, int uptodate)  {  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node;  	struct btrfs_ordered_extent *entry = NULL;  	int ret; +	unsigned long flags;  	u64 dec_end;  	u64 dec_start;  	u64 to_dec;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irqsave(&tree->lock, flags);  	node = tree_search(tree, *file_offset);  	if (!node) {  		ret = 1; @@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,  		       (unsigned long long)to_dec);  	}  	entry->bytes_left -= to_dec; +	if (!uptodate) +		set_bit(BTRFS_ORDERED_IOERR, &entry->flags); +  	if (entry->bytes_left == 
0)  		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);  	else @@ -332,7 +336,7 @@ out:  		*cached = entry;  		atomic_inc(&entry->refs);  	} -	spin_unlock(&tree->lock); +	spin_unlock_irqrestore(&tree->lock, flags);  	return ret == 0;  } @@ -347,15 +351,21 @@ out:   */  int btrfs_dec_test_ordered_pending(struct inode *inode,  				   struct btrfs_ordered_extent **cached, -				   u64 file_offset, u64 io_size) +				   u64 file_offset, u64 io_size, int uptodate)  {  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node;  	struct btrfs_ordered_extent *entry = NULL; +	unsigned long flags;  	int ret;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irqsave(&tree->lock, flags); +	if (cached && *cached) { +		entry = *cached; +		goto have_entry; +	} +  	node = tree_search(tree, file_offset);  	if (!node) {  		ret = 1; @@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,  	}  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry:  	if (!offset_in_entry(entry, file_offset)) {  		ret = 1;  		goto out; @@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,  		       (unsigned long long)io_size);  	}  	entry->bytes_left -= io_size; +	if (!uptodate) +		set_bit(BTRFS_ORDERED_IOERR, &entry->flags); +  	if (entry->bytes_left == 0)  		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);  	else @@ -383,7 +397,7 @@ out:  		*cached = entry;  		atomic_inc(&entry->refs);  	} -	spin_unlock(&tree->lock); +	spin_unlock_irqrestore(&tree->lock, flags);  	return ret == 0;  } @@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)  	trace_btrfs_ordered_extent_put(entry->inode, entry);  	if (atomic_dec_and_test(&entry->refs)) { +		if (entry->inode) +			btrfs_add_delayed_iput(entry->inode);  		while (!list_empty(&entry->list)) {  			cur = entry->list.next;  			sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)  /*   * remove an ordered extent from the tree.  No references are dropped - * and you must wake_up entry->wait.  You must hold the tree lock - * while you call this function. + * and waiters are woken up.   */ -static void __btrfs_remove_ordered_extent(struct inode *inode, -					  struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, +				 struct btrfs_ordered_extent *entry)  {  	struct btrfs_ordered_inode_tree *tree;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct rb_node *node;  	tree = &BTRFS_I(inode)->ordered_tree; +	spin_lock_irq(&tree->lock);  	node = &entry->rb_node;  	rb_erase(node, &tree->tree);  	tree->last = NULL;  	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); +	spin_unlock_irq(&tree->lock);  	spin_lock(&root->fs_info->ordered_extent_lock);  	list_del_init(&entry->root_extent_list); @@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,  		list_del_init(&BTRFS_I(inode)->ordered_operations);  	}  	spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree.  No references are dropped - * but any waiters are woken. 
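Stepping back to the device-statistics interface added in ioctl.h above: BTRFS_IOC_GET_DEV_STATS fills values[] for the given devid, and the _AND_RESET variant is additionally meant to zero the counters after reading, which is why btrfs_ioctl_get_dev_stats() above requires CAP_SYS_ADMIN for it. A hypothetical userspace caller; the include path and error handling are assumptions, while the struct usage follows the definitions above:

/* Hypothetical userspace sketch for BTRFS_IOC_GET_DEV_STATS. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "btrfs/ioctl.h"	/* assumed location of the header above */

static int print_dev_stats(const char *mntpoint, __u64 devid)
{
	struct btrfs_ioctl_get_dev_stats sa;
	int fd = open(mntpoint, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	memset(&sa, 0, sizeof(sa));
	sa.devid = devid;
	sa.nr_items = BTRFS_DEV_STAT_VALUES_MAX;	/* in/out */
	ret = ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &sa);
	if (ret == 0)
		printf("devid %llu: write_errs %llu read_errs %llu flush_errs %llu\n",
		       (unsigned long long)sa.devid,
		       (unsigned long long)sa.values[BTRFS_DEV_STAT_WRITE_ERRS],
		       (unsigned long long)sa.values[BTRFS_DEV_STAT_READ_ERRS],
		       (unsigned long long)sa.values[BTRFS_DEV_STAT_FLUSH_ERRS]);
	close(fd);
	return ret;
}

BTRFS_IOC_GET_AND_RESET_DEV_STATS would be issued the same way, just with the other request number.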
- */ -void btrfs_remove_ordered_extent(struct inode *inode, -				 struct btrfs_ordered_extent *entry) -{ -	struct btrfs_ordered_inode_tree *tree; - -	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); -	__btrfs_remove_ordered_extent(inode, entry); -	spin_unlock(&tree->lock);  	wake_up(&entry->wait);  } @@ -621,17 +623,29 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  		if (orig_end > INT_LIMIT(loff_t))  			orig_end = INT_LIMIT(loff_t);  	} -again: +  	/* start IO across the range first to instantiate any delalloc  	 * extents  	 */  	filemap_fdatawrite_range(inode->i_mapping, start, orig_end); -	/* The compression code will leave pages locked but return from -	 * writepage without setting the page writeback.  Starting again -	 * with WB_SYNC_ALL will end up waiting for the IO to actually start. +	/* +	 * So with compression we will find and lock a dirty page and clear the +	 * first one as dirty, setup an async extent, and immediately return +	 * with the entire range locked but with nobody actually marked with +	 * writeback.  So we can't just filemap_write_and_wait_range() and +	 * expect it to work since it will just kick off a thread to do the +	 * actual work.  So we need to call filemap_fdatawrite_range _again_ +	 * since it will wait on the page lock, which won't be unlocked until +	 * after the pages have been marked as writeback and so we're good to go +	 * from there.  We have to do this otherwise we'll miss the ordered +	 * extents and that results in badness.  Please Josef, do not think you +	 * know better and pull this out at some point in the future, it is +	 * right and you are wrong.  	 */ -	filemap_fdatawrite_range(inode->i_mapping, start, orig_end); +	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +		     &BTRFS_I(inode)->runtime_flags)) +		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);  	filemap_fdatawait_range(inode->i_mapping, start, orig_end); @@ -657,11 +671,6 @@ again:  			break;  		end--;  	} -	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, -			   EXTENT_DELALLOC, 0, NULL)) { -		schedule_timeout(1); -		goto again; -	}  }  /* @@ -676,7 +685,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node)  		goto out; @@ -687,7 +696,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,  	if (entry)  		atomic_inc(&entry->refs);  out: -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	return entry;  } @@ -703,7 +712,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node) {  		node = tree_search(tree, file_offset + len); @@ -728,7 +737,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,  out:  	if (entry)  		atomic_inc(&entry->refs); -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	return entry;  } @@ -744,7 +753,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node)  		goto 
out;
@@ -752,7 +761,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	atomic_inc(&entry->refs);
 out:
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return entry;
 }
@@ -764,7 +773,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered)
 {
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	u64 disk_i_size;
 	u64 new_i_size;
 	u64 i_size_test;
@@ -779,7 +787,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	else
 		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 	/* truncate file */
@@ -798,14 +806,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	}
 	/*
-	 * we can't update the disk_isize if there are delalloc bytes
-	 * between disk_i_size and  this ordered extent
-	 */
-	if (test_range_bit(io_tree, disk_i_size, offset - 1,
-			   EXTENT_DELALLOC, 0, NULL)) {
-		goto out;
-	}
-	/*
 	 * walk backward from this ordered extent to disk_i_size.
 	 * if we find an ordered extent then we can't update disk i_size
 	 * yet
@@ -825,15 +825,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 		}
 		node = prev;
 	}
-	while (node) {
+	for (; node; node = rb_prev(node)) {
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+		/* We treat this entry as if it doesn't exist */
+		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+			continue;
 		if (test->file_offset + test->len <= disk_i_size)
 			break;
 		if (test->file_offset >= i_size)
 			break;
 		if (test->file_offset >= disk_i_size)
 			goto out;
-		node = rb_prev(node);
 	}
 	new_i_size = min_t(u64, offset, i_size);
@@ -851,43 +854,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 		else
 			node = rb_first(&tree->tree);
 	}
-	i_size_test = 0;
-	if (node) {
-		/*
-		 * do we have an area where IO might have finished
-		 * between our ordered extent and the next one.
-		 */
+
+	/*
+	 * We are looking for an area between our current extent and the next
+	 * ordered extent to update the i_size to.  There are 3 cases here
+	 *
+	 * 1) We don't actually have anything and we can update to i_size.
+	 * 2) We have stuff but they already did their i_size update so again we
+	 * can just update to i_size.
+	 * 3) We have an outstanding ordered extent so the most we can update
+	 * our disk_i_size to is the start of the next offset.
+	 */
+	i_size_test = i_size;
+	for (; node; node = rb_next(node)) {
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > offset)
+
+		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+			continue;
+		if (test->file_offset > offset) {
 			i_size_test = test->file_offset;
-	} else {
-		i_size_test = i_size;
+			break;
+		}
 	}
 
 	/*
 	 * i_size_test is the end of a region after this ordered
-	 * extent where there are no ordered extents.  As long as there
-	 * are no delalloc bytes in this area, it is safe to update
-	 * disk_i_size to the end of the region.
+	 * extent where there are no ordered extents; we can safely set
+	 * disk_i_size to this.
	 
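To make the three cases enumerated above concrete, here is a worked example with invented numbers (an illustration only, not part of the patch):

/*
 * Hypothetical offsets: i_size = 100, disk_i_size = 0, and the
 * just-finished ordered extent covers [0, 40), so offset = 40.
 *
 *   case 1: no ordered extents beyond ours       -> i_size_test = 100
 *   case 2: later extents exist, but all carry
 *           BTRFS_ORDERED_UPDATED_ISIZE          -> i_size_test = 100
 *   case 3: a pending ordered extent at [60, 80) -> i_size_test = 60
 *
 * Since i_size_test > offset in all three cases, disk_i_size becomes
 * min(i_size_test, i_size): 100, 100 and 60 respectively.
 */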
*/
-	if (i_size_test > offset &&
-	    !test_range_bit(io_tree, offset, i_size_test - 1,
-			    EXTENT_DELALLOC, 0, NULL)) {
+	if (i_size_test > offset)
 		new_i_size = min_t(u64, i_size_test, i_size);
-	}
 	BTRFS_I(inode)->disk_i_size = new_i_size;
 	ret = 0;
 out:
 	/*
-	 * we need to remove the ordered extent with the tree lock held
-	 * so that other people calling this function don't find our fully
-	 * processed ordered entry and skip updating the i_size
+	 * We need to do this because we can't remove ordered extents until
+	 * after the disk_i_size has been updated and then the inode has been
+	 * updated to reflect the change, so we need to tell anybody who finds
+	 * this ordered extent that we've already done all the real work; we
+	 * just haven't completed all the other work.
 	 */
 	if (ordered)
-		__btrfs_remove_ordered_extent(inode, ordered);
-	spin_unlock(&tree->lock);
-	if (ordered)
-		wake_up(&ordered->wait);
+		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+	spin_unlock_irq(&tree->lock);
 	return ret;
 }
@@ -912,7 +921,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	if (!ordered)
 		return 1;
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +936,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		}
 	}
 out:
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	btrfs_put_ordered_extent(ordered);
 	return ret;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a..e03c560d299 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
 
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+				       * has done its due diligence in updating
+				       * the isize. 
*/ +  struct btrfs_ordered_extent {  	/* logical offset in the file */  	u64 file_offset; @@ -113,6 +119,8 @@ struct btrfs_ordered_extent {  	/* a per root list of all the pending ordered extents */  	struct list_head root_extent_list; + +	struct btrfs_work work;  }; @@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,  				struct btrfs_ordered_extent *entry);  int btrfs_dec_test_ordered_pending(struct inode *inode,  				   struct btrfs_ordered_extent **cached, -				   u64 file_offset, u64 io_size); +				   u64 file_offset, u64 io_size, int uptodate);  int btrfs_dec_test_first_ordered_pending(struct inode *inode,  				   struct btrfs_ordered_extent **cached, -				   u64 *file_offset, u64 io_size); +				   u64 *file_offset, u64 io_size, +				   int uptodate);  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  			     u64 start, u64 len, u64 disk_len, int type);  int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452486b..5e23684887e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)  			       btrfs_dev_extent_chunk_offset(l, dev_extent),  			       (unsigned long long)  			       btrfs_dev_extent_length(l, dev_extent)); +		case BTRFS_DEV_STATS_KEY: +			printk(KERN_INFO "\t\tdevice stats\n"); +			break;  		};  	}  } diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 00000000000..9e111e4576d --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { +	struct rcu_head rcu; +	char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ +	size_t len = strlen(src) + 1; +	struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + +					 (len * sizeof(char)), mask); +	if (!ret) +		return ret; +	strncpy(ret->str, src, len); +	return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ +	if (str) +		kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do {	\ +	rcu_read_lock();		\ +	printk(fmt, __VA_ARGS__);	\ +	rcu_read_unlock();		\ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) 
do {	\ +	rcu_read_lock();				\ +	printk_ratelimited(fmt, __VA_ARGS__);		\ +	rcu_read_unlock();				\ +} while (0) + +#define rcu_str_deref(rcu_str) ({				\ +	struct rcu_string *__str = rcu_dereference(rcu_str);	\ +	__str->str;						\ +}) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ac5d0108588..48a4882d8ad 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)  {  	struct reada_machine_work *rmw;  	struct btrfs_fs_info *fs_info; +	int old_ioprio;  	rmw = container_of(work, struct reada_machine_work, work);  	fs_info = rmw->fs_info;  	kfree(rmw); +	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), +				       task_nice_ioprio(current)); +	set_task_ioprio(current, BTRFS_IOPRIO_READA);  	__reada_start_machine(fs_info); +	set_task_ioprio(current, old_ioprio);  }  static void __reada_start_machine(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f3d6f917fb..b223620cd5a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -26,6 +26,7 @@  #include "backref.h"  #include "extent_io.h"  #include "check-integrity.h" +#include "rcu-string.h"  /*   * This is only the first step towards a full-features scrub. It reads all @@ -50,7 +51,7 @@ struct scrub_dev;  struct scrub_page {  	struct scrub_block	*sblock;  	struct page		*page; -	struct block_device	*bdev; +	struct btrfs_device	*dev;  	u64			flags;  /* extent flags */  	u64			generation;  	u64			logical; @@ -86,6 +87,7 @@ struct scrub_block {  		unsigned int	header_error:1;  		unsigned int	checksum_error:1;  		unsigned int	no_io_error_seen:1; +		unsigned int	generation_error:1; /* also sets header_error */  	};  }; @@ -319,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)  	 * hold all of the paths here  	 */  	for (i = 0; i < ipath->fspath->elem_cnt; ++i) -		printk(KERN_WARNING "btrfs: %s at logical %llu on dev " +		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "  			"%s, sector %llu, root %llu, inode %llu, offset %llu, "  			"length %llu, links %u (path: %s)\n", swarn->errstr, -			swarn->logical, swarn->dev->name, +			swarn->logical, rcu_str_deref(swarn->dev->name),  			(unsigned long long)swarn->sector, root, inum, offset,  			min(isize - offset, (u64)PAGE_SIZE), nlink,  			(char *)(unsigned long)ipath->fspath->val[i]); @@ -331,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)  	return 0;  err: -	printk(KERN_WARNING "btrfs: %s at logical %llu on dev " +	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "  		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "  		"resolving failed with ret=%d\n", swarn->errstr, -		swarn->logical, swarn->dev->name, +		swarn->logical, rcu_str_deref(swarn->dev->name),  		(unsigned long long)swarn->sector, root, inum, offset, ret);  	free_ipath(ipath); @@ -389,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  		do {  			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,  							&ref_root, &ref_level); -			printk(KERN_WARNING +			printk_in_rcu(KERN_WARNING  				"btrfs: %s at logical %llu on dev %s, "  				"sector %llu: metadata %s (level %d) in tree " -				"%llu\n", errstr, swarn.logical, dev->name, +				"%llu\n", errstr, swarn.logical, +				rcu_str_deref(dev->name),  				(unsigned long long)swarn.sector,  				ref_level ? "node" : "leaf",  				ret < 0 ? 
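
The new rcu-string.h couples the rcu_head and the string into one allocation, so a device's name can be swapped with rcu_assign_pointer() and the old buffer freed with a single kfree_rcu() once readers drain. A rough userspace sketch of that layout, assuming plain malloc/free in place of kzalloc/kfree_rcu and with the RCU protocol reduced to comments (all names here are hypothetical, not btrfs code):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* stand-in for struct rcu_head, which the kernel embeds so that
	 * kfree_rcu() can defer the free until all RCU readers are done */
	struct fake_rcu_head {
		void *next;
	};

	/* userspace analogue of struct rcu_string: header plus inline payload */
	struct str_box {
		struct fake_rcu_head rcu;
		char str[];
	};

	/* analogue of rcu_string_strdup(): one allocation holds header and string */
	static struct str_box *str_box_dup(const char *src)
	{
		size_t len = strlen(src) + 1;
		struct str_box *box = malloc(sizeof(*box) + len);

		if (!box)
			return NULL;
		memcpy(box->str, src, len);	/* len already counts the NUL */
		return box;
	}

	int main(void)
	{
		struct str_box *name = str_box_dup("/dev/sdb");

		if (!name)
			return 1;
		/* a kernel reader would get here via rcu_str_deref() under rcu_read_lock() */
		printf("device name: %s\n", name->str);
		free(name);	/* the kernel defers this with kfree_rcu(name, rcu) */
		return 0;
	}
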
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d0108588..48a4882d8ad 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
 {
 	struct reada_machine_work *rmw;
 	struct btrfs_fs_info *fs_info;
+	int old_ioprio;
 
 	rmw = container_of(work, struct reada_machine_work, work);
 	fs_info = rmw->fs_info;
 	kfree(rmw);
 
+	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+				       task_nice_ioprio(current));
+	set_task_ioprio(current, BTRFS_IOPRIO_READA);
 	__reada_start_machine(fs_info);
+	set_task_ioprio(current, old_ioprio);
 }
 
 static void __reada_start_machine(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb..b223620cd5a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -26,6 +26,7 @@
 #include "backref.h"
 #include "extent_io.h"
 #include "check-integrity.h"
+#include "rcu-string.h"
 
 /*
  * This is only the first step towards a full-featured scrub. It reads all
@@ -50,7 +51,7 @@ struct scrub_dev;
 
 struct scrub_page {
 	struct scrub_block	*sblock;
 	struct page		*page;
-	struct block_device	*bdev;
+	struct btrfs_device	*dev;
 	u64			flags;  /* extent flags */
 	u64			generation;
 	u64			logical;
@@ -86,6 +87,7 @@ struct scrub_block {
 		unsigned int	header_error:1;
 		unsigned int	checksum_error:1;
 		unsigned int	no_io_error_seen:1;
+		unsigned int	generation_error:1; /* also sets header_error */
 	};
 };
 
@@ -319,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	 * hold all of the paths here
 	 */
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
-		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
 			"length %llu, links %u (path: %s)\n", swarn->errstr,
-			swarn->logical, swarn->dev->name,
+			swarn->logical, rcu_str_deref(swarn->dev->name),
 			(unsigned long long)swarn->sector, root, inum, offset,
 			min(isize - offset, (u64)PAGE_SIZE), nlink,
 			(char *)(unsigned long)ipath->fspath->val[i]);
@@ -331,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	return 0;
 
 err:
-	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 		"resolving failed with ret=%d\n", swarn->errstr,
-		swarn->logical, swarn->dev->name,
+		swarn->logical, rcu_str_deref(swarn->dev->name),
 		(unsigned long long)swarn->sector, root, inum, offset, ret);
 
 	free_ipath(ipath);
@@ -389,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 		do {
 			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 							&ref_root, &ref_level);
-			printk(KERN_WARNING
+			printk_in_rcu(KERN_WARNING
 				"btrfs: %s at logical %llu on dev %s, "
 				"sector %llu: metadata %s (level %d) in tree "
-				"%llu\n", errstr, swarn.logical, dev->name,
+				"%llu\n", errstr, swarn.logical,
+				rcu_str_deref(dev->name),
 				(unsigned long long)swarn.sector,
 				ref_level ? "node" : "leaf",
 				ret < 0 ? -1 : ref_level,
@@ -579,9 +582,11 @@ out:
 		spin_lock(&sdev->stat_lock);
 		++sdev->stat.uncorrectable_errors;
 		spin_unlock(&sdev->stat_lock);
-		printk_ratelimited(KERN_ERR
+
+		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
-			(unsigned long long)fixup->logical, sdev->dev->name);
+			(unsigned long long)fixup->logical,
+			rcu_str_deref(sdev->dev->name));
 	}
 
 	btrfs_free_path(path);
@@ -675,6 +680,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		sdev->stat.read_errors++;
 		sdev->stat.uncorrectable_errors++;
 		spin_unlock(&sdev->stat_lock);
+		btrfs_dev_stat_inc_and_print(sdev->dev,
+					     BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
@@ -686,6 +693,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		sdev->stat.read_errors++;
 		sdev->stat.uncorrectable_errors++;
 		spin_unlock(&sdev->stat_lock);
+		btrfs_dev_stat_inc_and_print(sdev->dev,
+					     BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +708,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		sdev->stat.read_errors++;
 		sdev->stat.uncorrectable_errors++;
 		spin_unlock(&sdev->stat_lock);
+		btrfs_dev_stat_inc_and_print(sdev->dev,
+					     BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
@@ -725,12 +736,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		spin_unlock(&sdev->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(sdev->dev,
+					     BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
 		spin_lock(&sdev->stat_lock);
 		sdev->stat.csum_errors++;
 		spin_unlock(&sdev->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(sdev->dev,
+					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
 		spin_lock(&sdev->stat_lock);
 		sdev->stat.verify_errors++;
@@ -738,6 +753,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
+		if (sblock_bad->generation_error)
+			btrfs_dev_stat_inc_and_print(sdev->dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(sdev->dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
 	if (sdev->readonly)
@@ -919,18 +940,20 @@ corrected_error:
 			spin_lock(&sdev->stat_lock);
 			sdev->stat.corrected_errors++;
 			spin_unlock(&sdev->stat_lock);
-			printk_ratelimited(KERN_ERR
+			printk_ratelimited_in_rcu(KERN_ERR
 				"btrfs: fixed up error at logical %llu on dev %s\n",
-				(unsigned long long)logical, sdev->dev->name);
+				(unsigned long long)logical,
+				rcu_str_deref(sdev->dev->name));
 		}
 	} else {
 did_not_correct_error:
 		spin_lock(&sdev->stat_lock);
 		sdev->stat.uncorrectable_errors++;
 		spin_unlock(&sdev->stat_lock);
-		printk_ratelimited(KERN_ERR
+		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
-			(unsigned long long)logical, sdev->dev->name);
+			(unsigned long long)logical,
+			rcu_str_deref(sdev->dev->name));
 	}
 
 out:
@@ -998,8 +1021,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 			page = sblock->pagev + page_index;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
-			/* for missing devices, bdev is NULL */
-			page->bdev = bbio->stripes[mirror_index].dev->bdev;
+			/* for missing devices, dev->bdev is NULL */
+			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
 			page->page = alloc_page(GFP_NOFS);
 			if (!page->page) {
@@ -1043,7 +1066,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		struct scrub_page *page = sblock->pagev + page_num;
 		DECLARE_COMPLETION_ONSTACK(complete);
 
-		if (page->bdev == NULL) {
+		if (page->dev->bdev == NULL) {
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
 			continue;
@@ -1053,7 +1076,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		bio = bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
-		bio->bi_bdev = page->bdev;
+		bio->bi_bdev = page->dev->bdev;
 		bio->bi_sector = page->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
 		bio->bi_private = &complete;
@@ -1102,11 +1125,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		h = (struct btrfs_header *)mapped_buffer;
 
 		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
-		    generation != le64_to_cpu(h->generation) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
-			   BTRFS_UUID_SIZE))
+			   BTRFS_UUID_SIZE)) {
+			sblock->header_error = 1;
+		} else if (generation != le64_to_cpu(h->generation)) {
 			sblock->header_error = 1;
+			sblock->generation_error = 1;
+		}
 		csum = h->csum;
 	} else {
 		if (!have_csum)
@@ -1182,7 +1208,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		bio = bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
-		bio->bi_bdev = page_bad->bdev;
+		bio->bi_bdev = page_bad->dev->bdev;
 		bio->bi_sector = page_bad->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
 		bio->bi_private = &complete;
@@ -1196,6 +1222,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		/* this will also unplug the queue */
 		wait_for_completion(&complete);
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
+			btrfs_dev_stat_inc_and_print(page_bad->dev,
+				BTRFS_DEV_STAT_WRITE_ERRS);
+			bio_put(bio);
+			return -EIO;
+		}
 		bio_put(bio);
 	}
@@ -1352,7 +1384,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	u64 mapped_size;
 	void *p;
 	u32 crc = ~(u32)0;
-	int fail = 0;
+	int fail_gen = 0;
+	int fail_cor = 0;
 	u64 len;
 	int index;
@@ -1363,13 +1396,13 @@
 	memcpy(on_disk_csum, s->csum, sdev->csum_size);
 
 	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
-		++fail;
+		++fail_cor;
 
 	if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
-		++fail;
+		++fail_gen;
 
 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
-		++fail;
+		++fail_cor;
 
 	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1427,9 @@
 	btrfs_csum_final(crc, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
-		++fail;
+		++fail_cor;
 
-	if (fail) {
+	if (fail_cor + fail_gen) {
 		/*
 		 * if we find an error in a super block, we just report it.
 		 * They will get written with the next transaction commit
@@ -1405,9 +1438,15 @@
 		spin_lock(&sdev->stat_lock);
 		++sdev->stat.super_errors;
 		spin_unlock(&sdev->stat_lock);
+		if (fail_cor)
+			btrfs_dev_stat_inc_and_print(sdev->dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(sdev->dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
 	}
 
-	return fail;
+	return fail_cor + fail_gen;
 }
 
 static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1590,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 			return -ENOMEM;
 		}
 		spage->sblock = sblock;
-		spage->bdev = sdev->dev->bdev;
+		spage->dev = sdev->dev;
 		spage->flags = flags;
 		spage->generation = gen;
 		spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195..b19d7556772 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,7 @@
 #include "version.h"
 #include "export.h"
 #include "compression.h"
+#include "rcu-string.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -188,7 +189,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
 
 	va_start(args, fmt);
 
 	if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
-		strncpy(lvl, fmt, 3);
+		memcpy(lvl, fmt, 3);
+		lvl[3] = '\0';
 		fmt += 3;
 		type = logtypes[fmt[1] - '0'];
 	} else
@@ -435,11 +437,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_thread_pool:
 			intarg = 0;
 			match_int(&args[0], &intarg);
-			if (intarg) {
+			if (intarg)
 				info->thread_pool_size = intarg;
-				printk(KERN_INFO "btrfs: thread pool %d\n",
-				       info->thread_pool_size);
-			}
 			break;
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
@@ -769,7 +768,7 @@ static int btrfs_fill_super(struct super_block *sb,
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
 #endif
-
+	sb->s_flags |= MS_I_VERSION;
 	err = open_ctree(sb, fs_devices, (char *)data);
 	if (err) {
 		printk("btrfs: open_ctree failed\n");
@@ -925,63 +924,48 @@ static inline int is_subvolume_inode(struct inode *inode)
  */
 static char *setup_root_args(char *args)
 {
-	unsigned copied = 0;
-	unsigned len = strlen(args) + 2;
-	char *pos;
-	char *ret;
+	unsigned len = strlen(args) + 2 + 1;
+	char *src, *dst, *buf;
 
 	/*
-	 * We need the same args as before, but minus
-	 *
-	 * subvol=a
-	 *
-	 * and add
+	 * We need the same args as before, but with this substitution:
+	 * s!subvol=[^,]+!subvolid=0!
 	 *
-	 * subvolid=0
-	 *
-	 * which is a difference of 2 characters, so we allocate strlen(args) +
-	 * 2 characters.
+	 * Since the replacement string is up to 2 bytes longer than the
+	 * original, allocate strlen(args) + 2 + 1 bytes.
 	 */
-	ret = kzalloc(len * sizeof(char), GFP_NOFS);
-	if (!ret)
-		return NULL;
-	pos = strstr(args, "subvol=");
 
+	src = strstr(args, "subvol=");
 	/* This shouldn't happen, but just in case.. */
-	if (!pos) {
-		kfree(ret);
+	if (!src)
+		return NULL;
+
+	buf = dst = kmalloc(len, GFP_NOFS);
+	if (!buf)
 		return NULL;
-	}
 
 	/*
-	 * The subvol=<> arg is not at the front of the string, copy everybody
-	 * up to that into ret.
+	 * If the subvol= arg is not at the start of the string,
+	 * copy whatever precedes it into buf.
 	 */
-	if (pos != args) {
-		*pos = '\0';
-		strcpy(ret, args);
-		copied += strlen(args);
-		pos++;
+	if (src != args) {
+		*src++ = '\0';
+		strcpy(buf, args);
+		dst += strlen(args);
 	}
 
-	strncpy(ret + copied, "subvolid=0", len - copied);
-
-	/* Length of subvolid=0 */
-	copied += 10;
+	strcpy(dst, "subvolid=0");
+	dst += strlen("subvolid=0");
 
 	/*
-	 * If there is no , after the subvol= option then we know there's no
-	 * other options and we can just return.
+	 * If there is a "," after the original subvol=... string,
+	 * copy that suffix into our buffer.  Otherwise, we're done.
 	 */
-	pos = strchr(pos, ',');
-	if (!pos)
-		return ret;
-
-	/* Copy the rest of the arguments into our buffer */
-	strncpy(ret + copied, pos, len - copied);
-	copied += strlen(pos);
+	src = strchr(src, ',');
+	if (src)
+		strcpy(dst, src);
 
-	return ret;
+	return buf;
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1084,7 +1068,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	}
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+		 fs_info);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto error_close_devices;
@@ -1098,7 +1083,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	} else {
 		char b[BDEVNAME_SIZE];
 
-		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		btrfs_sb(s)->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
@@ -1118,6 +1102,40 @@ error_fs_info:
 	return ERR_PTR(error);
 }
 
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+	spin_lock_irq(&workers->lock);
+	workers->max_workers = new_limit;
+	spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+				     int new_pool_size, int old_pool_size)
+{
+	if (new_pool_size == old_pool_size)
+		return;
+
+	fs_info->thread_pool_size = new_pool_size;
+
+	printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+	       old_pool_size, new_pool_size);
+
+	btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+	btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1155,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		goto restore;
 	}
 
+	btrfs_resize_thread_pool(fs_info,
+		fs_info->thread_pool_size, old_thread_pool_size);
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
@@ -1166,6 +1187,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (ret)
 			goto restore;
 
+		ret = btrfs_resume_balance_async(fs_info);
+		if (ret)
+			goto restore;
+
 		sb->s_flags &= ~MS_RDONLY;
 	}
@@ -1180,7 +1205,8 @@ restore:
 	fs_info->compress_type = old_compress_type;
 	fs_info->max_inline = old_max_inline;
 	fs_info->alloc_start = old_alloc_start;
-	fs_info->thread_pool_size = old_thread_pool_size;
+	btrfs_resize_thread_pool(fs_info,
+		old_thread_pool_size, fs_info->thread_pool_size);
 	fs_info->metadata_ratio = old_metadata_ratio;
 	return ret;
 }
@@ -1461,12 +1487,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
 				   "error %d\n", btrfs_ino(inode), ret);
 }
 
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+	struct btrfs_fs_devices *cur_devices;
+	struct btrfs_device *dev, *first_dev = NULL;
+	struct list_head *head;
+	struct rcu_string *name;
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	cur_devices = fs_info->fs_devices;
+	while (cur_devices) {
+		head = &cur_devices->devices;
+		list_for_each_entry(dev, head, dev_list) {
+			if (!first_dev || dev->devid < first_dev->devid)
+				first_dev = dev;
+		}
+		cur_devices = cur_devices->seed;
+	}
+
+	if (first_dev) {
+		rcu_read_lock();
+		name = rcu_dereference(first_dev->name);
+		seq_escape(m, name->str, " \t\n\\");
+		rcu_read_unlock();
+	} else {
+		WARN_ON(1);
+	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	return 0;
+}
+
 static const struct super_operations btrfs_super_ops = {
 	.drop_inode	= btrfs_drop_inode,
 	.evict_inode	= btrfs_evict_inode,
 	.put_super	= btrfs_put_super,
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
+	.show_devname	= btrfs_show_devname,
 	.write_inode	= btrfs_write_inode,
 	.dirty_inode	= btrfs_fs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
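
The rewritten setup_root_args() above performs exactly the substitution its comment names, s!subvol=[^,]+!subvolid=0!, preserving any options before and after. A minimal userspace sketch of the same rewrite, assuming malloc in place of kmalloc/GFP_NOFS (the function name here is hypothetical):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* replace "subvol=<value up to the next comma>" with "subvolid=0" */
	static char *setup_root_args_sketch(const char *args)
	{
		size_t len = strlen(args) + 2 + 1;	/* replacement may grow by 2 bytes */
		const char *src = strstr(args, "subvol=");
		char *buf, *dst;

		if (!src)
			return NULL;

		buf = dst = malloc(len);
		if (!buf)
			return NULL;

		/* copy whatever precedes "subvol=" */
		memcpy(buf, args, src - args);
		dst += src - args;

		strcpy(dst, "subvolid=0");
		dst += strlen("subvolid=0");

		/* keep the ",..." suffix after the subvol value, if any */
		src = strchr(src, ',');
		if (src)
			strcpy(dst, src);

		return buf;
	}

	int main(void)
	{
		char *s = setup_root_args_sketch("noatime,subvol=home,compress=lzo");

		printf("%s\n", s);	/* prints: noatime,subvolid=0,compress=lzo */
		free(s);
		return 0;
	}
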
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef6..b72b068183e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "inode-map.h"
+#include "volumes.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -55,49 +56,54 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
 	struct btrfs_transaction *cur_trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	spin_lock(&root->fs_info->trans_lock);
+	spin_lock(&fs_info->trans_lock);
 loop:
 	/* The file system has been taken offline. No new transactions. */
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-		spin_unlock(&root->fs_info->trans_lock);
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		spin_unlock(&fs_info->trans_lock);
 		return -EROFS;
 	}
 
-	if (root->fs_info->trans_no_join) {
+	if (fs_info->trans_no_join) {
 		if (!nofail) {
-			spin_unlock(&root->fs_info->trans_lock);
+			spin_unlock(&fs_info->trans_lock);
 			return -EBUSY;
 		}
 	}
 
-	cur_trans = root->fs_info->running_transaction;
+	cur_trans = fs_info->running_transaction;
 	if (cur_trans) {
 		if (cur_trans->aborted) {
-			spin_unlock(&root->fs_info->trans_lock);
+			spin_unlock(&fs_info->trans_lock);
 			return cur_trans->aborted;
 		}
 		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
 		cur_trans->num_joined++;
-		spin_unlock(&root->fs_info->trans_lock);
+		spin_unlock(&fs_info->trans_lock);
 		return 0;
 	}
-	spin_unlock(&root->fs_info->trans_lock);
+	spin_unlock(&fs_info->trans_lock);
 
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
 
-	spin_lock(&root->fs_info->trans_lock);
-	if (root->fs_info->running_transaction) {
+	spin_lock(&fs_info->trans_lock);
+	if (fs_info->running_transaction) {
 		/*
 		 * someone started a transaction after we unlocked.  Make sure
 		 * to redo the trans_no_join checks above
 		 */
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-		cur_trans = root->fs_info->running_transaction;
+		cur_trans = fs_info->running_transaction;
 		goto loop;
+	} else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		spin_unlock(&root->fs_info->trans_lock);
+		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		return -EROFS;
 	}
 
 	atomic_set(&cur_trans->num_writers, 1);
@@ -121,20 +127,38 @@ loop:
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
 	cur_trans->delayed_refs.seq = 1;
+
+	/*
+	 * although the tree mod log is per file system and not per transaction,
+	 * the log must never go across transaction boundaries.
+	 */
+	smp_mb();
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+			"creating a fresh transaction\n");
+		WARN_ON(1);
+	}
+	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+			"creating a fresh transaction\n");
+		WARN_ON(1);
+	}
+	atomic_set(&fs_info->tree_mod_seq, 0);
+
 	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
 	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);
 	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
-			     root->fs_info->btree_inode->i_mapping);
-	root->fs_info->generation++;
-	cur_trans->transid = root->fs_info->generation;
-	root->fs_info->running_transaction = cur_trans;
+			     fs_info->btree_inode->i_mapping);
+	fs_info->generation++;
+	cur_trans->transid = fs_info->generation;
+	fs_info->running_transaction = cur_trans;
 	cur_trans->aborted = 0;
-	spin_unlock(&root->fs_info->trans_lock);
+	spin_unlock(&fs_info->trans_lock);
 
 	return 0;
 }
@@ -758,6 +782,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
+	ret = btrfs_run_dev_stats(trans, root->fs_info);
+	BUG_ON(ret);
+
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
@@ -1190,14 +1217,20 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 
 static void cleanup_transaction(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
+				struct btrfs_root *root, int err)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 
 	WARN_ON(trans->use_count > 1);
 
+	btrfs_abort_transaction(trans, root, err);
+
 	spin_lock(&root->fs_info->trans_lock);
 	list_del_init(&cur_trans->list);
+	if (cur_trans == root->fs_info->running_transaction) {
+		root->fs_info->running_transaction = NULL;
+		root->fs_info->trans_no_join = 0;
+	}
 	spin_unlock(&root->fs_info->trans_lock);
 
 	btrfs_cleanup_one_transaction(trans->transaction, root);
@@ -1503,7 +1536,7 @@ cleanup_transaction:
 //	WARN_ON(1);
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
-	cleanup_transaction(trans, root);
+	cleanup_transaction(trans, root, ret);
 
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582..8abeae4224f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -690,6 +690,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	kfree(name);
 	iput(inode);
+
+	btrfs_run_delayed_items(trans, root);
 	return ret;
 }
 
@@ -895,6 +897,7 @@ again:
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
+				btrfs_run_delayed_items(trans, root);
 			}
 			kfree(victim_name);
 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
@@ -1475,6 +1478,9 @@ again:
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
 			BUG_ON(ret);
+
+			btrfs_run_delayed_items(trans, root);
+
 			kfree(name);
 			iput(inode);
 
@@ -1628,7 +1634,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	int i;
 	int ret;
 
-	btrfs_read_buffer(eb, gen);
+	ret = btrfs_read_buffer(eb, gen);
+	if (ret)
+		return ret;
 
 	level = btrfs_header_level(eb);
 
@@ -1749,7 +1757,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 			path->slots[*level]++;
 
 			if (wc->free) {
-				btrfs_read_buffer(next, ptr_gen);
+				ret = btrfs_read_buffer(next, ptr_gen);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
 
 				btrfs_tree_lock(next);
 				btrfs_set_lock_blocking(next);
@@ -1766,7 +1778,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 			free_extent_buffer(next);
 			continue;
 		}
 
-		btrfs_read_buffer(next, ptr_gen);
+		ret = btrfs_read_buffer(next, ptr_gen);
+		if (ret) {
+			free_extent_buffer(next);
+			return ret;
+		}
 
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
@@ -2657,6 +2673,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 	}
 	btrfs_release_path(path);
+	if (ret > 0)
+		ret = 0;
 	return ret;
 }
 
@@ -3028,21 +3046,6 @@ out:
 	return ret;
 }
 
-static int inode_in_log(struct btrfs_trans_handle *trans,
-		 struct inode *inode)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
-
-	mutex_lock(&root->log_mutex);
-	if (BTRFS_I(inode)->logged_trans == trans->transid &&
-	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-		ret = 1;
-	mutex_unlock(&root->log_mutex);
-	return ret;
-}
-
-
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -3083,7 +3086,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
-	if (inode_in_log(trans, inode)) {
+	if (btrfs_inode_in_log(inode, trans->transid)) {
 		ret = BTRFS_NO_LOG_SYNC;
 		goto end_no_trans;
 	}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b..ab942f46b3d 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
 *
 * ulist = ulist_alloc();
 * ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
 *
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter)) {
 * 	for (all child nodes n in elem)
 *		ulist_add(ulist, n);
 *	do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
 *
 * The allocated ulist will be returned in an initialized state.
 */
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
 {
 	struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
 
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
 * unaltered.
 */
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      unsigned long gfp_mask)
+	      gfp_t gfp_mask)
+{
+	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+		    unsigned long *old_aux, gfp_t gfp_mask)
 {
 	int i;
 
 	for (i = 0; i < ulist->nnodes; ++i) {
-		if (ulist->nodes[i].val == val)
+		if (ulist->nodes[i].val == val) {
+			if (old_aux)
+				*old_aux = ulist->nodes[i].aux;
 			return 0;
+		}
 	}
 
 	if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
 /**
 * ulist_next - iterate ulist
 * @ulist:	ulist to iterate
- * @prev:	previously returned element or %NULL to start iteration
+ * @uiter:	iterator variable, initialized with ULIST_ITER_INIT(&iterator)
 *
 * Note: locking must be provided by the caller. In case of rwlocks only read
 *       locking is needed
 *
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
 * end is reached. No guarantee is made with respect to the order in which
 * the elements are returned. They might neither be returned in order of
 * addition nor in ascending order.
 * It is allowed to call ulist_add during an enumeration. Newly added items
 * are guaranteed to show up in the running enumeration.
 */
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
 {
-	int next;
-
 	if (ulist->nnodes == 0)
 		return NULL;
-
-	if (!prev)
-		return &ulist->nodes[0];
-
-	next = (prev - ulist->nodes) + 1;
-	if (next < 0 || next >= ulist->nnodes)
+	if (uiter->i < 0 || uiter->i >= ulist->nnodes)
 		return NULL;
 
-	return &ulist->nodes[next];
+	return &ulist->nodes[uiter->i++];
 }
 EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec..21bdc8ec813 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
 */
 #define ULIST_SIZE 16
 
+struct ulist_iterator {
+	int i;
+};
+
 /*
 * element of the list
 */
@@ -59,10 +63,15 @@ struct ulist {
 void ulist_init(struct ulist *ulist);
 void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+	      gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+		    unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+			      struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
 
 #endif
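
The iterator rework above replaces "resume from the previously returned node pointer" with a plain index kept in struct ulist_iterator, which is why elements appended by ulist_add() during a walk are still visited. A toy, array-backed model of that contract (hypothetical names, not the btrfs implementation):

	#include <stdio.h>

	struct uiter { int i; };
	#define UITER_INIT(u) ((u)->i = 0)

	static long nodes[32];
	static int nnodes;

	/* like ulist_add(): ignores duplicates */
	static void toy_add(long val)
	{
		for (int i = 0; i < nnodes; i++)
			if (nodes[i] == val)
				return;
		nodes[nnodes++] = val;
	}

	/* like the new ulist_next(): the iterator is just an index */
	static long *toy_next(struct uiter *u)
	{
		if (u->i < 0 || u->i >= nnodes)
			return NULL;
		return &nodes[u->i++];
	}

	int main(void)
	{
		struct uiter u;
		long *elem;

		toy_add(1);
		UITER_INIT(&u);
		while ((elem = toy_next(&u))) {
			printf("visiting %ld\n", *elem);
			if (*elem < 3)
				toy_add(*elem + 1);	/* added mid-walk, still visited */
		}
		return 0;
	}
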
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a..ecaad40e7ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
@@ -34,11 +35,14 @@
 #include "volumes.h"
 #include "async-thread.h"
 #include "check-integrity.h"
+#include "rcu-string.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -61,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(fs_devices->devices.next,
 				    struct btrfs_device, dev_list);
 		list_del(&device->dev_list);
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 	}
 	kfree(fs_devices);
@@ -331,8 +335,8 @@ static noinline int device_list_add(const char *path,
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
+	struct rcu_string *name;
 	u64 found_transid = btrfs_super_generation(disk_super);
-	char *name;
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
@@ -361,15 +365,18 @@ static noinline int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->dev_stats_valid = 0;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
 		spin_lock_init(&device->io_lock);
-		device->name = kstrdup(path, GFP_NOFS);
-		if (!device->name) {
+
+		name = rcu_string_strdup(path, GFP_NOFS);
+		if (!name) {
 			kfree(device);
 			return -ENOMEM;
 		}
+		rcu_assign_pointer(device->name, name);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
 
 		/* init readahead state */
@@ -386,12 +393,12 @@ static noinline int device_list_add(const char *path,
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
-	} else if (!device->name || strcmp(device->name, path)) {
-		name = kstrdup(path, GFP_NOFS);
+	} else if (!device->name || strcmp(device->name->str, path)) {
+		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
-		kfree(device->name);
-		device->name = name;
+		rcu_string_free(device->name);
+		rcu_assign_pointer(device->name, name);
 		if (device->missing) {
 			fs_devices->missing_devices--;
 			device->missing = 0;
@@ -426,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	/* We have held the volume lock, it is safe to get the devices. */
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+		struct rcu_string *name;
+
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			goto error;
 
-		device->name = kstrdup(orig_dev->name, GFP_NOFS);
-		if (!device->name) {
+		/*
+		 * This is ok to do without rcu read locked because we hold the
+		 * uuid mutex so nothing we touch in here is going to disappear.
+		 */
+		name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+		if (!name) {
 			kfree(device);
 			goto error;
 		}
+		rcu_assign_pointer(device->name, name);
 
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
@@ -487,7 +501,7 @@ again:
 		}
 		list_del_init(&device->dev_list);
 		fs_devices->num_devices--;
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 	}
 
@@ -512,7 +526,7 @@ static void __free_device(struct work_struct *work)
 	if (device->bdev)
 		blkdev_put(device->bdev, device->mode);
 
-	kfree(device->name);
+	rcu_string_free(device->name);
 	kfree(device);
 }
 
@@ -536,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		struct btrfs_device *new_device;
+		struct rcu_string *name;
 
 		if (device->bdev)
 			fs_devices->open_devices--;
@@ -551,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
 		BUG_ON(!new_device); /* -ENOMEM */
 		memcpy(new_device, device, sizeof(*new_device));
-		new_device->name = kstrdup(device->name, GFP_NOFS);
-		BUG_ON(device->name && !new_device->name); /* -ENOMEM */
+
+		/* Safe because we are under uuid_mutex */
+		name = rcu_string_strdup(device->name->str, GFP_NOFS);
+		BUG_ON(device->name && !name); /* -ENOMEM */
+		rcu_assign_pointer(new_device->name, name);
 		new_device->bdev = NULL;
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
@@ -617,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (!device->name)
 			continue;
 
-		bdev = blkdev_get_by_path(device->name, flags, holder);
+		bdev = blkdev_get_by_path(device->name->str, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk(KERN_INFO "open %s failed\n", device->name);
+			printk(KERN_INFO "open %s failed\n", device->name->str);
 			goto error;
 		}
 		filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1628,12 +1646,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct block_device *bdev;
 	struct list_head *devices;
 	struct super_block *sb = root->fs_info->sb;
+	struct rcu_string *name;
 	u64 total_bytes;
 	int seeding_dev = 0;
 	int ret = 0;
 
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
-		return -EINVAL;
+		return -EROFS;
 
 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
 				  root->fs_info->bdev_holder);
@@ -1667,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		goto error;
 	}
 
-	device->name = kstrdup(device_path, GFP_NOFS);
-	if (!device->name) {
+	name = rcu_string_strdup(device_path, GFP_NOFS);
+	if (!name) {
 		kfree(device);
 		ret = -ENOMEM;
 		goto error;
 	}
+	rcu_assign_pointer(device->name, name);
 
 	ret = find_next_devid(root, &device->devid);
 	if (ret) {
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 		goto error;
 	}
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 		ret = PTR_ERR(trans);
 		goto error;
@@ -1792,7 +1812,7 @@ error_trans:
 	unlock_chunks(root);
 	btrfs_abort_transaction(trans, root, ret);
 	btrfs_end_transaction(trans, root);
-	kfree(device->name);
+	rcu_string_free(device->name);
 	kfree(device);
error:
 	blkdev_put(bdev, FMODE_EXCL);
@@ -2825,31 +2845,48 @@ out:
 
 static int balance_kthread(void *data)
 {
-	struct btrfs_balance_control *bctl =
-			(struct btrfs_balance_control *)data;
-	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;
 
 	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);
 
-	set_balance_control(bctl);
-
-	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
-		printk(KERN_INFO "btrfs: force skipping balance\n");
-	} else {
+	if (fs_info->balance_ctl) {
 		printk(KERN_INFO "btrfs: continuing balance\n");
-		ret = btrfs_balance(bctl, NULL);
+		ret = btrfs_balance(fs_info->balance_ctl, NULL);
 	}
 
 	mutex_unlock(&fs_info->balance_mutex);
 	mutex_unlock(&fs_info->volume_mutex);
+
 	return ret;
 }
 
-int btrfs_recover_balance(struct btrfs_root *tree_root)
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 {
 	struct task_struct *tsk;
+
+	spin_lock(&fs_info->balance_lock);
+	if (!fs_info->balance_ctl) {
+		spin_unlock(&fs_info->balance_lock);
+		return 0;
+	}
+	spin_unlock(&fs_info->balance_lock);
+
+	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+		printk(KERN_INFO "btrfs: force skipping balance\n");
+		return 0;
+	}
+
+	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+	if (IS_ERR(tsk))
+		return PTR_ERR(tsk);
+
+	return 0;
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
 	struct btrfs_balance_control *bctl;
 	struct btrfs_balance_item *item;
 	struct btrfs_disk_balance_args disk_bargs;
@@ -2862,29 +2899,30 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
 	if (!path)
 		return -ENOMEM;
 
-	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
-	if (!bctl) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	key.objectid = BTRFS_BALANCE_OBJECTID;
 	key.type = BTRFS_BALANCE_ITEM_KEY;
 	key.offset = 0;
 
-	ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 	if (ret < 0)
-		goto out_bctl;
+		goto out;
 	if (ret > 0) { /* ret = -ENOENT; */
 		ret = 0;
-		goto out_bctl;
+		goto out;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 
-	bctl->fs_info = tree_root->fs_info;
-	bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+	bctl->fs_info = fs_info;
+	bctl->flags = btrfs_balance_flags(leaf, item);
+	bctl->flags |= BTRFS_BALANCE_RESUME;
 
 	btrfs_balance_data(leaf, item, &disk_bargs);
 	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
@@ -2893,14 +2931,13 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
 	btrfs_balance_sys(leaf, item, &disk_bargs);
 	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
-	tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
-	if (IS_ERR(tsk))
-		ret = PTR_ERR(tsk);
-	else
-		goto out;
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
 
-out_bctl:
-	kfree(bctl);
+	set_balance_control(bctl);
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
out:
 	btrfs_free_path(path);
 	return ret;
@@ -4001,13 +4038,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+						 unsigned int stripe_index)
+{
+	/*
+	 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+	 * at most 1.
+	 * The alternative solution (instead of stealing bits from the
+	 * pointer) would be to allocate an intermediate structure
+	 * that contains the old private pointer plus the stripe_index.
+	 */
+	BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+	BUG_ON(stripe_index > 3);
+	return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+	return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+	return (unsigned int)((uintptr_t)bi_private) & 3;
+}
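
merge_stripe_index_into_bio_private() above works because the bi_private pointers involved are at least 4-byte aligned, leaving the two low bits free to carry a stripe index of 0..3. A standalone sketch of the same tag/untag round trip (hypothetical helper names):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void *tag_pointer(void *p, unsigned int idx)
	{
		assert(((uintptr_t)p & 3) == 0);	/* pointer must be 4-byte aligned */
		assert(idx <= 3);			/* only two low bits available */
		return (void *)((uintptr_t)p | idx);
	}

	static void *untag_pointer(void *p)
	{
		return (void *)((uintptr_t)p & ~(uintptr_t)3);
	}

	static unsigned int tag_of(void *p)
	{
		return (unsigned int)((uintptr_t)p & 3);
	}

	int main(void)
	{
		int *payload = malloc(sizeof(*payload));	/* malloc is suitably aligned */
		void *tagged = tag_pointer(payload, 2);

		assert(untag_pointer(tagged) == payload);
		printf("stripe index recovered: %u\n", tag_of(tagged));	/* prints 2 */
		free(payload);
		return 0;
	}
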
 
 static void btrfs_end_bio(struct bio *bio, int err)
 {
-	struct btrfs_bio *bbio = bio->bi_private;
+	struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
 	int is_orig_bio = 0;
 
-	if (err)
+	if (err) {
 		atomic_inc(&bbio->error);
+		if (err == -EIO || err == -EREMOTEIO) {
+			unsigned int stripe_index =
+				extract_stripe_index_from_bio_private(
+					bio->bi_private);
+			struct btrfs_device *dev;
+
+			BUG_ON(stripe_index >= bbio->num_stripes);
+			dev = bbio->stripes[stripe_index].dev;
+			if (dev->bdev) {
+				if (bio->bi_rw & WRITE)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_WRITE_ERRS);
+				else
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_READ_ERRS);
+				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_FLUSH_ERRS);
+				btrfs_dev_stat_print_on_error(dev);
+			}
+		}
+	}
 
 	if (bio == bbio->orig_bio)
 		is_orig_bio = 1;
@@ -4149,14 +4233,23 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 			bio = first_bio;
 		}
 		bio->bi_private = bbio;
+		bio->bi_private = merge_stripe_index_into_bio_private(
+				bio->bi_private, (unsigned int)dev_nr);
 		bio->bi_end_io = btrfs_end_bio;
 		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
 		dev = bbio->stripes[dev_nr].dev;
 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+#ifdef DEBUG
+			struct rcu_string *name;
+
+			rcu_read_lock();
+			name = rcu_dereference(dev->name);
 			pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
 				 "(%s id %llu), size=%u\n", rw,
 				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-				 dev->name, dev->devid, bio->bi_size);
+				 name->str, dev->devid, bio->bi_size);
+			rcu_read_unlock();
+#endif
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
@@ -4509,6 +4602,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	return ret;
 }
 
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+						   u64 logical, int mirror_num)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	int ret;
+	u64 map_length = 0;
+	struct btrfs_bio *bbio = NULL;
+	struct btrfs_device *device;
+
+	BUG_ON(mirror_num == 0);
+	ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+			      mirror_num);
+	if (ret) {
+		BUG_ON(bbio != NULL);
+		return NULL;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	device = bbio->stripes[mirror_num - 1].dev;
+	kfree(bbio);
+	return device;
+}
+
 int btrfs_read_chunk_tree(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -4583,3 +4698,231 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_device *device;
+	struct btrfs_path *path = NULL;
+	int i;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		int item_size;
+		struct btrfs_dev_stats_item *ptr;
+
+		key.objectid = 0;
+		key.type = BTRFS_DEV_STATS_KEY;
+		key.offset = device->devid;
+		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+		if (ret) {
+			printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+				      rcu_str_deref(device->name),
+				      (unsigned long long)device->devid);
+			__btrfs_reset_dev_stats(device);
+			device->dev_stats_valid = 1;
+			btrfs_release_path(path);
+			continue;
+		}
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+
+		ptr = btrfs_item_ptr(eb, slot,
+				     struct btrfs_dev_stats_item);
+
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (item_size >= (1 + i) * sizeof(__le64))
+				btrfs_dev_stat_set(device, i,
+					btrfs_dev_stats_value(eb, ptr, i));
+			else
+				btrfs_dev_stat_reset(device, i);
+		}
+
+		device->dev_stats_valid = 1;
+		btrfs_dev_stat_print_on_load(device);
+		btrfs_release_path(path);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+	btrfs_free_path(path);
+	return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *dev_root,
+				struct btrfs_device *device)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_dev_stats_item *ptr;
+	int ret;
+	int i;
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_STATS_KEY;
+	key.offset = device->devid;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	if (ret < 0) {
+		printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+			      ret, rcu_str_deref(device->name));
+		goto out;
+	}
+
+	if (ret == 0 &&
+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+		/* need to delete old one and insert a new one */
+		ret = btrfs_del_item(trans, dev_root, path);
+		if (ret != 0) {
+			printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+		ret = 1;
+	}
+
+	if (ret == 1) {
+		/* need to insert a new item */
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, dev_root, path,
+					      &key, sizeof(*ptr));
+		if (ret < 0) {
+			printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+	}
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_set_dev_stats_value(eb, ptr, i,
+					  btrfs_dev_stat_read(device, i));
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	int ret = 0;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (!device->dev_stats_valid || !device->dev_stats_dirty)
+			continue;
+
+		ret = update_dev_stat_item(trans, dev_root, device);
+		if (!ret)
+			device->dev_stats_dirty = 0;
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+	btrfs_dev_stat_inc(dev, index);
+	btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+	if (!dev->dev_stats_valid)
+		return;
+	printk_ratelimited_in_rcu(KERN_ERR
+			   "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+			   rcu_str_deref(dev->name),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+			   btrfs_dev_stat_read(dev,
+					       BTRFS_DEV_STAT_CORRUPTION_ERRS),
+			   btrfs_dev_stat_read(dev,
+					       BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+	printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+	       rcu_str_deref(dev->name),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats,
+			int reset_after_read)
+{
+	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int i;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	if (!dev) {
+		printk(KERN_WARNING
+		       "btrfs: get dev_stats failed, device not found\n");
+		return -ENODEV;
+	} else if (!dev->dev_stats_valid) {
+		printk(KERN_WARNING
+		       "btrfs: get dev_stats failed, not yet valid\n");
+		return -ENODEV;
+	} else if (reset_after_read) {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (stats->nr_items > i)
+				stats->values[i] =
+					btrfs_dev_stat_read_and_reset(dev, i);
+			else
+				btrfs_dev_stat_reset(dev, i);
+		}
+	} else {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+			if (stats->nr_items > i)
+				stats->values[i] = btrfs_dev_stat_read(dev, i);
+	}
+	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+	return 0;
+}
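
btrfs_dev_stat_read_and_reset() (declared among the inline helpers in volumes.h below) drains a counter with atomic_xchg, so reading and zeroing happen as one atomic step and a concurrent error increment can never be lost between the two. A userspace sketch of those semantics with C11 atomics (illustrative only, hypothetical names):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int write_errs;

	/* like btrfs_dev_stat_inc() for a single counter */
	static void stat_inc(void)
	{
		atomic_fetch_add(&write_errs, 1);
	}

	/* like btrfs_dev_stat_read_and_reset(): returns old value, zeroes counter */
	static int stat_read_and_reset(void)
	{
		return atomic_exchange(&write_errs, 0);
	}

	int main(void)
	{
		stat_inc();
		stat_inc();
		printf("drained %d errors\n", stat_read_and_reset());	/* 2 */
		printf("counter now %d\n", atomic_load(&write_errs));	/* 0 */
		return 0;
	}
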
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aa..95f6637614d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
 #include <linux/bio.h>
 #include <linux/sort.h>
 #include "async-thread.h"
+#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN	(64 * 1024)
 
@@ -57,7 +58,7 @@ struct btrfs_device {
 	/* the mode sent to blkdev_get */
 	fmode_t mode;
 
-	char *name;
+	struct rcu_string *name;
 
 	/* the internal btrfs device id */
 	u64 devid;
@@ -106,6 +107,11 @@ struct btrfs_device {
 	struct completion flush_wait;
 	int nobarriers;
 
+	/* disk I/O failure stats. For detailed description refer to
+	 * enum btrfs_dev_stat_values in ioctl.h */
+	int dev_stats_valid;
+	int dev_stats_dirty; /* counters need to be written to disk */
+	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
 };
 
 struct btrfs_fs_devices {
@@ -275,10 +281,57 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
-int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+						   u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats,
+			int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+				      int index)
+{
+	atomic_inc(dev->dev_stat_values + index);
+	dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+				      int index)
+{
+	return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+						int index)
+{
+	int ret;
+
+	ret = atomic_xchg(dev->dev_stat_values + index, 0);
+	dev->dev_stats_dirty = 1;
+	return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+				      int index, unsigned long val)
+{
+	atomic_set(dev->dev_stat_values + index, val);
+	dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+					int index)
+{
+	btrfs_dev_stat_set(dev, index, 0);
+}
 #endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e..3f4e2d69e83 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
+	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);