diff options
| author | Chris Mason <chris.mason@oracle.com> | 2012-05-31 16:50:28 -0400 | 
|---|---|---|
| committer | Chris Mason <chris.mason@oracle.com> | 2012-05-31 16:49:53 -0400 | 
| commit | 1e20932a23578bb1ec59107843574e259b96193f (patch) | |
| tree | 844ae54293c4414fc4c232a36d0e4d4939dc35aa | |
| parent | cfc442b69696b593cb442f09997dcb4cb5748171 (diff) | |
| parent | c31931088fd6cf953bd0868a2647b6c3928e6c96 (diff) | |
| download | olio-linux-3.10-1e20932a23578bb1ec59107843574e259b96193f.tar.xz olio-linux-3.10-1e20932a23578bb1ec59107843574e259b96193f.zip  | |
Merge branch 'for-chris' of git://git.jan-o-sch.net/btrfs-unstable into for-linus
Conflicts:
	fs/btrfs/ulist.h
Signed-off-by: Chris Mason <chris.mason@oracle.com>
| -rw-r--r-- | fs/btrfs/backref.c | 495 | ||||
| -rw-r--r-- | fs/btrfs/backref.h | 3 | ||||
| -rw-r--r-- | fs/btrfs/ctree.c | 849 | ||||
| -rw-r--r-- | fs/btrfs/ctree.h | 34 | ||||
| -rw-r--r-- | fs/btrfs/delayed-ref.c | 10 | ||||
| -rw-r--r-- | fs/btrfs/delayed-ref.h | 24 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 7 | ||||
| -rw-r--r-- | fs/btrfs/extent-tree.c | 10 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.c | 80 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.h | 3 | ||||
| -rw-r--r-- | fs/btrfs/ioctl.c | 2 | ||||
| -rw-r--r-- | fs/btrfs/transaction.c | 55 | ||||
| -rw-r--r-- | fs/btrfs/ulist.c | 34 | ||||
| -rw-r--r-- | fs/btrfs/ulist.h | 14 | 
14 files changed, 1368 insertions, 252 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bcec0675023..3f75895c919 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -24,22 +24,135 @@  #include "delayed-ref.h"  #include "locking.h" +struct extent_inode_elem { +	u64 inum; +	u64 offset; +	struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, +				struct btrfs_file_extent_item *fi, +				u64 extent_item_pos, +				struct extent_inode_elem **eie) +{ +	u64 data_offset; +	u64 data_len; +	struct extent_inode_elem *e; + +	data_offset = btrfs_file_extent_offset(eb, fi); +	data_len = btrfs_file_extent_num_bytes(eb, fi); + +	if (extent_item_pos < data_offset || +	    extent_item_pos >= data_offset + data_len) +		return 1; + +	e = kmalloc(sizeof(*e), GFP_NOFS); +	if (!e) +		return -ENOMEM; + +	e->next = *eie; +	e->inum = key->objectid; +	e->offset = key->offset + (extent_item_pos - data_offset); +	*eie = e; + +	return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, +				u64 extent_item_pos, +				struct extent_inode_elem **eie) +{ +	u64 disk_byte; +	struct btrfs_key key; +	struct btrfs_file_extent_item *fi; +	int slot; +	int nritems; +	int extent_type; +	int ret; + +	/* +	 * from the shared data ref, we only have the leaf but we need +	 * the key. thus, we must look into all items and see that we +	 * find one (some) with a reference to our extent item. +	 */ +	nritems = btrfs_header_nritems(eb); +	for (slot = 0; slot < nritems; ++slot) { +		btrfs_item_key_to_cpu(eb, &key, slot); +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			continue; +		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +		extent_type = btrfs_file_extent_type(eb, fi); +		if (extent_type == BTRFS_FILE_EXTENT_INLINE) +			continue; +		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ +		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); +		if (disk_byte != wanted_disk_byte) +			continue; + +		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); +		if (ret < 0) +			return ret; +	} + +	return 0; +} +  /*   * this structure records all encountered refs on the way up to the root   */  struct __prelim_ref {  	struct list_head list;  	u64 root_id; -	struct btrfs_key key; +	struct btrfs_key key_for_search;  	int level;  	int count; +	struct extent_inode_elem *inode_list;  	u64 parent;  	u64 wanted_disk_byte;  }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + *   block later to set a correct key + * + * delayed refs + * ============ + *        backref type | shared | indirect | shared | indirect + * information         |   tree |     tree |   data |     data + * --------------------+--------+----------+--------+---------- + *      parent logical |    y   |     -    |    -   |     - + *      key to resolve |    -   |     y    |    y   |     y + *  tree block logical |    -   |     -    |    -   |     - + *  root for resolving |    y   |     y    |    y   |     y + * + * - column 1:       we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + *        backref type | shared | indirect | shared | indirect + * information         |   tree |     tree |   data |     data + * --------------------+--------+----------+--------+---------- + *      parent logical |    y   |     -    |    y   |     - + *      key to resolve |    -   |     -    |    -   |     y + *  tree block logical |    y   |     y    |    y   |     y + *  root for resolving |    -   |     y    |    y   |     y + * + * - column 1, 3: we've the parent -> done + * - column 2:    we take the first key from the block to find the parent + *                (see __add_missing_keys) + * - column 4:    we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ +  static int __add_prelim_ref(struct list_head *head, u64 root_id, -			    struct btrfs_key *key, int level, u64 parent, -			    u64 wanted_disk_byte, int count) +			    struct btrfs_key *key, int level, +			    u64 parent, u64 wanted_disk_byte, int count)  {  	struct __prelim_ref *ref; @@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,  	ref->root_id = root_id;  	if (key) -		ref->key = *key; +		ref->key_for_search = *key;  	else -		memset(&ref->key, 0, sizeof(ref->key)); +		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); +	ref->inode_list = NULL;  	ref->level = level;  	ref->count = count;  	ref->parent = parent; @@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,  }  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, -				struct ulist *parents, -				struct extent_buffer *eb, int level, -				u64 wanted_objectid, u64 wanted_disk_byte) +				struct ulist *parents, int level, +				struct btrfs_key *key, u64 wanted_disk_byte, +				const u64 *extent_item_pos)  {  	int ret; -	int slot; +	int slot = path->slots[level]; +	struct extent_buffer *eb = path->nodes[level];  	struct btrfs_file_extent_item *fi; -	struct btrfs_key key; +	struct extent_inode_elem *eie = NULL;  	u64 disk_byte; +	u64 wanted_objectid = key->objectid;  add_parent: -	ret = ulist_add(parents, eb->start, 0, GFP_NOFS); +	if (level == 0 && extent_item_pos) { +		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +		ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); +		if (ret < 0) +			return ret; +	} +	ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);  	if (ret < 0)  		return ret; @@ -89,6 +211,7 @@ add_parent:  	 * repeat this until we don't find any additional EXTENT_DATA items.  	 */  	while (1) { +		eie = NULL;  		ret = btrfs_next_leaf(root, path);  		if (ret < 0)  			return ret; @@ -97,9 +220,9 @@ add_parent:  		eb = path->nodes[0];  		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { -			btrfs_item_key_to_cpu(eb, &key, slot); -			if (key.objectid != wanted_objectid || -			    key.type != BTRFS_EXTENT_DATA_KEY) +			btrfs_item_key_to_cpu(eb, key, slot); +			if (key->objectid != wanted_objectid || +			    key->type != BTRFS_EXTENT_DATA_KEY)  				return 0;  			fi = btrfs_item_ptr(eb, slot,  						struct btrfs_file_extent_item); @@ -118,8 +241,10 @@ add_parent:   */  static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,  					int search_commit_root, +					u64 time_seq,  					struct __prelim_ref *ref, -					struct ulist *parents) +					struct ulist *parents, +					const u64 *extent_item_pos)  {  	struct btrfs_path *path;  	struct btrfs_root *root; @@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,  		goto out;  	path->lowest_level = level; -	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); +	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);  	pr_debug("search slot in root %llu (level %d, ref count %d) returned "  		 "%d for key (%llu %u %llu)\n",  		 (unsigned long long)ref->root_id, level, ref->count, ret, -		 (unsigned long long)ref->key.objectid, ref->key.type, -		 (unsigned long long)ref->key.offset); +		 (unsigned long long)ref->key_for_search.objectid, +		 ref->key_for_search.type, +		 (unsigned long long)ref->key_for_search.offset);  	if (ret < 0)  		goto out; @@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,  		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);  	} -	/* the last two parameters will only be used for level == 0 */ -	ret = add_all_parents(root, path, parents, eb, level, key.objectid, -				ref->wanted_disk_byte); +	ret = add_all_parents(root, path, parents, level, &key, +				ref->wanted_disk_byte, extent_item_pos);  out:  	btrfs_free_path(path);  	return ret; @@ -191,8 +316,9 @@ out:   * resolve all indirect backrefs from the list   */  static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, -				   int search_commit_root, -				   struct list_head *head) +				   int search_commit_root, u64 time_seq, +				   struct list_head *head, +				   const u64 *extent_item_pos)  {  	int err;  	int ret = 0; @@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  	struct __prelim_ref *new_ref;  	struct ulist *parents;  	struct ulist_node *node; +	struct ulist_iterator uiter;  	parents = ulist_alloc(GFP_NOFS);  	if (!parents) @@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		if (ref->count == 0)  			continue;  		err = __resolve_indirect_ref(fs_info, search_commit_root, -					     ref, parents); +					     time_seq, ref, parents, +					     extent_item_pos);  		if (err) {  			if (ret == 0)  				ret = err; @@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		}  		/* we put the first parent into the ref at hand */ -		node = ulist_next(parents, NULL); +		ULIST_ITER_INIT(&uiter); +		node = ulist_next(parents, &uiter);  		ref->parent = node ? node->val : 0; +		ref->inode_list = +			node ? (struct extent_inode_elem *)node->aux : 0;  		/* additional parents require new refs being added here */ -		while ((node = ulist_next(parents, node))) { +		while ((node = ulist_next(parents, &uiter))) {  			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);  			if (!new_ref) {  				ret = -ENOMEM; @@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  			}  			memcpy(new_ref, ref, sizeof(*ref));  			new_ref->parent = node->val; +			new_ref->inode_list = +					(struct extent_inode_elem *)node->aux;  			list_add(&new_ref->list, &ref->list);  		}  		ulist_reinit(parents); @@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  	return ret;  } +static inline int ref_for_same_block(struct __prelim_ref *ref1, +				     struct __prelim_ref *ref2) +{ +	if (ref1->level != ref2->level) +		return 0; +	if (ref1->root_id != ref2->root_id) +		return 0; +	if (ref1->key_for_search.type != ref2->key_for_search.type) +		return 0; +	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) +		return 0; +	if (ref1->key_for_search.offset != ref2->key_for_search.offset) +		return 0; +	if (ref1->parent != ref2->parent) +		return 0; + +	return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, +			      struct list_head *head) +{ +	struct list_head *pos; +	struct extent_buffer *eb; + +	list_for_each(pos, head) { +		struct __prelim_ref *ref; +		ref = list_entry(pos, struct __prelim_ref, list); + +		if (ref->parent) +			continue; +		if (ref->key_for_search.type) +			continue; +		BUG_ON(!ref->wanted_disk_byte); +		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, +				     fs_info->tree_root->leafsize, 0); +		BUG_ON(!eb); +		btrfs_tree_read_lock(eb); +		if (btrfs_header_level(eb) == 0) +			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); +		else +			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); +		btrfs_tree_read_unlock(eb); +		free_extent_buffer(eb); +	} +	return 0; +} +  /*   * merge two lists of backrefs and adjust counts accordingly   *   * mode = 1: merge identical keys, if key is set + *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + *           additionally, we could even add a key range for the blocks we + *           looked into to merge even more (-> replace unresolved refs by those + *           having a parent).   * mode = 2: merge identical parents   */  static int __merge_refs(struct list_head *head, int mode) @@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)  		ref1 = list_entry(pos1, struct __prelim_ref, list); -		if (mode == 1 && ref1->key.type == 0) -			continue;  		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;  		     pos2 = n2, n2 = pos2->next) {  			struct __prelim_ref *ref2; +			struct __prelim_ref *xchg;  			ref2 = list_entry(pos2, struct __prelim_ref, list);  			if (mode == 1) { -				if (memcmp(&ref1->key, &ref2->key, -					   sizeof(ref1->key)) || -				    ref1->level != ref2->level || -				    ref1->root_id != ref2->root_id) +				if (!ref_for_same_block(ref1, ref2))  					continue; +				if (!ref1->parent && ref2->parent) { +					xchg = ref1; +					ref1 = ref2; +					ref2 = xchg; +				}  				ref1->count += ref2->count;  			} else {  				if (ref1->parent != ref2->parent) @@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)   * smaller or equal that seq to the list   */  static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, -			      struct btrfs_key *info_key,  			      struct list_head *prefs)  {  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;  	struct rb_node *n = &head->node.rb_node; +	struct btrfs_key key; +	struct btrfs_key op_key = {0};  	int sgn;  	int ret = 0;  	if (extent_op && extent_op->update_key) -		btrfs_disk_key_to_cpu(info_key, &extent_op->key); +		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);  	while ((n = rb_prev(n))) {  		struct btrfs_delayed_ref_node *node; @@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  			struct btrfs_delayed_tree_ref *ref;  			ref = btrfs_delayed_node_to_tree_ref(node); -			ret = __add_prelim_ref(prefs, ref->root, info_key, +			ret = __add_prelim_ref(prefs, ref->root, &op_key,  					       ref->level + 1, 0, node->bytenr,  					       node->ref_mod * sgn);  			break; @@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  			struct btrfs_delayed_tree_ref *ref;  			ref = btrfs_delayed_node_to_tree_ref(node); -			ret = __add_prelim_ref(prefs, ref->root, info_key, +			ret = __add_prelim_ref(prefs, ref->root, NULL,  					       ref->level + 1, ref->parent,  					       node->bytenr,  					       node->ref_mod * sgn); @@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  		}  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_delayed_data_ref *ref; -			struct btrfs_key key; -  			ref = btrfs_delayed_node_to_data_ref(node);  			key.objectid = ref->objectid; @@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  		}  		case BTRFS_SHARED_DATA_REF_KEY: {  			struct btrfs_delayed_data_ref *ref; -			struct btrfs_key key;  			ref = btrfs_delayed_node_to_data_ref(node); @@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,   */  static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			     struct btrfs_path *path, u64 bytenr, -			     struct btrfs_key *info_key, int *info_level, -			     struct list_head *prefs) +			     int *info_level, struct list_head *prefs)  {  	int ret = 0;  	int slot; @@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  	 * enumerate all inline refs  	 */  	leaf = path->nodes[0]; -	slot = path->slots[0] - 1; +	slot = path->slots[0];  	item_size = btrfs_item_size_nr(leaf, slot);  	BUG_ON(item_size < sizeof(*ei)); @@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		struct btrfs_tree_block_info *info; -		struct btrfs_disk_key disk_key;  		info = (struct btrfs_tree_block_info *)ptr;  		*info_level = btrfs_tree_block_level(leaf, info); -		btrfs_tree_block_key(leaf, info, &disk_key); -		btrfs_disk_key_to_cpu(info_key, &disk_key);  		ptr += sizeof(struct btrfs_tree_block_info);  		BUG_ON(ptr > end);  	} else { @@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  		switch (type) {  		case BTRFS_SHARED_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, 0, info_key, +			ret = __add_prelim_ref(prefs, 0, NULL,  						*info_level + 1, offset,  						bytenr, 1);  			break; @@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			break;  		}  		case BTRFS_TREE_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, offset, info_key, -					       *info_level + 1, 0, bytenr, 1); +			ret = __add_prelim_ref(prefs, offset, NULL, +					       *info_level + 1, 0, +					       bytenr, 1);  			break;  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_extent_data_ref *dref; @@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			key.type = BTRFS_EXTENT_DATA_KEY;  			key.offset = btrfs_extent_data_ref_offset(leaf, dref);  			root = btrfs_extent_data_ref_root(leaf, dref); -			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, -						count); +			ret = __add_prelim_ref(prefs, root, &key, 0, 0, +					       bytenr, count);  			break;  		}  		default: @@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,   */  static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			    struct btrfs_path *path, u64 bytenr, -			    struct btrfs_key *info_key, int info_level, -			    struct list_head *prefs) +			    int info_level, struct list_head *prefs)  {  	struct btrfs_root *extent_root = fs_info->extent_root;  	int ret; @@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  		switch (key.type) {  		case BTRFS_SHARED_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, 0, info_key, +			ret = __add_prelim_ref(prefs, 0, NULL,  						info_level + 1, key.offset,  						bytenr, 1);  			break; @@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			break;  		}  		case BTRFS_TREE_BLOCK_REF_KEY: -			ret = __add_prelim_ref(prefs, key.offset, info_key, -						info_level + 1, 0, bytenr, 1); +			ret = __add_prelim_ref(prefs, key.offset, NULL, +					       info_level + 1, 0, +					       bytenr, 1);  			break;  		case BTRFS_EXTENT_DATA_REF_KEY: {  			struct btrfs_extent_data_ref *dref; @@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,  			key.offset = btrfs_extent_data_ref_offset(leaf, dref);  			root = btrfs_extent_data_ref_root(leaf, dref);  			ret = __add_prelim_ref(prefs, root, &key, 0, 0, -						bytenr, count); +					       bytenr, count);  			break;  		}  		default: @@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,   */  static int find_parent_nodes(struct btrfs_trans_handle *trans,  			     struct btrfs_fs_info *fs_info, u64 bytenr, -			     u64 seq, struct ulist *refs, struct ulist *roots) +			     u64 delayed_ref_seq, u64 time_seq, +			     struct ulist *refs, struct ulist *roots, +			     const u64 *extent_item_pos)  {  	struct btrfs_key key;  	struct btrfs_path *path; -	struct btrfs_key info_key = { 0 };  	struct btrfs_delayed_ref_root *delayed_refs = NULL;  	struct btrfs_delayed_ref_head *head;  	int info_level = 0; @@ -645,7 +830,7 @@ again:  				btrfs_put_delayed_ref(&head->node);  				goto again;  			} -			ret = __add_delayed_refs(head, seq, &info_key, +			ret = __add_delayed_refs(head, delayed_ref_seq,  						 &prefs_delayed);  			if (ret) {  				spin_unlock(&delayed_refs->lock); @@ -659,16 +844,17 @@ again:  		struct extent_buffer *leaf;  		int slot; +		path->slots[0]--;  		leaf = path->nodes[0]; -		slot = path->slots[0] - 1; +		slot = path->slots[0];  		btrfs_item_key_to_cpu(leaf, &key, slot);  		if (key.objectid == bytenr &&  		    key.type == BTRFS_EXTENT_ITEM_KEY) {  			ret = __add_inline_refs(fs_info, path, bytenr, -						&info_key, &info_level, &prefs); +						&info_level, &prefs);  			if (ret)  				goto out; -			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, +			ret = __add_keyed_refs(fs_info, path, bytenr,  					       info_level, &prefs);  			if (ret)  				goto out; @@ -676,21 +862,18 @@ again:  	}  	btrfs_release_path(path); -	/* -	 * when adding the delayed refs above, the info_key might not have -	 * been known yet. Go over the list and replace the missing keys -	 */ -	list_for_each_entry(ref, &prefs_delayed, list) { -		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) -			memcpy(&ref->key, &info_key, sizeof(ref->key)); -	}  	list_splice_init(&prefs_delayed, &prefs); +	ret = __add_missing_keys(fs_info, &prefs); +	if (ret) +		goto out; +  	ret = __merge_refs(&prefs, 1);  	if (ret)  		goto out; -	ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); +	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, +				      &prefs, extent_item_pos);  	if (ret)  		goto out; @@ -709,7 +892,33 @@ again:  			BUG_ON(ret < 0);  		}  		if (ref->count && ref->parent) { -			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); +			struct extent_inode_elem *eie = NULL; +			if (extent_item_pos && !ref->inode_list) { +				u32 bsz; +				struct extent_buffer *eb; +				bsz = btrfs_level_size(fs_info->extent_root, +							info_level); +				eb = read_tree_block(fs_info->extent_root, +							   ref->parent, bsz, 0); +				BUG_ON(!eb); +				ret = find_extent_in_eb(eb, bytenr, +							*extent_item_pos, &eie); +				ref->inode_list = eie; +				free_extent_buffer(eb); +			} +			ret = ulist_add_merge(refs, ref->parent, +					      (unsigned long)ref->inode_list, +					      (unsigned long *)&eie, GFP_NOFS); +			if (!ret && extent_item_pos) { +				/* +				 * we've recorded that parent, so we must extend +				 * its inode list here +				 */ +				BUG_ON(!eie); +				while (eie->next) +					eie = eie->next; +				eie->next = ref->inode_list; +			}  			BUG_ON(ret < 0);  		}  		kfree(ref); @@ -734,6 +943,28 @@ out:  	return ret;  } +static void free_leaf_list(struct ulist *blocks) +{ +	struct ulist_node *node = NULL; +	struct extent_inode_elem *eie; +	struct extent_inode_elem *eie_next; +	struct ulist_iterator uiter; + +	ULIST_ITER_INIT(&uiter); +	while ((node = ulist_next(blocks, &uiter))) { +		if (!node->aux) +			continue; +		eie = (struct extent_inode_elem *)node->aux; +		for (; eie; eie = eie_next) { +			eie_next = eie->next; +			kfree(eie); +		} +		node->aux = 0; +	} + +	ulist_free(blocks); +} +  /*   * Finds all leafs with a reference to the specified combination of bytenr and   * offset. key_list_head will point to a list of corresponding keys (caller must @@ -744,7 +975,9 @@ out:   */  static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **leafs) +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **leafs, +				const u64 *extent_item_pos)  {  	struct ulist *tmp;  	int ret; @@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	} -	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); +	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, +				time_seq, *leafs, tmp, extent_item_pos);  	ulist_free(tmp);  	if (ret < 0 && ret != -ENOENT) { -		ulist_free(*leafs); +		free_leaf_list(*leafs);  		return ret;  	} @@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,   */  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **roots) +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **roots)  {  	struct ulist *tmp;  	struct ulist_node *node = NULL; +	struct ulist_iterator uiter;  	int ret;  	tmp = ulist_alloc(GFP_NOFS); @@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	} +	ULIST_ITER_INIT(&uiter);  	while (1) { -		ret = find_parent_nodes(trans, fs_info, bytenr, seq, -					tmp, *roots); +		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, +					time_seq, tmp, *roots, NULL);  		if (ret < 0 && ret != -ENOENT) {  			ulist_free(tmp);  			ulist_free(*roots);  			return ret;  		} -		node = ulist_next(tmp, node); +		node = ulist_next(tmp, &uiter);  		if (!node)  			break;  		bytenr = node->val; @@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,  	return 0;  } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, -				u64 orig_extent_item_objectid, -				u64 extent_item_pos, u64 root, +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, +				u64 root, u64 extent_item_objectid,  				iterate_extent_inodes_t *iterate, void *ctx)  { -	u64 disk_byte; -	struct btrfs_key key; -	struct btrfs_file_extent_item *fi; -	struct extent_buffer *eb; -	int slot; -	int nritems; +	struct extent_inode_elem *eie;  	int ret = 0; -	int extent_type; -	u64 data_offset; -	u64 data_len; - -	eb = read_tree_block(fs_info->tree_root, logical, -				fs_info->tree_root->leafsize, 0); -	if (!eb) -		return -EIO; - -	/* -	 * from the shared data ref, we only have the leaf but we need -	 * the key. thus, we must look into all items and see that we -	 * find one (some) with a reference to our extent item. -	 */ -	nritems = btrfs_header_nritems(eb); -	for (slot = 0; slot < nritems; ++slot) { -		btrfs_item_key_to_cpu(eb, &key, slot); -		if (key.type != BTRFS_EXTENT_DATA_KEY) -			continue; -		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); -		extent_type = btrfs_file_extent_type(eb, fi); -		if (extent_type == BTRFS_FILE_EXTENT_INLINE) -			continue; -		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ -		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); -		if (disk_byte != orig_extent_item_objectid) -			continue; - -		data_offset = btrfs_file_extent_offset(eb, fi); -		data_len = btrfs_file_extent_num_bytes(eb, fi); - -		if (extent_item_pos < data_offset || -		    extent_item_pos >= data_offset + data_len) -			continue; +	for (eie = inode_list; eie; eie = eie->next) {  		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " -				"root %llu\n", orig_extent_item_objectid, -				key.objectid, key.offset, root); -		ret = iterate(key.objectid, -				key.offset + (extent_item_pos - data_offset), -				root, ctx); +			 "root %llu\n", extent_item_objectid, +			 eie->inum, eie->offset, root); +		ret = iterate(eie->inum, eie->offset, root, ctx);  		if (ret) { -			pr_debug("stopping iteration because ret=%d\n", ret); +			pr_debug("stopping iteration for %llu due to ret=%d\n", +				 extent_item_objectid, ret);  			break;  		}  	} -	free_extent_buffer(eb); -  	return ret;  } @@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  	struct ulist *roots = NULL;  	struct ulist_node *ref_node = NULL;  	struct ulist_node *root_node = NULL; -	struct seq_list seq_elem; +	struct seq_list seq_elem = {}; +	struct seq_list tree_mod_seq_elem = {}; +	struct ulist_iterator ref_uiter; +	struct ulist_iterator root_uiter;  	struct btrfs_delayed_ref_root *delayed_refs = NULL;  	pr_debug("resolving all inodes for extent %llu\n", @@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  		spin_lock(&delayed_refs->lock);  		btrfs_get_delayed_seq(delayed_refs, &seq_elem);  		spin_unlock(&delayed_refs->lock); +		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);  	}  	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, -				   extent_item_pos, seq_elem.seq, -				   &refs); - +				   seq_elem.seq, tree_mod_seq_elem.seq, &refs, +				   &extent_item_pos);  	if (ret)  		goto out; -	while (!ret && (ref_node = ulist_next(refs, ref_node))) { -		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, -						seq_elem.seq, &roots); +	ULIST_ITER_INIT(&ref_uiter); +	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { +		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, +						seq_elem.seq, +						tree_mod_seq_elem.seq, &roots);  		if (ret)  			break; -		while (!ret && (root_node = ulist_next(roots, root_node))) { -			pr_debug("root %llu references leaf %llu\n", -					root_node->val, ref_node->val); -			ret = iterate_leaf_refs(fs_info, ref_node->val, -						extent_item_objectid, -						extent_item_pos, root_node->val, -						iterate, ctx); +		ULIST_ITER_INIT(&root_uiter); +		while (!ret && (root_node = ulist_next(roots, &root_uiter))) { +			pr_debug("root %llu references leaf %llu, data list " +				 "%#lx\n", root_node->val, ref_node->val, +				 ref_node->aux); +			ret = iterate_leaf_refs( +				(struct extent_inode_elem *)ref_node->aux, +				root_node->val, extent_item_objectid, +				iterate, ctx);  		} +		ulist_free(roots); +		roots = NULL;  	} -	ulist_free(refs); +	free_leaf_list(refs);  	ulist_free(roots);  out:  	if (!search_commit_root) { +		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);  		btrfs_put_delayed_seq(delayed_refs, &seq_elem);  		btrfs_end_transaction(trans, fs_info->extent_root);  	} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 57ea2e959e4..c18d8ac7b79 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 num_bytes, u64 seq, struct ulist **roots); +				u64 delayed_ref_seq, u64 time_seq, +				struct ulist **roots);  struct btrfs_data_container *init_data_container(u32 total_bytes);  struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 99fcad631a2..d7a96cfdc50 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@  #include <linux/sched.h>  #include <linux/slab.h> +#include <linux/rbtree.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h" @@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *dst_buf,  			      struct extent_buffer *src_buf);  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		   struct btrfs_path *path, int level, int slot); +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, +					  u32 blocksize, u64 parent_transid, +					  u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, +						u64 bytenr, u32 blocksize, +						u64 time_seq);  struct btrfs_path *btrfs_alloc_path(void)  { @@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,  				     new_root_objectid, &disk_key, level, -				     buf->start, 0, 1); +				     buf->start, 0);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	return 0;  } +enum mod_log_op { +	MOD_LOG_KEY_REPLACE, +	MOD_LOG_KEY_ADD, +	MOD_LOG_KEY_REMOVE, +	MOD_LOG_KEY_REMOVE_WHILE_FREEING, +	MOD_LOG_KEY_REMOVE_WHILE_MOVING, +	MOD_LOG_MOVE_KEYS, +	MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { +	int dst_slot; +	int nr_items; +}; + +struct tree_mod_root { +	u64 logical; +	u8 level; +}; + +struct tree_mod_elem { +	struct rb_node node; +	u64 index;		/* shifted logical */ +	struct seq_list elem; +	enum mod_log_op op; + +	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ +	int slot; + +	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ +	u64 generation; + +	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ +	struct btrfs_disk_key key; +	u64 blockptr; + +	/* this is used for op == MOD_LOG_MOVE_KEYS */ +	struct tree_mod_move move; + +	/* this is used for op == MOD_LOG_ROOT_REPLACE */ +	struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ +	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); +	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	elem->flags = 1; +	spin_lock(&fs_info->tree_mod_seq_lock); +	__get_tree_mod_seq(fs_info, elem); +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct rb_node *next; +	struct seq_list *cur_elem; +	struct tree_mod_elem *tm; +	u64 min_seq = (u64)-1; +	u64 seq_putting = elem->seq; + +	if (!seq_putting) +		return; + +	BUG_ON(!(elem->flags & 1)); +	spin_lock(&fs_info->tree_mod_seq_lock); +	list_del(&elem->list); + +	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { +		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { +			if (seq_putting > cur_elem->seq) { +				/* +				 * blocker with lower sequence number exists, we +				 * cannot remove anything from the log +				 */ +				goto out; +			} +			min_seq = cur_elem->seq; +		} +	} + +	/* +	 * anything that's lower than the lowest existing (read: blocked) +	 * sequence number can be removed from the tree. +	 */ +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	for (node = rb_first(tm_root); node; node = next) { +		next = rb_next(node); +		tm = container_of(node, struct tree_mod_elem, node); +		if (tm->elem.seq > min_seq) +			continue; +		rb_erase(node, tm_root); +		list_del(&tm->elem.list); +		kfree(tm); +	} +	write_unlock(&fs_info->tree_mod_log_lock); +out: +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + *       index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ +	struct rb_root *tm_root; +	struct rb_node **new; +	struct rb_node *parent = NULL; +	struct tree_mod_elem *cur; +	int ret = 0; + +	BUG_ON(!tm || !tm->elem.seq); + +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	new = &tm_root->rb_node; +	while (*new) { +		cur = container_of(*new, struct tree_mod_elem, node); +		parent = *new; +		if (cur->index < tm->index) +			new = &((*new)->rb_left); +		else if (cur->index > tm->index) +			new = &((*new)->rb_right); +		else if (cur->elem.seq < tm->elem.seq) +			new = &((*new)->rb_left); +		else if (cur->elem.seq > tm->elem.seq) +			new = &((*new)->rb_right); +		else { +			kfree(tm); +			ret = -EEXIST; +			goto unlock; +		} +	} + +	rb_link_node(&tm->node, parent, new); +	rb_insert_color(&tm->node, tm_root); +unlock: +	write_unlock(&fs_info->tree_mod_log_lock); +	return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, +				    struct extent_buffer *eb) { +	smp_mb(); +	if (list_empty(&(fs_info)->tree_mod_seq_list)) +		return 1; +	if (!eb) +		return 0; +	if (btrfs_header_level(eb) == 0) +		return 1; +	return 0; +} + +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, +				 struct tree_mod_elem **tm_ret) +{ +	struct tree_mod_elem *tm; +	int seq; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return 0; + +	tm = *tm_ret = kzalloc(sizeof(*tm), flags); +	if (!tm) +		return -ENOMEM; + +	tm->elem.flags = 0; +	spin_lock(&fs_info->tree_mod_seq_lock); +	if (list_empty(&fs_info->tree_mod_seq_list)) { +		/* +		 * someone emptied the list while we were waiting for the lock. +		 * we must not add to the list, because no blocker exists. items +		 * are removed from the list only when the existing blocker is +		 * removed from the list. +		 */ +		kfree(tm); +		seq = 0; +	} else { +		__get_tree_mod_seq(fs_info, &tm->elem); +		seq = tm->elem.seq; +	} +	spin_unlock(&fs_info->tree_mod_seq_lock); + +	return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, +			     struct extent_buffer *eb, int slot, +			     enum mod_log_op op, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	if (op != MOD_LOG_KEY_ADD) { +		btrfs_node_key(eb, &tm->key, slot); +		tm->blockptr = btrfs_node_blockptr(eb, slot); +	} +	tm->op = op; +	tm->slot = slot; +	tm->generation = btrfs_node_ptr_generation(eb, slot); + +	return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +			int slot, enum mod_log_op op) +{ +	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *eb, int dst_slot, int src_slot, +			 int nr_items, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, eb)) +		return 0; + +	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, +					      MOD_LOG_KEY_REMOVE_WHILE_MOVING); +		BUG_ON(ret < 0); +	} + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	tm->slot = src_slot; +	tm->move.dst_slot = dst_slot; +	tm->move.nr_items = nr_items; +	tm->op = MOD_LOG_MOVE_KEYS; + +	return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *old_root, +			 struct extent_buffer *new_root, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = new_root->start >> PAGE_CACHE_SHIFT; +	tm->old_root.logical = old_root->start; +	tm->old_root.level = btrfs_header_level(old_root); +	tm->generation = btrfs_header_generation(old_root); +	tm->op = MOD_LOG_ROOT_REPLACE; + +	return __tree_mod_log_insert(fs_info, tm); +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, +		      int smallest) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct tree_mod_elem *cur = NULL; +	struct tree_mod_elem *found = NULL; +	u64 index = start >> PAGE_CACHE_SHIFT; + +	read_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	node = tm_root->rb_node; +	while (node) { +		cur = container_of(node, struct tree_mod_elem, node); +		if (cur->index < index) { +			node = node->rb_left; +		} else if (cur->index > index) { +			node = node->rb_right; +		} else if (cur->elem.seq < min_seq) { +			node = node->rb_left; +		} else if (!smallest) { +			/* we want the node with the highest seq */ +			if (found) +				BUG_ON(found->elem.seq > cur->elem.seq); +			found = cur; +			node = node->rb_left; +		} else if (cur->elem.seq > min_seq) { +			/* we want the node with the smallest seq */ +			if (found) +				BUG_ON(found->elem.seq < cur->elem.seq); +			found = cur; +			node = node->rb_right; +		} else { +			found = cur; +			break; +		} +	} +	read_unlock(&fs_info->tree_mod_log_lock); + +	return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, +			   u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     struct extent_buffer *src, unsigned long dst_offset, +		     unsigned long src_offset, int nr_items) +{ +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return; + +	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) +		return; + +	/* speed this up by single seq for all operations? */ +	for (i = 0; i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0); +		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     int dst_offset, int src_offset, int nr_items) +{ +	int ret; +	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, +				       nr_items, GFP_NOFS); +	BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, +			  struct extent_buffer *eb, +			  struct btrfs_disk_key *disk_key, int slot, int atomic) +{ +	int ret; + +	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, +					   MOD_LOG_KEY_REPLACE, +					   atomic ? GFP_ATOMIC : GFP_NOFS); +	BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb) +{ +	int i; +	int ret; +	u32 nritems; + +	if (tree_mod_dont_log(fs_info, eb)) +		return; + +	nritems = btrfs_header_nritems(eb); +	for (i = nritems - 1; i >= 0; i--) { +		ret = tree_mod_log_insert_key(fs_info, eb, i, +					      MOD_LOG_KEY_REMOVE_WHILE_FREEING); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, +			      struct extent_buffer *new_root_node) +{ +	int ret; +	tree_mod_log_free_eb(root->fs_info, root->node); +	ret = tree_mod_log_insert_root(root->fs_info, root->node, +				       new_root_node, GFP_NOFS); +	BUG_ON(ret < 0); +} +  /*   * check if the tree block can be shared by multiple trees   */ @@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,  			ret = btrfs_dec_ref(trans, root, buf, 1, 1);  			BUG_ON(ret); /* -ENOMEM */  		} +		/* +		 * don't log freeing in case we're freeing the root node, this +		 * is done by tree_mod_log_set_root_pointer later +		 */ +		if (buf != root->node && btrfs_header_level(buf) != 0) +			tree_mod_log_free_eb(root->fs_info, buf);  		clean_tree_block(trans, root, buf);  		*last_ref = 1;  	} @@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,  				     root->root_key.objectid, &disk_key, -				     level, search_start, empty_size, 1); +				     level, search_start, empty_size);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		extent_buffer_get(cow); +		tree_mod_log_set_root_pointer(root, cow);  		rcu_assign_pointer(root->node, cow);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  		free_extent_buffer(buf);  		add_root_to_dirty_list(root);  	} else { @@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		WARN_ON(trans->transid != btrfs_header_generation(parent)); +		tree_mod_log_insert_key(root->fs_info, parent, parent_slot, +					MOD_LOG_KEY_REPLACE);  		btrfs_set_node_blockptr(parent, parent_slot,  					cow->start);  		btrfs_set_node_ptr_generation(parent, parent_slot,  					      trans->transid);  		btrfs_mark_buffer_dirty(parent);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  	}  	if (unlock_orig)  		btrfs_tree_unlock(buf); @@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	return 0;  } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, +			   struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct tree_mod_elem *found = NULL; +	u64 root_logical = root->node->start; +	int looped = 0; + +	if (!time_seq) +		return 0; + +	/* +	 * the very last operation that's logged for a root is the replacement +	 * operation (if it is replaced at all). this has the index of the *new* +	 * root, making it the very first operation that's logged for this root. +	 */ +	while (1) { +		tm = tree_mod_log_search_oldest(fs_info, root_logical, +						time_seq); +		if (!looped && !tm) +			return 0; +		/* +		 * we must have key remove operations in the log before the +		 * replace operation. +		 */ +		BUG_ON(!tm); + +		if (tm->op != MOD_LOG_ROOT_REPLACE) +			break; + +		found = tm; +		root_logical = tm->old_root.logical; +		BUG_ON(root_logical == root->node->start); +		looped = 1; +	} + +	return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, +		      struct tree_mod_elem *first_tm) +{ +	u32 n; +	struct rb_node *next; +	struct tree_mod_elem *tm = first_tm; +	unsigned long o_dst; +	unsigned long o_src; +	unsigned long p_size = sizeof(struct btrfs_key_ptr); + +	n = btrfs_header_nritems(eb); +	while (tm && tm->elem.seq >= time_seq) { +		/* +		 * all the operations are recorded with the operator used for +		 * the modification. as we're going backwards, we do the +		 * opposite of each operation here. +		 */ +		switch (tm->op) { +		case MOD_LOG_KEY_REMOVE_WHILE_FREEING: +			BUG_ON(tm->slot < n); +		case MOD_LOG_KEY_REMOVE_WHILE_MOVING: +		case MOD_LOG_KEY_REMOVE: +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			n++; +			break; +		case MOD_LOG_KEY_REPLACE: +			BUG_ON(tm->slot >= n); +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			break; +		case MOD_LOG_KEY_ADD: +			if (tm->slot != n - 1) { +				o_dst = btrfs_node_key_ptr_offset(tm->slot); +				o_src = btrfs_node_key_ptr_offset(tm->slot + 1); +				memmove_extent_buffer(eb, o_dst, o_src, p_size); +			} +			n--; +			break; +		case MOD_LOG_MOVE_KEYS: +			o_dst = btrfs_node_key_ptr_offset(tm->slot); +			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); +			memmove_extent_buffer(eb, o_dst, o_src, +					      tm->move.nr_items * p_size); +			break; +		case MOD_LOG_ROOT_REPLACE: +			/* +			 * this operation is special. for roots, this must be +			 * handled explicitly before rewinding. +			 * for non-roots, this operation may exist if the node +			 * was a root: root A -> child B; then A gets empty and +			 * B is promoted to the new root. in the mod log, we'll +			 * have a root-replace operation for B, a tree block +			 * that is no root. we simply ignore that operation. +			 */ +			break; +		} +		next = rb_next(&tm->node); +		if (!next) +			break; +		tm = container_of(next, struct tree_mod_elem, node); +		if (tm->index != first_tm->index) +			break; +	} +	btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +		    u64 time_seq) +{ +	struct extent_buffer *eb_rewin; +	struct tree_mod_elem *tm; + +	if (!time_seq) +		return eb; + +	if (btrfs_header_level(eb) == 0) +		return eb; + +	tm = tree_mod_log_search(fs_info, eb->start, time_seq); +	if (!tm) +		return eb; + +	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { +		BUG_ON(tm->slot != 0); +		eb_rewin = alloc_dummy_extent_buffer(eb->start, +						fs_info->tree_root->nodesize); +		BUG_ON(!eb_rewin); +		btrfs_set_header_bytenr(eb_rewin, eb->start); +		btrfs_set_header_backref_rev(eb_rewin, +					     btrfs_header_backref_rev(eb)); +		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); +		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); +	} else { +		eb_rewin = btrfs_clone_extent_buffer(eb); +		BUG_ON(!eb_rewin); +	} + +	extent_buffer_get(eb_rewin); +	free_extent_buffer(eb); + +	__tree_mod_log_rewind(eb_rewin, time_seq, tm); + +	return eb_rewin; +} + +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct extent_buffer *eb; +	struct tree_mod_root *old_root; +	u64 old_generation; + +	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); +	if (!tm) +		return root->node; + +	old_root = &tm->old_root; +	old_generation = tm->generation; + +	tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); +	/* +	 * there was an item in the log when __tree_mod_log_oldest_root +	 * returned. this one must not go away, because the time_seq passed to +	 * us must be blocking its removal. +	 */ +	BUG_ON(!tm); + +	if (old_root->logical == root->node->start) { +		/* there are logged operations for the current root */ +		eb = btrfs_clone_extent_buffer(root->node); +	} else { +		/* there's a root replace operation for the current root */ +		eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, +					       root->nodesize); +		btrfs_set_header_bytenr(eb, eb->start); +		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); +		btrfs_set_header_owner(eb, root->root_key.objectid); +	} +	if (!eb) +		return NULL; +	btrfs_set_header_level(eb, old_root->level); +	btrfs_set_header_generation(eb, old_generation); +	__tree_mod_log_rewind(eb, time_seq, tm); + +	return eb; +} +  static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf) @@ -976,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  			goto enospc;  		} +		tree_mod_log_set_root_pointer(root, child);  		rcu_assign_pointer(root->node, child);  		add_root_to_dirty_list(root); @@ -989,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		free_extent_buffer(mid);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		/* once for the root ptr */  		free_extent_buffer_stale(mid);  		return 0; @@ -1042,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		if (btrfs_header_nritems(right) == 0) {  			clean_tree_block(trans, root, right);  			btrfs_tree_unlock(right); -			del_ptr(trans, root, path, level + 1, pslot + 1); +			del_ptr(trans, root, path, level + 1, pslot + 1, 1);  			root_sub_used(root, right->len); -			btrfs_free_tree_block(trans, root, right, 0, 1, 0); +			btrfs_free_tree_block(trans, root, right, 0, 1);  			free_extent_buffer_stale(right);  			right = NULL;  		} else {  			struct btrfs_disk_key right_key;  			btrfs_node_key(right, &right_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &right_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &right_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent);  		} @@ -1084,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	if (btrfs_header_nritems(mid) == 0) {  		clean_tree_block(trans, root, mid);  		btrfs_tree_unlock(mid); -		del_ptr(trans, root, path, level + 1, pslot); +		del_ptr(trans, root, path, level + 1, pslot, 1);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		free_extent_buffer_stale(mid);  		mid = NULL;  	} else {  		/* update the parent key to reflect our changes */  		struct btrfs_disk_key mid_key;  		btrfs_node_key(mid, &mid_key, 0); +		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, +					  pslot, 0);  		btrfs_set_node_key(parent, &mid_key, pslot);  		btrfs_mark_buffer_dirty(parent);  	} @@ -1190,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			orig_slot += left_nr;  			btrfs_node_key(mid, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot, 0);  			btrfs_set_node_key(parent, &disk_key, pslot);  			btrfs_mark_buffer_dirty(parent);  			if (btrfs_header_nritems(left) > orig_slot) { @@ -1241,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			btrfs_node_key(right, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &disk_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent); @@ -1498,7 +2158,7 @@ static int  read_block_for_search(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct btrfs_path *p,  		       struct extent_buffer **eb_ret, int level, int slot, -		       struct btrfs_key *key) +		       struct btrfs_key *key, u64 time_seq)  {  	u64 blocknr;  	u64 gen; @@ -1852,7 +2512,7 @@ cow_done:  			}  			err = read_block_for_search(trans, root, p, -						    &b, level, slot, key); +						    &b, level, slot, key, 0);  			if (err == -EAGAIN)  				goto again;  			if (err) { @@ -1924,6 +2584,115 @@ done:  }  /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +			  struct btrfs_path *p, u64 time_seq) +{ +	struct extent_buffer *b; +	int slot; +	int ret; +	int err; +	int level; +	int lowest_unlock = 1; +	u8 lowest_level = 0; + +	lowest_level = p->lowest_level; +	WARN_ON(p->nodes[0] != NULL); + +	if (p->search_commit_root) { +		BUG_ON(time_seq); +		return btrfs_search_slot(NULL, root, key, p, 0, 0); +	} + +again: +	b = get_old_root(root, time_seq); +	extent_buffer_get(b); +	level = btrfs_header_level(b); +	btrfs_tree_read_lock(b); +	p->locks[level] = BTRFS_READ_LOCK; + +	while (b) { +		level = btrfs_header_level(b); +		p->nodes[level] = b; +		btrfs_clear_path_blocking(p, NULL, 0); + +		/* +		 * we have a lock on b and as long as we aren't changing +		 * the tree, there is no way to for the items in b to change. +		 * It is safe to drop the lock on our parent before we +		 * go through the expensive btree search on b. +		 */ +		btrfs_unlock_up_safe(p, level + 1); + +		ret = bin_search(b, key, level, &slot); + +		if (level != 0) { +			int dec = 0; +			if (ret && slot > 0) { +				dec = 1; +				slot -= 1; +			} +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); + +			if (level == lowest_level) { +				if (dec) +					p->slots[level]++; +				goto done; +			} + +			err = read_block_for_search(NULL, root, p, &b, level, +						    slot, key, time_seq); +			if (err == -EAGAIN) +				goto again; +			if (err) { +				ret = err; +				goto done; +			} + +			level = btrfs_header_level(b); +			err = btrfs_try_tree_read_lock(b); +			if (!err) { +				btrfs_set_path_blocking(p); +				btrfs_tree_read_lock(b); +				btrfs_clear_path_blocking(p, b, +							  BTRFS_READ_LOCK); +			} +			p->locks[level] = BTRFS_READ_LOCK; +			p->nodes[level] = b; +			b = tree_mod_log_rewind(root->fs_info, b, time_seq); +			if (b != p->nodes[level]) { +				btrfs_tree_unlock_rw(p->nodes[level], +						     p->locks[level]); +				p->locks[level] = 0; +				p->nodes[level] = b; +			} +		} else { +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); +			goto done; +		} +	} +	ret = 1; +done: +	if (!p->leave_spinning) +		btrfs_set_path_blocking(p); +	if (ret < 0) +		btrfs_release_path(p); + +	return ret; +} + +/*   * adjust the pointers going up the tree, starting at level   * making sure the right key of each node is points to 'key'.   * This is used after shifting pointers to the left, so it stops @@ -1943,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,  		if (!path->nodes[i])  			break;  		t = path->nodes[i]; +		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);  		btrfs_set_node_key(t, key, tslot);  		btrfs_mark_buffer_dirty(path->nodes[i]);  		if (tslot != 0) @@ -2025,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,  	} else  		push_items = min(src_nritems - 8, push_items); +	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, +			     push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(dst_nritems),  			   btrfs_node_key_ptr_offset(0),  			   push_items * sizeof(struct btrfs_key_ptr));  	if (push_items < src_nritems) { +		tree_mod_log_eb_move(root->fs_info, src, 0, push_items, +				     src_nritems - push_items);  		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),  				      btrfs_node_key_ptr_offset(push_items),  				      (src_nritems - push_items) * @@ -2084,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  	if (max_push < push_items)  		push_items = max_push; +	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);  	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),  				      btrfs_node_key_ptr_offset(0),  				      (dst_nritems) *  				      sizeof(struct btrfs_key_ptr)); +	tree_mod_log_eb_copy(root->fs_info, dst, src, 0, +			     src_nritems - push_items, push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2131,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  				   root->root_key.objectid, &lower_key, -				   level, root->node->start, 0, 0); +				   level, root->node->start, 0);  	if (IS_ERR(c))  		return PTR_ERR(c); @@ -2163,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(c);  	old = root->node; +	tree_mod_log_set_root_pointer(root, c);  	rcu_assign_pointer(root->node, c);  	/* the super has an extra ref to root->node */ @@ -2186,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  static void insert_ptr(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct btrfs_path *path,  		       struct btrfs_disk_key *key, u64 bytenr, -		       int slot, int level) +		       int slot, int level, int tree_mod_log)  {  	struct extent_buffer *lower;  	int nritems; +	int ret;  	BUG_ON(!path->nodes[level]);  	btrfs_assert_tree_locked(path->nodes[level]); @@ -2198,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  	BUG_ON(slot > nritems);  	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));  	if (slot != nritems) { +		if (tree_mod_log && level) +			tree_mod_log_eb_move(root->fs_info, lower, slot + 1, +					     slot, nritems - slot);  		memmove_extent_buffer(lower,  			      btrfs_node_key_ptr_offset(slot + 1),  			      btrfs_node_key_ptr_offset(slot),  			      (nritems - slot) * sizeof(struct btrfs_key_ptr));  	} +	if (tree_mod_log && level) { +		ret = tree_mod_log_insert_key(root->fs_info, lower, slot, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	}  	btrfs_set_node_key(lower, key, slot);  	btrfs_set_node_blockptr(lower, slot, bytenr);  	WARN_ON(trans->transid == 0); @@ -2254,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  					root->root_key.objectid, -					&disk_key, level, c->start, 0, 0); +					&disk_key, level, c->start, 0);  	if (IS_ERR(split))  		return PTR_ERR(split); @@ -2273,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  			    (unsigned long)btrfs_header_chunk_tree_uuid(split),  			    BTRFS_UUID_SIZE); - +	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);  	copy_extent_buffer(split, c,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(mid), @@ -2286,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(split);  	insert_ptr(trans, root, path, &disk_key, split->start, -		   path->slots[level + 1] + 1, level + 1); +		   path->slots[level + 1] + 1, level + 1, 1);  	if (path->slots[level] >= mid) {  		path->slots[level] -= mid; @@ -2823,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,  	btrfs_set_header_nritems(l, mid);  	btrfs_item_key(right, &disk_key, 0);  	insert_ptr(trans, root, path, &disk_key, right->start, -		   path->slots[1] + 1, 1); +		   path->slots[1] + 1, 1, 0);  	btrfs_mark_buffer_dirty(right);  	btrfs_mark_buffer_dirty(l); @@ -3006,7 +3793,7 @@ again:  	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  					root->root_key.objectid, -					&disk_key, 0, l->start, 0, 0); +					&disk_key, 0, l->start, 0);  	if (IS_ERR(right))  		return PTR_ERR(right); @@ -3030,7 +3817,7 @@ again:  		if (mid <= slot) {  			btrfs_set_header_nritems(right, 0);  			insert_ptr(trans, root, path, &disk_key, right->start, -				   path->slots[1] + 1, 1); +				   path->slots[1] + 1, 1, 0);  			btrfs_tree_unlock(path->nodes[0]);  			free_extent_buffer(path->nodes[0]);  			path->nodes[0] = right; @@ -3039,7 +3826,7 @@ again:  		} else {  			btrfs_set_header_nritems(right, 0);  			insert_ptr(trans, root, path, &disk_key, right->start, -					  path->slots[1], 1); +					  path->slots[1], 1, 0);  			btrfs_tree_unlock(path->nodes[0]);  			free_extent_buffer(path->nodes[0]);  			path->nodes[0] = right; @@ -3751,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root   * empty a node.   */  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct btrfs_path *path, int level, int slot) +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log)  {  	struct extent_buffer *parent = path->nodes[level];  	u32 nritems; +	int ret;  	nritems = btrfs_header_nritems(parent);  	if (slot != nritems - 1) { +		if (tree_mod_log && level) +			tree_mod_log_eb_move(root->fs_info, parent, slot, +					     slot + 1, nritems - slot - 1);  		memmove_extent_buffer(parent,  			      btrfs_node_key_ptr_offset(slot),  			      btrfs_node_key_ptr_offset(slot + 1),  			      sizeof(struct btrfs_key_ptr) *  			      (nritems - slot - 1)); +	} else if (tree_mod_log && level) { +		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0);  	} +  	nritems--;  	btrfs_set_header_nritems(parent, nritems);  	if (nritems == 0 && parent == root->node) { @@ -3795,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  				    struct extent_buffer *leaf)  {  	WARN_ON(btrfs_header_generation(leaf) != trans->transid); -	del_ptr(trans, root, path, 1, path->slots[1]); +	del_ptr(trans, root, path, 1, path->slots[1], 1);  	/*  	 * btrfs_free_extent is expensive, we want to make sure we @@ -3806,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  	root_sub_used(root, leaf->len);  	extent_buffer_get(leaf); -	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); +	btrfs_free_tree_block(trans, root, leaf, 0, 1);  	free_extent_buffer_stale(leaf);  }  /* @@ -4273,7 +5070,7 @@ again:  		next = c;  		next_rw_lock = path->locks[level];  		ret = read_block_for_search(NULL, root, path, &next, level, -					    slot, &key); +					    slot, &key, 0);  		if (ret == -EAGAIN)  			goto again; @@ -4310,7 +5107,7 @@ again:  			break;  		ret = read_block_for_search(NULL, root, path, &next, level, -					    0, &key); +					    0, &key, 0);  		if (ret == -EAGAIN)  			goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1c665ebe47e..0151ca1ac65 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1140,6 +1140,15 @@ struct btrfs_fs_info {  	spinlock_t delayed_iput_lock;  	struct list_head delayed_iputs; +	/* this protects tree_mod_seq_list */ +	spinlock_t tree_mod_seq_lock; +	atomic_t tree_mod_seq; +	struct list_head tree_mod_seq_list; + +	/* this protects tree_mod_log */ +	rwlock_t tree_mod_log_lock; +	struct rb_root tree_mod_log; +  	atomic_t nr_async_submits;  	atomic_t async_submit_draining;  	atomic_t nr_async_bios; @@ -2537,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					struct btrfs_root *root, u32 blocksize,  					u64 parent, u64 root_objectid,  					struct btrfs_disk_key *key, int level, -					u64 hint, u64 empty_size, int for_cow); +					u64 hint, u64 empty_size);  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   u64 parent, int last_ref, int for_cow); +			   u64 parent, int last_ref);  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,  					    struct btrfs_root *root,  					    u64 bytenr, u32 blocksize, @@ -2700,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root  		      *root, struct btrfs_key *key, struct btrfs_path *p, int  		      ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +			  struct btrfs_path *p, u64 time_seq);  int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct extent_buffer *parent,  		       int start_slot, int cache_only, u64 *last_ret, @@ -3139,4 +3150,23 @@ void btrfs_reada_detach(void *handle);  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,  			 u64 start, int err); +/* delayed seq elem */ +struct seq_list { +	struct list_head list; +	u64 seq; +	u32 flags; +}; + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem); + +static inline int is_fstree(u64 rootid) +{ +	if (rootid == BTRFS_FS_TREE_OBJECTID || +	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) +		return 1; +	return 0; +}  #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 69f22e3ab3b..13ae7b04790 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	ref->is_head = 0;  	ref->in_tree = 1; -	if (need_ref_seq(for_cow, ref_root)) +	if (is_fstree(ref_root))  		seq = inc_delayed_seq(delayed_refs);  	ref->seq = seq; @@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	ref->is_head = 0;  	ref->in_tree = 1; -	if (need_ref_seq(for_cow, ref_root)) +	if (is_fstree(ref_root))  		seq = inc_delayed_seq(delayed_refs);  	ref->seq = seq; @@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,  				   num_bytes, parent, ref_root, level, action,  				   for_cow); -	if (!need_ref_seq(for_cow, ref_root) && +	if (!is_fstree(ref_root) &&  	    waitqueue_active(&delayed_refs->seq_wait))  		wake_up(&delayed_refs->seq_wait);  	spin_unlock(&delayed_refs->lock); +  	return 0;  } @@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,  				   num_bytes, parent, ref_root, owner, offset,  				   action, for_cow); -	if (!need_ref_seq(for_cow, ref_root) && +	if (!is_fstree(ref_root) &&  	    waitqueue_active(&delayed_refs->seq_wait))  		wake_up(&delayed_refs->seq_wait);  	spin_unlock(&delayed_refs->lock); +  	return 0;  } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d8f244d9492..413927fb995 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,  int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,  			   struct list_head *cluster, u64 search_start); -struct seq_list { -	struct list_head list; -	u64 seq; -}; -  static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)  {  	assert_spin_locked(&delayed_refs->lock); @@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,  			    u64 seq);  /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ -	if (for_cow) -		return 0; - -	if (rootid == BTRFS_FS_TREE_OBJECTID) -		return 1; - -	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) -		return 1; - -	return 0; -} - -/*   * a node might live in a head or a regular ref, this lets you   * test for the proper type to use.   */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0d49e21b0b..b99d5127ba1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,  	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  				      BTRFS_TREE_LOG_OBJECTID, NULL, -				      0, 0, 0, 0); +				      0, 0, 0);  	if (IS_ERR(leaf)) {  		kfree(root);  		return ERR_CAST(leaf); @@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->delayed_iput_lock);  	spin_lock_init(&fs_info->defrag_inodes_lock);  	spin_lock_init(&fs_info->free_chunk_lock); +	spin_lock_init(&fs_info->tree_mod_seq_lock); +	rwlock_init(&fs_info->tree_mod_log_lock);  	mutex_init(&fs_info->reloc_mutex);  	init_completion(&fs_info->kobj_unregister);  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);  	INIT_LIST_HEAD(&fs_info->space_info); +	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);  	btrfs_mapping_init(&fs_info->mapping_tree);  	btrfs_init_block_rsv(&fs_info->global_block_rsv);  	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); @@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,  	atomic_set(&fs_info->async_submit_draining, 0);  	atomic_set(&fs_info->nr_async_bios, 0);  	atomic_set(&fs_info->defrag_running, 0); +	atomic_set(&fs_info->tree_mod_seq, 0);  	fs_info->sb = sb;  	fs_info->max_inline = 8192 * 1024;  	fs_info->metadata_ratio = 0;  	fs_info->defrag_inodes = RB_ROOT;  	fs_info->trans_no_join = 0;  	fs_info->free_chunk_space = 0; +	fs_info->tree_mod_log = RB_ROOT;  	/* readahead state */  	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1902726fa70..4b5a1e1bdef 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5218,7 +5218,7 @@ out:  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   u64 parent, int last_ref, int for_cow) +			   u64 parent, int last_ref)  {  	struct btrfs_block_group_cache *cache = NULL;  	int ret; @@ -5228,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  					buf->start, buf->len,  					parent, root->root_key.objectid,  					btrfs_header_level(buf), -					BTRFS_DROP_DELAYED_REF, NULL, for_cow); +					BTRFS_DROP_DELAYED_REF, NULL, 0);  		BUG_ON(ret); /* -ENOMEM */  	} @@ -6250,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					struct btrfs_root *root, u32 blocksize,  					u64 parent, u64 root_objectid,  					struct btrfs_disk_key *key, int level, -					u64 hint, u64 empty_size, int for_cow) +					u64 hint, u64 empty_size)  {  	struct btrfs_key ins;  	struct btrfs_block_rsv *block_rsv; @@ -6298,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  					ins.objectid,  					ins.offset, parent, root_objectid,  					level, BTRFS_ADD_DELAYED_EXTENT, -					extent_op, for_cow); +					extent_op, 0);  		BUG_ON(ret); /* -ENOMEM */  	}  	return buf; @@ -6716,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,  			       btrfs_header_owner(path->nodes[level + 1]));  	} -	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); +	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);  out:  	wc->refs[level] = 0;  	wc->flags[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b3692c1373a..2c8f7b20461 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3924,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	eb->start = start;  	eb->len = len;  	eb->tree = tree; +	eb->bflags = 0;  	rwlock_init(&eb->lock);  	atomic_set(&eb->write_locks, 0);  	atomic_set(&eb->read_locks, 0); @@ -3961,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  	return eb;  } +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ +	unsigned long i; +	struct page *p; +	struct extent_buffer *new; +	unsigned long num_pages = num_extent_pages(src->start, src->len); + +	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); +	if (new == NULL) +		return NULL; + +	for (i = 0; i < num_pages; i++) { +		p = alloc_page(GFP_ATOMIC); +		BUG_ON(!p); +		attach_extent_buffer_page(new, p); +		WARN_ON(PageDirty(p)); +		SetPageUptodate(p); +		new->pages[i] = p; +	} + +	copy_extent_buffer(new, src, 0, 0, src->len); +	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); +	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + +	return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ +	struct extent_buffer *eb; +	unsigned long num_pages = num_extent_pages(0, len); +	unsigned long i; + +	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); +	if (!eb) +		return NULL; + +	for (i = 0; i < num_pages; i++) { +		eb->pages[i] = alloc_page(GFP_ATOMIC); +		if (!eb->pages[i]) +			goto err; +	} +	set_extent_buffer_uptodate(eb); +	btrfs_set_header_nritems(eb, 0); +	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + +	return eb; +err: +	for (i--; i > 0; i--) +		__free_page(eb->pages[i]); +	__free_extent_buffer(eb); +	return NULL; +} +  static int extent_buffer_under_io(struct extent_buffer *eb)  {  	return (atomic_read(&eb->io_pages) || @@ -3977,6 +4032,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,  	unsigned long index;  	unsigned long num_pages;  	struct page *page; +	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);  	BUG_ON(extent_buffer_under_io(eb)); @@ -3988,7 +4044,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,  	do {  		index--;  		page = extent_buffer_page(eb, index); -		if (page) { +		if (page && mapped) {  			spin_lock(&page->mapping->private_lock);  			/*  			 * We do this since we'll remove the pages after we've @@ -4013,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,  			}  			spin_unlock(&page->mapping->private_lock); +		} +		if (page) {  			/* One for when we alloced the page */  			page_cache_release(page);  		} @@ -4231,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)  {  	WARN_ON(atomic_read(&eb->refs) == 0);  	if (atomic_dec_and_test(&eb->refs)) { -		struct extent_io_tree *tree = eb->tree; +		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { +			spin_unlock(&eb->refs_lock); +		} else { +			struct extent_io_tree *tree = eb->tree; -		spin_unlock(&eb->refs_lock); +			spin_unlock(&eb->refs_lock); -		spin_lock(&tree->buffer_lock); -		radix_tree_delete(&tree->buffer, -				  eb->start >> PAGE_CACHE_SHIFT); -		spin_unlock(&tree->buffer_lock); +			spin_lock(&tree->buffer_lock); +			radix_tree_delete(&tree->buffer, +					  eb->start >> PAGE_CACHE_SHIFT); +			spin_unlock(&tree->buffer_lock); +		}  		/* Should be safe to release our pages at this point */  		btrfs_release_extent_buffer_page(eb, 0); @@ -4256,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)  	spin_lock(&eb->refs_lock);  	if (atomic_read(&eb->refs) == 2 && +	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) +		atomic_dec(&eb->refs); + +	if (atomic_read(&eb->refs) == 2 &&  	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&  	    !extent_buffer_under_io(eb) &&  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4d8124b6457..25900af5b15 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,6 +39,7 @@  #define EXTENT_BUFFER_STALE 6  #define EXTENT_BUFFER_WRITEBACK 7  #define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9  /* these are flags for extent_clear_unlock_delalloc */  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -264,6 +265,8 @@ void set_page_extent_mapped(struct page *page);  struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  					  u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,  					 u64 start, unsigned long len);  void free_extent_buffer(struct extent_buffer *eb); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f8c354c4c7..24b776c08d9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -368,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,  		return PTR_ERR(trans);  	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, -				      0, objectid, NULL, 0, 0, 0, 0); +				      0, objectid, NULL, 0, 0, 0);  	if (IS_ERR(leaf)) {  		ret = PTR_ERR(leaf);  		goto fail; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 82b03afcbd9..1791c6e3d83 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -56,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)  static noinline int join_transaction(struct btrfs_root *root, int nofail)  {  	struct btrfs_transaction *cur_trans; +	struct btrfs_fs_info *fs_info = root->fs_info; -	spin_lock(&root->fs_info->trans_lock); +	spin_lock(&fs_info->trans_lock);  loop:  	/* The file system has been taken offline. No new transactions. */ -	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { -		spin_unlock(&root->fs_info->trans_lock); +	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { +		spin_unlock(&fs_info->trans_lock);  		return -EROFS;  	} -	if (root->fs_info->trans_no_join) { +	if (fs_info->trans_no_join) {  		if (!nofail) { -			spin_unlock(&root->fs_info->trans_lock); +			spin_unlock(&fs_info->trans_lock);  			return -EBUSY;  		}  	} -	cur_trans = root->fs_info->running_transaction; +	cur_trans = fs_info->running_transaction;  	if (cur_trans) {  		if (cur_trans->aborted) { -			spin_unlock(&root->fs_info->trans_lock); +			spin_unlock(&fs_info->trans_lock);  			return cur_trans->aborted;  		}  		atomic_inc(&cur_trans->use_count);  		atomic_inc(&cur_trans->num_writers);  		cur_trans->num_joined++; -		spin_unlock(&root->fs_info->trans_lock); +		spin_unlock(&fs_info->trans_lock);  		return 0;  	} -	spin_unlock(&root->fs_info->trans_lock); +	spin_unlock(&fs_info->trans_lock);  	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);  	if (!cur_trans)  		return -ENOMEM; -	spin_lock(&root->fs_info->trans_lock); -	if (root->fs_info->running_transaction) { +	spin_lock(&fs_info->trans_lock); +	if (fs_info->running_transaction) {  		/*  		 * someone started a transaction after we unlocked.  Make sure  		 * to redo the trans_no_join checks above  		 */  		kmem_cache_free(btrfs_transaction_cachep, cur_trans); -		cur_trans = root->fs_info->running_transaction; +		cur_trans = fs_info->running_transaction;  		goto loop;  	} @@ -122,20 +123,38 @@ loop:  	cur_trans->delayed_refs.flushing = 0;  	cur_trans->delayed_refs.run_delayed_start = 0;  	cur_trans->delayed_refs.seq = 1; + +	/* +	 * although the tree mod log is per file system and not per transaction, +	 * the log must never go across transaction boundaries. +	 */ +	smp_mb(); +	if (!list_empty(&fs_info->tree_mod_seq_list)) { +		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " +			"creating a fresh transaction\n"); +		WARN_ON(1); +	} +	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { +		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " +			"creating a fresh transaction\n"); +		WARN_ON(1); +	} +	atomic_set(&fs_info->tree_mod_seq, 0); +  	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);  	spin_lock_init(&cur_trans->commit_lock);  	spin_lock_init(&cur_trans->delayed_refs.lock);  	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);  	INIT_LIST_HEAD(&cur_trans->pending_snapshots); -	list_add_tail(&cur_trans->list, &root->fs_info->trans_list); +	list_add_tail(&cur_trans->list, &fs_info->trans_list);  	extent_io_tree_init(&cur_trans->dirty_pages, -			     root->fs_info->btree_inode->i_mapping); -	root->fs_info->generation++; -	cur_trans->transid = root->fs_info->generation; -	root->fs_info->running_transaction = cur_trans; +			     fs_info->btree_inode->i_mapping); +	fs_info->generation++; +	cur_trans->transid = fs_info->generation; +	fs_info->running_transaction = cur_trans;  	cur_trans->aborted = 0; -	spin_unlock(&root->fs_info->trans_lock); +	spin_unlock(&fs_info->trans_lock);  	return 0;  } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ad993bc2df9..ab942f46b3d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -23,9 +23,9 @@   *   * ulist = ulist_alloc();   * ulist_add(ulist, root); - * elem = NULL; + * ULIST_ITER_INIT(&uiter);   * - * while ((elem = ulist_next(ulist, elem)) { + * while ((elem = ulist_next(ulist, &uiter)) {   * 	for (all child nodes n in elem)   *		ulist_add(ulist, n);   *	do something useful with the node; @@ -146,11 +146,20 @@ EXPORT_SYMBOL(ulist_free);  int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,  	      gfp_t gfp_mask)  { +	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, +		    unsigned long *old_aux, gfp_t gfp_mask) +{  	int i;  	for (i = 0; i < ulist->nnodes; ++i) { -		if (ulist->nodes[i].val == val) +		if (ulist->nodes[i].val == val) { +			if (old_aux) +				*old_aux = ulist->nodes[i].aux;  			return 0; +		}  	}  	if (ulist->nnodes >= ulist->nodes_alloced) { @@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);  /**   * ulist_next - iterate ulist   * @ulist:	ulist to iterate - * @prev:	previously returned element or %NULL to start iteration + * @uiter:	iterator variable, initialized with ULIST_ITER_INIT(&iterator)   *   * Note: locking must be provided by the caller. In case of rwlocks only read   *       locking is needed   * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the   * end is reached. No guarantee is made with respect to the order in which   * the elements are returned. They might neither be returned in order of   * addition nor in ascending order.   * It is allowed to call ulist_add during an enumeration. Newly added items   * are guaranteed to show up in the running enumeration.   */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)  { -	int next; -  	if (ulist->nnodes == 0)  		return NULL; - -	if (!prev) -		return &ulist->nodes[0]; - -	next = (prev - ulist->nodes) + 1; -	if (next < 0 || next >= ulist->nnodes) +	if (uiter->i < 0 || uiter->i >= ulist->nnodes)  		return NULL; -	return &ulist->nodes[next]; +	return &ulist->nodes[uiter->i++];  }  EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 6568c352773..21bdc8ec813 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -24,6 +24,10 @@   */  #define ULIST_SIZE 16 +struct ulist_iterator { +	int i; +}; +  /*   * element of the list   */ @@ -61,7 +65,13 @@ void ulist_fini(struct ulist *ulist);  void ulist_reinit(struct ulist *ulist);  struct ulist *ulist_alloc(gfp_t gfp_mask);  void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, +	      gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, +		    unsigned long *old_aux, gfp_t gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, +			      struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)  #endif  |