diff options
Diffstat (limited to 'fs/btrfs/ctree.c')
| -rw-r--r-- | fs/btrfs/ctree.c | 909 | 
1 files changed, 879 insertions, 30 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264fbc6..8206b390058 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@  #include <linux/sched.h>  #include <linux/slab.h> +#include <linux/rbtree.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h" @@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *dst_buf,  			      struct extent_buffer *src_buf);  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		   struct btrfs_path *path, int level, int slot); +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, +					  u32 blocksize, u64 parent_transid, +					  u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, +						u64 bytenr, u32 blocksize, +						u64 time_seq);  struct btrfs_path *btrfs_alloc_path(void)  { @@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,  				     new_root_objectid, &disk_key, level, -				     buf->start, 0, 1); +				     buf->start, 0);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -288,6 +298,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	return 0;  } +enum mod_log_op { +	MOD_LOG_KEY_REPLACE, +	MOD_LOG_KEY_ADD, +	MOD_LOG_KEY_REMOVE, +	MOD_LOG_KEY_REMOVE_WHILE_FREEING, +	MOD_LOG_KEY_REMOVE_WHILE_MOVING, +	MOD_LOG_MOVE_KEYS, +	MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { +	int dst_slot; +	int nr_items; +}; + +struct tree_mod_root { +	u64 logical; +	u8 level; +}; + +struct tree_mod_elem { +	struct rb_node node; +	u64 index;		/* shifted logical */ +	struct seq_list elem; +	enum mod_log_op op; + +	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ +	int slot; + +	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ +	u64 generation; + +	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ +	struct btrfs_disk_key key; +	u64 blockptr; + +	/* this is used for op == MOD_LOG_MOVE_KEYS */ +	struct tree_mod_move move; + +	/* this is used for op == MOD_LOG_ROOT_REPLACE */ +	struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ +	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); +	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	elem->flags = 1; +	spin_lock(&fs_info->tree_mod_seq_lock); +	__get_tree_mod_seq(fs_info, elem); +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, +			    struct seq_list *elem) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct rb_node *next; +	struct seq_list *cur_elem; +	struct tree_mod_elem *tm; +	u64 min_seq = (u64)-1; +	u64 seq_putting = elem->seq; + +	if (!seq_putting) +		return; + +	BUG_ON(!(elem->flags & 1)); +	spin_lock(&fs_info->tree_mod_seq_lock); +	list_del(&elem->list); + +	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { +		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { +			if (seq_putting > cur_elem->seq) { +				/* +				 * blocker with lower sequence number exists, we +				 * cannot remove anything from the log +				 */ +				goto out; +			} +			min_seq = cur_elem->seq; +		} +	} + +	/* +	 * anything that's lower than the lowest existing (read: blocked) +	 * sequence number can be removed from the tree. +	 */ +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	for (node = rb_first(tm_root); node; node = next) { +		next = rb_next(node); +		tm = container_of(node, struct tree_mod_elem, node); +		if (tm->elem.seq > min_seq) +			continue; +		rb_erase(node, tm_root); +		list_del(&tm->elem.list); +		kfree(tm); +	} +	write_unlock(&fs_info->tree_mod_log_lock); +out: +	spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + *       index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ +	struct rb_root *tm_root; +	struct rb_node **new; +	struct rb_node *parent = NULL; +	struct tree_mod_elem *cur; +	int ret = 0; + +	BUG_ON(!tm || !tm->elem.seq); + +	write_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	new = &tm_root->rb_node; +	while (*new) { +		cur = container_of(*new, struct tree_mod_elem, node); +		parent = *new; +		if (cur->index < tm->index) +			new = &((*new)->rb_left); +		else if (cur->index > tm->index) +			new = &((*new)->rb_right); +		else if (cur->elem.seq < tm->elem.seq) +			new = &((*new)->rb_left); +		else if (cur->elem.seq > tm->elem.seq) +			new = &((*new)->rb_right); +		else { +			kfree(tm); +			ret = -EEXIST; +			goto unlock; +		} +	} + +	rb_link_node(&tm->node, parent, new); +	rb_insert_color(&tm->node, tm_root); +unlock: +	write_unlock(&fs_info->tree_mod_log_lock); +	return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, +				    struct extent_buffer *eb) { +	smp_mb(); +	if (list_empty(&(fs_info)->tree_mod_seq_list)) +		return 1; +	if (!eb) +		return 0; +	if (btrfs_header_level(eb) == 0) +		return 1; +	return 0; +} + +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. + */ +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, +				 struct tree_mod_elem **tm_ret) +{ +	struct tree_mod_elem *tm; +	int seq; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return 0; + +	tm = *tm_ret = kzalloc(sizeof(*tm), flags); +	if (!tm) +		return -ENOMEM; + +	tm->elem.flags = 0; +	spin_lock(&fs_info->tree_mod_seq_lock); +	if (list_empty(&fs_info->tree_mod_seq_list)) { +		/* +		 * someone emptied the list while we were waiting for the lock. +		 * we must not add to the list, because no blocker exists. items +		 * are removed from the list only when the existing blocker is +		 * removed from the list. +		 */ +		kfree(tm); +		seq = 0; +		spin_unlock(&fs_info->tree_mod_seq_lock); +	} else { +		__get_tree_mod_seq(fs_info, &tm->elem); +		seq = tm->elem.seq; +	} + +	return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, +			     struct extent_buffer *eb, int slot, +			     enum mod_log_op op, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	if (op != MOD_LOG_KEY_ADD) { +		btrfs_node_key(eb, &tm->key, slot); +		tm->blockptr = btrfs_node_blockptr(eb, slot); +	} +	tm->op = op; +	tm->slot = slot; +	tm->generation = btrfs_node_ptr_generation(eb, slot); + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +			int slot, enum mod_log_op op) +{ +	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *eb, int dst_slot, int src_slot, +			 int nr_items, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, eb)) +		return 0; + +	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, +					      MOD_LOG_KEY_REMOVE_WHILE_MOVING); +		BUG_ON(ret < 0); +	} + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	tm->slot = src_slot; +	tm->move.dst_slot = dst_slot; +	tm->move.nr_items = nr_items; +	tm->op = MOD_LOG_MOVE_KEYS; + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, +			 struct extent_buffer *old_root, +			 struct extent_buffer *new_root, gfp_t flags) +{ +	struct tree_mod_elem *tm; +	int ret; + +	ret = tree_mod_alloc(fs_info, flags, &tm); +	if (ret <= 0) +		return ret; + +	tm->index = new_root->start >> PAGE_CACHE_SHIFT; +	tm->old_root.logical = old_root->start; +	tm->old_root.level = btrfs_header_level(old_root); +	tm->generation = btrfs_header_generation(old_root); +	tm->op = MOD_LOG_ROOT_REPLACE; + +	ret = __tree_mod_log_insert(fs_info, tm); +	spin_unlock(&fs_info->tree_mod_seq_lock); +	return ret; +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, +		      int smallest) +{ +	struct rb_root *tm_root; +	struct rb_node *node; +	struct tree_mod_elem *cur = NULL; +	struct tree_mod_elem *found = NULL; +	u64 index = start >> PAGE_CACHE_SHIFT; + +	read_lock(&fs_info->tree_mod_log_lock); +	tm_root = &fs_info->tree_mod_log; +	node = tm_root->rb_node; +	while (node) { +		cur = container_of(node, struct tree_mod_elem, node); +		if (cur->index < index) { +			node = node->rb_left; +		} else if (cur->index > index) { +			node = node->rb_right; +		} else if (cur->elem.seq < min_seq) { +			node = node->rb_left; +		} else if (!smallest) { +			/* we want the node with the highest seq */ +			if (found) +				BUG_ON(found->elem.seq > cur->elem.seq); +			found = cur; +			node = node->rb_left; +		} else if (cur->elem.seq > min_seq) { +			/* we want the node with the smallest seq */ +			if (found) +				BUG_ON(found->elem.seq < cur->elem.seq); +			found = cur; +			node = node->rb_right; +		} else { +			found = cur; +			break; +		} +	} +	read_unlock(&fs_info->tree_mod_log_lock); + +	return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, +			   u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ +	return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     struct extent_buffer *src, unsigned long dst_offset, +		     unsigned long src_offset, int nr_items) +{ +	int ret; +	int i; + +	if (tree_mod_dont_log(fs_info, NULL)) +		return; + +	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) +		return; + +	/* speed this up by single seq for all operations? */ +	for (i = 0; i < nr_items; i++) { +		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0); +		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +		     int dst_offset, int src_offset, int nr_items) +{ +	int ret; +	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, +				       nr_items, GFP_NOFS); +	BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, +			  struct extent_buffer *eb, +			  struct btrfs_disk_key *disk_key, int slot, int atomic) +{ +	int ret; + +	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, +					   MOD_LOG_KEY_REPLACE, +					   atomic ? GFP_ATOMIC : GFP_NOFS); +	BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +				 struct extent_buffer *eb) +{ +	int i; +	int ret; +	u32 nritems; + +	if (tree_mod_dont_log(fs_info, eb)) +		return; + +	nritems = btrfs_header_nritems(eb); +	for (i = nritems - 1; i >= 0; i--) { +		ret = tree_mod_log_insert_key(fs_info, eb, i, +					      MOD_LOG_KEY_REMOVE_WHILE_FREEING); +		BUG_ON(ret < 0); +	} +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, +			      struct extent_buffer *new_root_node) +{ +	int ret; +	tree_mod_log_free_eb(root->fs_info, root->node); +	ret = tree_mod_log_insert_root(root->fs_info, root->node, +				       new_root_node, GFP_NOFS); +	BUG_ON(ret < 0); +} +  /*   * check if the tree block can be shared by multiple trees   */ @@ -409,6 +862,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,  			ret = btrfs_dec_ref(trans, root, buf, 1, 1);  			BUG_ON(ret); /* -ENOMEM */  		} +		/* +		 * don't log freeing in case we're freeing the root node, this +		 * is done by tree_mod_log_set_root_pointer later +		 */ +		if (buf != root->node && btrfs_header_level(buf) != 0) +			tree_mod_log_free_eb(root->fs_info, buf);  		clean_tree_block(trans, root, buf);  		*last_ref = 1;  	} @@ -467,7 +926,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,  				     root->root_key.objectid, &disk_key, -				     level, search_start, empty_size, 1); +				     level, search_start, empty_size);  	if (IS_ERR(cow))  		return PTR_ERR(cow); @@ -506,10 +965,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		extent_buffer_get(cow); +		tree_mod_log_set_root_pointer(root, cow);  		rcu_assign_pointer(root->node, cow);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  		free_extent_buffer(buf);  		add_root_to_dirty_list(root);  	} else { @@ -519,13 +979,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  			parent_start = 0;  		WARN_ON(trans->transid != btrfs_header_generation(parent)); +		tree_mod_log_insert_key(root->fs_info, parent, parent_slot, +					MOD_LOG_KEY_REPLACE);  		btrfs_set_node_blockptr(parent, parent_slot,  					cow->start);  		btrfs_set_node_ptr_generation(parent, parent_slot,  					      trans->transid);  		btrfs_mark_buffer_dirty(parent);  		btrfs_free_tree_block(trans, root, buf, parent_start, -				      last_ref, 1); +				      last_ref);  	}  	if (unlock_orig)  		btrfs_tree_unlock(buf); @@ -535,6 +997,229 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	return 0;  } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, +			   struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct tree_mod_elem *found = NULL; +	u64 root_logical = root->node->start; +	int looped = 0; + +	if (!time_seq) +		return 0; + +	/* +	 * the very last operation that's logged for a root is the replacement +	 * operation (if it is replaced at all). this has the index of the *new* +	 * root, making it the very first operation that's logged for this root. +	 */ +	while (1) { +		tm = tree_mod_log_search_oldest(fs_info, root_logical, +						time_seq); +		if (!looped && !tm) +			return 0; +		/* +		 * if there are no tree operation for the oldest root, we simply +		 * return it. this should only happen if that (old) root is at +		 * level 0. +		 */ +		if (!tm) +			break; + +		/* +		 * if there's an operation that's not a root replacement, we +		 * found the oldest version of our root. normally, we'll find a +		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. +		 */ +		if (tm->op != MOD_LOG_ROOT_REPLACE) +			break; + +		found = tm; +		root_logical = tm->old_root.logical; +		BUG_ON(root_logical == root->node->start); +		looped = 1; +	} + +	/* if there's no old root to return, return what we found instead */ +	if (!found) +		found = tm; + +	return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, +		      struct tree_mod_elem *first_tm) +{ +	u32 n; +	struct rb_node *next; +	struct tree_mod_elem *tm = first_tm; +	unsigned long o_dst; +	unsigned long o_src; +	unsigned long p_size = sizeof(struct btrfs_key_ptr); + +	n = btrfs_header_nritems(eb); +	while (tm && tm->elem.seq >= time_seq) { +		/* +		 * all the operations are recorded with the operator used for +		 * the modification. as we're going backwards, we do the +		 * opposite of each operation here. +		 */ +		switch (tm->op) { +		case MOD_LOG_KEY_REMOVE_WHILE_FREEING: +			BUG_ON(tm->slot < n); +		case MOD_LOG_KEY_REMOVE_WHILE_MOVING: +		case MOD_LOG_KEY_REMOVE: +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			n++; +			break; +		case MOD_LOG_KEY_REPLACE: +			BUG_ON(tm->slot >= n); +			btrfs_set_node_key(eb, &tm->key, tm->slot); +			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); +			btrfs_set_node_ptr_generation(eb, tm->slot, +						      tm->generation); +			break; +		case MOD_LOG_KEY_ADD: +			/* if a move operation is needed it's in the log */ +			n--; +			break; +		case MOD_LOG_MOVE_KEYS: +			o_dst = btrfs_node_key_ptr_offset(tm->slot); +			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); +			memmove_extent_buffer(eb, o_dst, o_src, +					      tm->move.nr_items * p_size); +			break; +		case MOD_LOG_ROOT_REPLACE: +			/* +			 * this operation is special. for roots, this must be +			 * handled explicitly before rewinding. +			 * for non-roots, this operation may exist if the node +			 * was a root: root A -> child B; then A gets empty and +			 * B is promoted to the new root. in the mod log, we'll +			 * have a root-replace operation for B, a tree block +			 * that is no root. we simply ignore that operation. +			 */ +			break; +		} +		next = rb_next(&tm->node); +		if (!next) +			break; +		tm = container_of(next, struct tree_mod_elem, node); +		if (tm->index != first_tm->index) +			break; +	} +	btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, +		    u64 time_seq) +{ +	struct extent_buffer *eb_rewin; +	struct tree_mod_elem *tm; + +	if (!time_seq) +		return eb; + +	if (btrfs_header_level(eb) == 0) +		return eb; + +	tm = tree_mod_log_search(fs_info, eb->start, time_seq); +	if (!tm) +		return eb; + +	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { +		BUG_ON(tm->slot != 0); +		eb_rewin = alloc_dummy_extent_buffer(eb->start, +						fs_info->tree_root->nodesize); +		BUG_ON(!eb_rewin); +		btrfs_set_header_bytenr(eb_rewin, eb->start); +		btrfs_set_header_backref_rev(eb_rewin, +					     btrfs_header_backref_rev(eb)); +		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); +		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); +	} else { +		eb_rewin = btrfs_clone_extent_buffer(eb); +		BUG_ON(!eb_rewin); +	} + +	extent_buffer_get(eb_rewin); +	free_extent_buffer(eb); + +	__tree_mod_log_rewind(eb_rewin, time_seq, tm); + +	return eb_rewin; +} + +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ +	struct tree_mod_elem *tm; +	struct extent_buffer *eb; +	struct tree_mod_root *old_root = NULL; +	u64 old_generation = 0; +	u64 logical; + +	eb = btrfs_read_lock_root_node(root); +	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); +	if (!tm) +		return root->node; + +	if (tm->op == MOD_LOG_ROOT_REPLACE) { +		old_root = &tm->old_root; +		old_generation = tm->generation; +		logical = old_root->logical; +	} else { +		logical = root->node->start; +	} + +	tm = tree_mod_log_search(root->fs_info, logical, time_seq); +	if (old_root) +		eb = alloc_dummy_extent_buffer(logical, root->nodesize); +	else +		eb = btrfs_clone_extent_buffer(root->node); +	btrfs_tree_read_unlock(root->node); +	free_extent_buffer(root->node); +	if (!eb) +		return NULL; +	btrfs_tree_read_lock(eb); +	if (old_root) { +		btrfs_set_header_bytenr(eb, eb->start); +		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); +		btrfs_set_header_owner(eb, root->root_key.objectid); +		btrfs_set_header_level(eb, old_root->level); +		btrfs_set_header_generation(eb, old_generation); +	} +	if (tm) +		__tree_mod_log_rewind(eb, time_seq, tm); +	else +		WARN_ON(btrfs_header_level(eb) != 0); +	extent_buffer_get(eb); + +	return eb; +} +  static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf) @@ -739,7 +1424,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,  				if (!cur)  					return -EIO;  			} else if (!uptodate) { -				btrfs_read_buffer(cur, gen); +				err = btrfs_read_buffer(cur, gen); +				if (err) { +					free_extent_buffer(cur); +					return err; +				}  			}  		}  		if (search_start == 0) @@ -854,20 +1543,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,  static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,  		      int level, int *slot)  { -	if (level == 0) { +	if (level == 0)  		return generic_bin_search(eb,  					  offsetof(struct btrfs_leaf, items),  					  sizeof(struct btrfs_item),  					  key, btrfs_header_nritems(eb),  					  slot); -	} else { +	else  		return generic_bin_search(eb,  					  offsetof(struct btrfs_node, ptrs),  					  sizeof(struct btrfs_key_ptr),  					  key, btrfs_header_nritems(eb),  					  slot); -	} -	return -1;  }  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, @@ -974,6 +1661,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  			goto enospc;  		} +		tree_mod_log_set_root_pointer(root, child);  		rcu_assign_pointer(root->node, child);  		add_root_to_dirty_list(root); @@ -987,7 +1675,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		free_extent_buffer(mid);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		/* once for the root ptr */  		free_extent_buffer_stale(mid);  		return 0; @@ -996,8 +1684,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)  		return 0; -	btrfs_header_nritems(mid); -  	left = read_node_slot(root, parent, pslot - 1);  	if (left) {  		btrfs_tree_lock(left); @@ -1027,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		wret = push_node_left(trans, root, left, mid, 1);  		if (wret < 0)  			ret = wret; -		btrfs_header_nritems(mid);  	}  	/* @@ -1040,14 +1725,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		if (btrfs_header_nritems(right) == 0) {  			clean_tree_block(trans, root, right);  			btrfs_tree_unlock(right); -			del_ptr(trans, root, path, level + 1, pslot + 1); +			del_ptr(trans, root, path, level + 1, pslot + 1, 1);  			root_sub_used(root, right->len); -			btrfs_free_tree_block(trans, root, right, 0, 1, 0); +			btrfs_free_tree_block(trans, root, right, 0, 1);  			free_extent_buffer_stale(right);  			right = NULL;  		} else {  			struct btrfs_disk_key right_key;  			btrfs_node_key(right, &right_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &right_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &right_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent);  		} @@ -1082,15 +1769,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	if (btrfs_header_nritems(mid) == 0) {  		clean_tree_block(trans, root, mid);  		btrfs_tree_unlock(mid); -		del_ptr(trans, root, path, level + 1, pslot); +		del_ptr(trans, root, path, level + 1, pslot, 1);  		root_sub_used(root, mid->len); -		btrfs_free_tree_block(trans, root, mid, 0, 1, 0); +		btrfs_free_tree_block(trans, root, mid, 0, 1);  		free_extent_buffer_stale(mid);  		mid = NULL;  	} else {  		/* update the parent key to reflect our changes */  		struct btrfs_disk_key mid_key;  		btrfs_node_key(mid, &mid_key, 0); +		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, +					  pslot, 0);  		btrfs_set_node_key(parent, &mid_key, pslot);  		btrfs_mark_buffer_dirty(parent);  	} @@ -1188,6 +1877,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			orig_slot += left_nr;  			btrfs_node_key(mid, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot, 0);  			btrfs_set_node_key(parent, &disk_key, pslot);  			btrfs_mark_buffer_dirty(parent);  			if (btrfs_header_nritems(left) > orig_slot) { @@ -1239,6 +1930,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key disk_key;  			btrfs_node_key(right, &disk_key, 0); +			tree_mod_log_set_node_key(root->fs_info, parent, +						  &disk_key, pslot + 1, 0);  			btrfs_set_node_key(parent, &disk_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent); @@ -1496,7 +2189,7 @@ static int  read_block_for_search(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct btrfs_path *p,  		       struct extent_buffer **eb_ret, int level, int slot, -		       struct btrfs_key *key) +		       struct btrfs_key *key, u64 time_seq)  {  	u64 blocknr;  	u64 gen; @@ -1850,7 +2543,7 @@ cow_done:  			}  			err = read_block_for_search(trans, root, p, -						    &b, level, slot, key); +						    &b, level, slot, key, 0);  			if (err == -EAGAIN)  				goto again;  			if (err) { @@ -1922,6 +2615,113 @@ done:  }  /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +			  struct btrfs_path *p, u64 time_seq) +{ +	struct extent_buffer *b; +	int slot; +	int ret; +	int err; +	int level; +	int lowest_unlock = 1; +	u8 lowest_level = 0; + +	lowest_level = p->lowest_level; +	WARN_ON(p->nodes[0] != NULL); + +	if (p->search_commit_root) { +		BUG_ON(time_seq); +		return btrfs_search_slot(NULL, root, key, p, 0, 0); +	} + +again: +	b = get_old_root(root, time_seq); +	level = btrfs_header_level(b); +	p->locks[level] = BTRFS_READ_LOCK; + +	while (b) { +		level = btrfs_header_level(b); +		p->nodes[level] = b; +		btrfs_clear_path_blocking(p, NULL, 0); + +		/* +		 * we have a lock on b and as long as we aren't changing +		 * the tree, there is no way to for the items in b to change. +		 * It is safe to drop the lock on our parent before we +		 * go through the expensive btree search on b. +		 */ +		btrfs_unlock_up_safe(p, level + 1); + +		ret = bin_search(b, key, level, &slot); + +		if (level != 0) { +			int dec = 0; +			if (ret && slot > 0) { +				dec = 1; +				slot -= 1; +			} +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); + +			if (level == lowest_level) { +				if (dec) +					p->slots[level]++; +				goto done; +			} + +			err = read_block_for_search(NULL, root, p, &b, level, +						    slot, key, time_seq); +			if (err == -EAGAIN) +				goto again; +			if (err) { +				ret = err; +				goto done; +			} + +			level = btrfs_header_level(b); +			err = btrfs_try_tree_read_lock(b); +			if (!err) { +				btrfs_set_path_blocking(p); +				btrfs_tree_read_lock(b); +				btrfs_clear_path_blocking(p, b, +							  BTRFS_READ_LOCK); +			} +			p->locks[level] = BTRFS_READ_LOCK; +			p->nodes[level] = b; +			b = tree_mod_log_rewind(root->fs_info, b, time_seq); +			if (b != p->nodes[level]) { +				btrfs_tree_unlock_rw(p->nodes[level], +						     p->locks[level]); +				p->locks[level] = 0; +				p->nodes[level] = b; +			} +		} else { +			p->slots[level] = slot; +			unlock_up(p, level, lowest_unlock, 0, NULL); +			goto done; +		} +	} +	ret = 1; +done: +	if (!p->leave_spinning) +		btrfs_set_path_blocking(p); +	if (ret < 0) +		btrfs_release_path(p); + +	return ret; +} + +/*   * adjust the pointers going up the tree, starting at level   * making sure the right key of each node is points to 'key'.   * This is used after shifting pointers to the left, so it stops @@ -1941,6 +2741,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,  		if (!path->nodes[i])  			break;  		t = path->nodes[i]; +		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);  		btrfs_set_node_key(t, key, tslot);  		btrfs_mark_buffer_dirty(path->nodes[i]);  		if (tslot != 0) @@ -2023,12 +2824,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,  	} else  		push_items = min(src_nritems - 8, push_items); +	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, +			     push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(dst_nritems),  			   btrfs_node_key_ptr_offset(0),  			   push_items * sizeof(struct btrfs_key_ptr));  	if (push_items < src_nritems) { +		tree_mod_log_eb_move(root->fs_info, src, 0, push_items, +				     src_nritems - push_items);  		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),  				      btrfs_node_key_ptr_offset(push_items),  				      (src_nritems - push_items) * @@ -2082,11 +2887,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  	if (max_push < push_items)  		push_items = max_push; +	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);  	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),  				      btrfs_node_key_ptr_offset(0),  				      (dst_nritems) *  				      sizeof(struct btrfs_key_ptr)); +	tree_mod_log_eb_copy(root->fs_info, dst, src, 0, +			     src_nritems - push_items, push_items);  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2129,7 +2937,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  				   root->root_key.objectid, &lower_key, -				   level, root->node->start, 0, 0); +				   level, root->node->start, 0);  	if (IS_ERR(c))  		return PTR_ERR(c); @@ -2161,6 +2969,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(c);  	old = root->node; +	tree_mod_log_set_root_pointer(root, c);  	rcu_assign_pointer(root->node, c);  	/* the super has an extra ref to root->node */ @@ -2188,6 +2997,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  {  	struct extent_buffer *lower;  	int nritems; +	int ret;  	BUG_ON(!path->nodes[level]);  	btrfs_assert_tree_locked(path->nodes[level]); @@ -2196,11 +3006,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  	BUG_ON(slot > nritems);  	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));  	if (slot != nritems) { +		if (level) +			tree_mod_log_eb_move(root->fs_info, lower, slot + 1, +					     slot, nritems - slot);  		memmove_extent_buffer(lower,  			      btrfs_node_key_ptr_offset(slot + 1),  			      btrfs_node_key_ptr_offset(slot),  			      (nritems - slot) * sizeof(struct btrfs_key_ptr));  	} +	if (level) { +		ret = tree_mod_log_insert_key(root->fs_info, lower, slot, +					      MOD_LOG_KEY_ADD); +		BUG_ON(ret < 0); +	}  	btrfs_set_node_key(lower, key, slot);  	btrfs_set_node_blockptr(lower, slot, bytenr);  	WARN_ON(trans->transid == 0); @@ -2252,7 +3070,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,  					root->root_key.objectid, -					&disk_key, level, c->start, 0, 0); +					&disk_key, level, c->start, 0);  	if (IS_ERR(split))  		return PTR_ERR(split); @@ -2271,7 +3089,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  			    (unsigned long)btrfs_header_chunk_tree_uuid(split),  			    BTRFS_UUID_SIZE); - +	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);  	copy_extent_buffer(split, c,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(mid), @@ -3004,7 +3822,7 @@ again:  	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  					root->root_key.objectid, -					&disk_key, 0, l->start, 0, 0); +					&disk_key, 0, l->start, 0);  	if (IS_ERR(right))  		return PTR_ERR(right); @@ -3749,19 +4567,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root   * empty a node.   */  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct btrfs_path *path, int level, int slot) +		    struct btrfs_path *path, int level, int slot, +		    int tree_mod_log)  {  	struct extent_buffer *parent = path->nodes[level];  	u32 nritems; +	int ret;  	nritems = btrfs_header_nritems(parent);  	if (slot != nritems - 1) { +		if (tree_mod_log && level) +			tree_mod_log_eb_move(root->fs_info, parent, slot, +					     slot + 1, nritems - slot - 1);  		memmove_extent_buffer(parent,  			      btrfs_node_key_ptr_offset(slot),  			      btrfs_node_key_ptr_offset(slot + 1),  			      sizeof(struct btrfs_key_ptr) *  			      (nritems - slot - 1)); +	} else if (tree_mod_log && level) { +		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0);  	} +  	nritems--;  	btrfs_set_header_nritems(parent, nritems);  	if (nritems == 0 && parent == root->node) { @@ -3793,7 +4621,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  				    struct extent_buffer *leaf)  {  	WARN_ON(btrfs_header_generation(leaf) != trans->transid); -	del_ptr(trans, root, path, 1, path->slots[1]); +	del_ptr(trans, root, path, 1, path->slots[1], 1);  	/*  	 * btrfs_free_extent is expensive, we want to make sure we @@ -3804,7 +4632,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  	root_sub_used(root, leaf->len);  	extent_buffer_get(leaf); -	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); +	btrfs_free_tree_block(trans, root, leaf, 0, 1);  	free_extent_buffer_stale(leaf);  }  /* @@ -4202,6 +5030,12 @@ next:   */  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  { +	return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, +			u64 time_seq) +{  	int slot;  	int level;  	struct extent_buffer *c; @@ -4226,7 +5060,10 @@ again:  	path->keep_locks = 1;  	path->leave_spinning = 1; -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (time_seq) +		ret = btrfs_search_old_slot(root, &key, path, time_seq); +	else +		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	path->keep_locks = 0;  	if (ret < 0) @@ -4271,7 +5108,7 @@ again:  		next = c;  		next_rw_lock = path->locks[level];  		ret = read_block_for_search(NULL, root, path, &next, level, -					    slot, &key); +					    slot, &key, 0);  		if (ret == -EAGAIN)  			goto again; @@ -4282,6 +5119,18 @@ again:  		if (!path->skip_locking) {  			ret = btrfs_try_tree_read_lock(next); +			if (!ret && time_seq) { +				/* +				 * If we don't get the lock, we may be racing +				 * with push_leaf_left, holding that lock while +				 * itself waiting for the leaf we've currently +				 * locked. To solve this situation, we give up +				 * on our lock and cycle. +				 */ +				btrfs_release_path(path); +				cond_resched(); +				goto again; +			}  			if (!ret) {  				btrfs_set_path_blocking(path);  				btrfs_tree_read_lock(next); @@ -4308,7 +5157,7 @@ again:  			break;  		ret = read_block_for_search(NULL, root, path, &next, level, -					    0, &key); +					    0, &key, 0);  		if (ret == -EAGAIN)  			goto again;  |