Diffstat (limited to 'fs')
38 files changed, 3563 insertions, 1614 deletions
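
The headline change in this series is the extended inode ref item (struct btrfs_inode_extref, BTRFS_INODE_EXTREF_KEY, guarded by BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF), which lets an inode carry more hard links than fit in a single classic INODE_REF item. The following is a minimal userspace sketch -- not kernel code -- of how several extref entries pack back-to-back inside one item and how a reader walks them, mirroring the cur_offset loop in iterate_inode_extrefs() below. Fields are kept host-endian here for simplicity; on disk they are little-endian (__le64/__le16), and pack_extref() is an illustrative helper, not a btrfs function.

/* Sketch: packing and walking extended inode refs in one item buffer. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct inode_extref {
	uint64_t parent_objectid;	/* objectid of the parent directory */
	uint64_t index;			/* directory index of this name */
	uint16_t name_len;
	uint8_t  name[];		/* name bytes follow inline */
} __attribute__ ((__packed__));

/* Append one extref entry at 'offset' inside the item; return new offset. */
static uint32_t pack_extref(uint8_t *item, uint32_t offset,
			    uint64_t parent, uint64_t index, const char *name)
{
	struct inode_extref *ref = (struct inode_extref *)(item + offset);
	uint16_t len = (uint16_t)strlen(name);

	ref->parent_objectid = parent;
	ref->index = index;
	ref->name_len = len;
	memcpy(ref->name, name, len);
	return offset + sizeof(*ref) + len;
}

int main(void)
{
	uint8_t item[256];
	uint32_t item_size = 0, cur_offset = 0;

	/* Several links may share a single INODE_EXTREF item. */
	item_size = pack_extref(item, item_size, 256, 2, "foo");
	item_size = pack_extref(item, item_size, 257, 5, "hardlink-to-foo");

	/* Walk every entry, advancing by header size plus name length,
	 * exactly as the cur_offset loop in iterate_inode_extrefs() does. */
	while (cur_offset < item_size) {
		struct inode_extref *ref =
			(struct inode_extref *)(item + cur_offset);

		printf("parent %llu index %llu name %.*s\n",
		       (unsigned long long)ref->parent_objectid,
		       (unsigned long long)ref->index,
		       ref->name_len, (const char *)ref->name);
		cur_offset += sizeof(*ref) + ref->name_len;
	}
	return 0;
}
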
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ff6475f409d..f3187938e08 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@   * Boston, MA 021110-1307, USA.   */ +#include <linux/vmalloc.h>  #include "ctree.h"  #include "disk-io.h"  #include "backref.h" @@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,  			}  			if (!ret) {  				ret = ulist_add(parents, eb->start, -						(unsigned long)eie, GFP_NOFS); +						(uintptr_t)eie, GFP_NOFS);  				if (ret < 0)  					break;  				if (!extent_item_pos) { @@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		ULIST_ITER_INIT(&uiter);  		node = ulist_next(parents, &uiter);  		ref->parent = node ? node->val : 0; -		ref->inode_list = -			node ? (struct extent_inode_elem *)node->aux : 0; +		ref->inode_list = node ? +			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;  		/* additional parents require new refs being added here */  		while ((node = ulist_next(parents, &uiter))) { @@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,  			}  			memcpy(new_ref, ref, sizeof(*ref));  			new_ref->parent = node->val; -			new_ref->inode_list = -					(struct extent_inode_elem *)node->aux; +			new_ref->inode_list = (struct extent_inode_elem *) +							(uintptr_t)node->aux;  			list_add(&new_ref->list, &ref->list);  		}  		ulist_reinit(parents); @@ -914,8 +915,8 @@ again:  				free_extent_buffer(eb);  			}  			ret = ulist_add_merge(refs, ref->parent, -					      (unsigned long)ref->inode_list, -					      (unsigned long *)&eie, GFP_NOFS); +					      (uintptr_t)ref->inode_list, +					      (u64 *)&eie, GFP_NOFS);  			if (!ret && extent_item_pos) {  				/*  				 * we've recorded that parent, so we must extend @@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)  	while ((node = ulist_next(blocks, &uiter))) {  		if (!node->aux)  			continue; -		eie = (struct extent_inode_elem *)node->aux; +		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;  		for (; eie; eie = eie_next) {  			eie_next = eie->next;  			kfree(eie); @@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,  				found_key);  } -/* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! 
- */ -char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, -			 struct btrfs_inode_ref *iref, +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, +			  u64 start_off, struct btrfs_path *path, +			  struct btrfs_inode_extref **ret_extref, +			  u64 *found_off) +{ +	int ret, slot; +	struct btrfs_key key; +	struct btrfs_key found_key; +	struct btrfs_inode_extref *extref; +	struct extent_buffer *leaf; +	unsigned long ptr; + +	key.objectid = inode_objectid; +	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); +	key.offset = start_off; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		return ret; + +	while (1) { +		leaf = path->nodes[0]; +		slot = path->slots[0]; +		if (slot >= btrfs_header_nritems(leaf)) { +			/* +			 * If the item at offset is not found, +			 * btrfs_search_slot will point us to the slot +			 * where it should be inserted. In our case +			 * that will be the slot directly before the +			 * next INODE_REF_KEY_V2 item. In the case +			 * that we're pointing to the last slot in a +			 * leaf, we must move one leaf over. +			 */ +			ret = btrfs_next_leaf(root, path); +			if (ret) { +				if (ret >= 1) +					ret = -ENOENT; +				break; +			} +			continue; +		} + +		btrfs_item_key_to_cpu(leaf, &found_key, slot); + +		/* +		 * Check that we're still looking at an extended ref key for +		 * this particular objectid. If we have different +		 * objectid or type then there are no more to be found +		 * in the tree and we can exit. +		 */ +		ret = -ENOENT; +		if (found_key.objectid != inode_objectid) +			break; +		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) +			break; + +		ret = 0; +		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +		extref = (struct btrfs_inode_extref *)ptr; +		*ret_extref = extref; +		if (found_off) +			*found_off = found_key.offset; +		break; +	} + +	return ret; +} + +static char *ref_to_path(struct btrfs_root *fs_root, +			 struct btrfs_path *path, +			 u32 name_len, unsigned long name_off,  			 struct extent_buffer *eb_in, u64 parent,  			 char *dest, u32 size)  { -	u32 len;  	int slot;  	u64 next_inum;  	int ret; @@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  	struct extent_buffer *eb = eb_in;  	struct btrfs_key found_key;  	int leave_spinning = path->leave_spinning; +	struct btrfs_inode_ref *iref;  	if (bytes_left >= 0)  		dest[bytes_left] = '\0';  	path->leave_spinning = 1;  	while (1) { -		len = btrfs_inode_ref_name_len(eb, iref); -		bytes_left -= len; +		bytes_left -= name_len;  		if (bytes_left >= 0)  			read_extent_buffer(eb, dest + bytes_left, -						(unsigned long)(iref + 1), len); +					   name_off, name_len);  		if (eb != eb_in) {  			btrfs_tree_read_unlock_blocking(eb);  			free_extent_buffer(eb); @@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  			ret = -ENOENT;  		if (ret)  			break; +  		next_inum = found_key.offset;  		/* regular exit ahead */ @@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);  		}  		btrfs_release_path(path); -  		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + +		name_len = btrfs_inode_ref_name_len(eb, iref); +		name_off = (unsigned long)(iref + 1); +  		parent = next_inum;  		--bytes_left;  		if (bytes_left >= 0) @@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  }  /* + * this 
iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_iref_to_path(struct btrfs_root *fs_root, +			 struct btrfs_path *path, +			 struct btrfs_inode_ref *iref, +			 struct extent_buffer *eb_in, u64 parent, +			 char *dest, u32 size) +{ +	return ref_to_path(fs_root, path, +			   btrfs_inode_ref_name_len(eb_in, iref), +			   (unsigned long)(iref + 1), +			   eb_in, parent, dest, size); +} + +/*   * this makes the path point to (logical EXTENT_ITEM *)   * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for   * tree blocks and <0 on error.   */  int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, -			struct btrfs_path *path, struct btrfs_key *found_key) +			struct btrfs_path *path, struct btrfs_key *found_key, +			u64 *flags_ret)  {  	int ret;  	u64 flags; @@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,  		 (unsigned long long)found_key->objectid,  		 (unsigned long long)found_key->offset,  		 (unsigned long long)flags, item_size); -	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) -		return BTRFS_EXTENT_FLAG_TREE_BLOCK; -	if (flags & BTRFS_EXTENT_FLAG_DATA) -		return BTRFS_EXTENT_FLAG_DATA; + +	WARN_ON(!flags_ret); +	if (flags_ret) { +		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) +			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; +		else if (flags & BTRFS_EXTENT_FLAG_DATA) +			*flags_ret = BTRFS_EXTENT_FLAG_DATA; +		else +			BUG_ON(1); +		return 0; +	}  	return -EIO;  } @@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  		ULIST_ITER_INIT(&root_uiter);  		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {  			pr_debug("root %llu references leaf %llu, data list " -				 "%#lx\n", root_node->val, ref_node->val, -				 ref_node->aux); -			ret = iterate_leaf_refs( -				(struct extent_inode_elem *)ref_node->aux, -				root_node->val, extent_item_objectid, -				iterate, ctx); +				 "%#llx\n", root_node->val, ref_node->val, +				 (long long)ref_node->aux); +			ret = iterate_leaf_refs((struct extent_inode_elem *) +						(uintptr_t)ref_node->aux, +						root_node->val, +						extent_item_objectid, +						iterate, ctx);  		}  		ulist_free(roots);  		roots = NULL; @@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,  {  	int ret;  	u64 extent_item_pos; +	u64 flags = 0;  	struct btrfs_key found_key;  	int search_commit_root = path->search_commit_root; -	ret = extent_from_logical(fs_info, logical, path, -					&found_key); +	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);  	btrfs_release_path(path);  	if (ret < 0)  		return ret; -	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)  		return -EINVAL;  	
extent_item_pos = logical - found_key.objectid; @@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,  	return ret;  } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, -				struct btrfs_path *path, -				iterate_irefs_t *iterate, void *ctx) +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, +			      struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, +			      struct btrfs_path *path, +			      iterate_irefs_t *iterate, void *ctx)  {  	int ret = 0;  	int slot; @@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,  	while (!ret) {  		path->leave_spinning = 1;  		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, -					&found_key); +				     &found_key);  		if (ret < 0)  			break;  		if (ret) { @@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,  				 "tree %llu\n", cur,  				 (unsigned long long)found_key.objectid,  				 (unsigned long long)fs_root->objectid); -			ret = iterate(parent, iref, eb, ctx); +			ret = iterate(parent, name_len, +				      (unsigned long)(iref + 1), eb, ctx);  			if (ret)  				break;  			len = sizeof(*iref) + name_len; @@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,  	return ret;  } +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, +				 struct btrfs_path *path, +				 iterate_irefs_t *iterate, void *ctx) +{ +	int ret; +	int slot; +	u64 offset = 0; +	u64 parent; +	int found = 0; +	struct extent_buffer *eb; +	struct btrfs_inode_extref *extref; +	struct extent_buffer *leaf; +	u32 item_size; +	u32 cur_offset; +	unsigned long ptr; + +	while (1) { +		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, +					    &offset); +		if (ret < 0) +			break; +		if (ret) { +			ret = found ? 
0 : -ENOENT; +			break; +		} +		++found; + +		slot = path->slots[0]; +		eb = path->nodes[0]; +		/* make sure we can use eb after releasing the path */ +		atomic_inc(&eb->refs); + +		btrfs_tree_read_lock(eb); +		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); +		btrfs_release_path(path); + +		leaf = path->nodes[0]; +		item_size = btrfs_item_size_nr(leaf, path->slots[0]); +		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +		cur_offset = 0; + +		while (cur_offset < item_size) { +			u32 name_len; + +			extref = (struct btrfs_inode_extref *)(ptr + cur_offset); +			parent = btrfs_inode_extref_parent(eb, extref); +			name_len = btrfs_inode_extref_name_len(eb, extref); +			ret = iterate(parent, name_len, +				      (unsigned long)&extref->name, eb, ctx); +			if (ret) +				break; + +			cur_offset += btrfs_inode_extref_name_len(leaf, extref); +			cur_offset += sizeof(*extref); +		} +		btrfs_tree_read_unlock_blocking(eb); +		free_extent_buffer(eb); + +		offset++; +	} + +	btrfs_release_path(path); + +	return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, +			 struct btrfs_path *path, iterate_irefs_t *iterate, +			 void *ctx) +{ +	int ret; +	int found_refs = 0; + +	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); +	if (!ret) +		++found_refs; +	else if (ret != -ENOENT) +		return ret; + +	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); +	if (ret == -ENOENT && found_refs) +		return 0; + +	return ret; +} +  /*   * returns 0 if the path could be dumped (probably truncated)   * returns <0 in case of an error   */ -static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, -				struct extent_buffer *eb, void *ctx) +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, +			 struct extent_buffer *eb, void *ctx)  {  	struct inode_fs_paths *ipath = ctx;  	char *fspath; @@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,  					ipath->fspath->bytes_left - s_ptr : 0;  	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; -	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, -				inum, fspath_min, bytes_left); +	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, +			     name_off, eb, inum, fspath_min, +			     bytes_left);  	if (IS_ERR(fspath))  		return PTR_ERR(fspath);  	if (fspath > fspath_min) { -		pr_debug("path resolved: %s\n", fspath);  		ipath->fspath->val[i] = (u64)(unsigned long)fspath;  		++ipath->fspath->elem_cnt;  		ipath->fspath->bytes_left = fspath - fspath_min;  	} else { -		pr_debug("missed path, not enough space. 
missing bytes: %lu, " -			 "constructed so far: %s\n", -			 (unsigned long)(fspath_min - fspath), fspath_min);  		++ipath->fspath->elem_missed;  		ipath->fspath->bytes_missing += fspath_min - fspath;  		ipath->fspath->bytes_left = 0; @@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,  int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)  {  	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, -				inode_to_path, ipath); +			     inode_to_path, ipath);  }  struct btrfs_data_container *init_data_container(u32 total_bytes) @@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)  	size_t alloc_bytes;  	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); -	data = kmalloc(alloc_bytes, GFP_NOFS); +	data = vmalloc(alloc_bytes);  	if (!data)  		return ERR_PTR(-ENOMEM); @@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)  {  	if (!ipath)  		return; -	kfree(ipath->fspath); +	vfree(ipath->fspath);  	kfree(ipath);  } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 032f4dc7eab..e75533043a5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -33,14 +33,13 @@ struct inode_fs_paths {  typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,  		void *ctx); -typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, -				struct extent_buffer *eb, void *ctx);  int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,  			struct btrfs_path *path);  int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, -			struct btrfs_path *path, struct btrfs_key *found_key); +			struct btrfs_path *path, struct btrfs_key *found_key, +			u64 *flags);  int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,  				struct btrfs_extent_item *ei, u32 item_size, @@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,  					struct btrfs_path *path);  void free_ipath(struct inode_fs_paths *ipath); +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, +			  u64 start_off, struct btrfs_path *path, +			  struct btrfs_inode_extref **ret_extref, +			  u64 *found_off); +  #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5b2ad6bc4fe..ed8ca7ca5ef 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -38,6 +38,7 @@  #define BTRFS_INODE_DELALLOC_META_RESERVED	4  #define BTRFS_INODE_HAS_ORPHAN_ITEM		5  #define BTRFS_INODE_HAS_ASYNC_EXTENT		6 +#define BTRFS_INODE_NEEDS_FULL_SYNC		7  /* in memory btrfs inode */  struct btrfs_inode { @@ -143,6 +144,9 @@ struct btrfs_inode {  	/* flags field from the on disk inode */  	u32 flags; +	/* a local copy of root's last_log_commit */ +	unsigned long last_log_commit; +  	/*  	 * Counters to keep track of the number of extent item's we may use due  	 * to delalloc and such.  
outstanding_extents is the number of extent @@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)  static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)  { -	struct btrfs_root *root = BTRFS_I(inode)->root; -	int ret = 0; - -	mutex_lock(&root->log_mutex);  	if (BTRFS_I(inode)->logged_trans == generation && -	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) -		ret = 1; -	mutex_unlock(&root->log_mutex); -	return ret; +	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit) +		return 1; +	return 0;  }  #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9197e2e3340..5a3e45db642 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -37,8 +37,9 @@   *        the file system was mounted, (i.e., they have been   *        referenced by the super block) or they have been   *        written since then and the write completion callback - *        was called and a FLUSH request to the device where - *        these blocks are located was received and completed. + *        was called and no write error was indicated and a + *        FLUSH request to the device where these blocks are + *        located was received and completed.   *    2b. All referenced blocks need to have a generation   *        number which is equal to the parent's number.   * @@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,  			       (unsigned long long)l->block_ref_to->dev_bytenr,  			       l->block_ref_to->mirror_num);  			ret = -1; +		} else if (l->block_ref_to->iodone_w_error) { +			printk(KERN_INFO "btrfs: attempt to write superblock" +			       " which references block %c @%llu (%s/%llu/%d)" +			       " which has write error!\n", +			       btrfsic_get_block_type(state, l->block_ref_to), +			       (unsigned long long) +			       l->block_ref_to->logical_bytenr, +			       l->block_ref_to->dev_state->name, +			       (unsigned long long)l->block_ref_to->dev_bytenr, +			       l->block_ref_to->mirror_num); +			ret = -1;  		} else if (l->parent_generation !=  			   l->block_ref_to->generation &&  			   BTRFSIC_GENERATION_UNKNOWN != diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 43d1c5a3a03..c6467aa88be 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  	u64 em_start;  	struct extent_map *em;  	int ret = -ENOMEM; +	int faili = 0;  	u32 *sums;  	tree = &BTRFS_I(inode)->io_tree; @@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  	for (pg_index = 0; pg_index < nr_pages; pg_index++) {  		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |  							      __GFP_HIGHMEM); -		if (!cb->compressed_pages[pg_index]) +		if (!cb->compressed_pages[pg_index]) { +			faili = pg_index - 1; +			ret = -ENOMEM;  			goto fail2; +		}  	} +	faili = nr_pages - 1;  	cb->nr_pages = nr_pages;  	add_ra_bio_pages(inode, em_start + em_len, cb); @@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  	return 0;  fail2: -	for (pg_index = 0; pg_index < nr_pages; pg_index++) -		free_page((unsigned long)cb->compressed_pages[pg_index]); +	while (faili >= 0) { +		__free_page(cb->compressed_pages[faili]); +		faili--; +	}  	kfree(cb->compressed_pages);  fail1: diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d183f60d63..b3343621100 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c 
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,  }  /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. - */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, -			    struct btrfs_root *root, -			    struct btrfs_path *path, -			    struct btrfs_key *cpu_key, u32 *data_size, -			    int nr) -{ -	struct extent_buffer *leaf; -	struct btrfs_item *item; -	int ret = 0; -	int slot; -	int i; -	u32 nritems; -	u32 total_data = 0; -	u32 total_size = 0; -	unsigned int data_end; -	struct btrfs_disk_key disk_key; -	struct btrfs_key found_key; -	struct btrfs_map_token token; - -	btrfs_init_map_token(&token); - -	for (i = 0; i < nr; i++) { -		if (total_size + data_size[i] + sizeof(struct btrfs_item) > -		    BTRFS_LEAF_DATA_SIZE(root)) { -			break; -			nr = i; -		} -		total_data += data_size[i]; -		total_size += data_size[i] + sizeof(struct btrfs_item); -	} -	BUG_ON(nr == 0); - -	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); -	if (ret == 0) -		return -EEXIST; -	if (ret < 0) -		goto out; - -	leaf = path->nodes[0]; - -	nritems = btrfs_header_nritems(leaf); -	data_end = leaf_data_end(root, leaf); - -	if (btrfs_leaf_free_space(root, leaf) < total_size) { -		for (i = nr; i >= 0; i--) { -			total_data -= data_size[i]; -			total_size -= data_size[i] + sizeof(struct btrfs_item); -			if (total_size < btrfs_leaf_free_space(root, leaf)) -				break; -		} -		nr = i; -	} - -	slot = path->slots[0]; -	BUG_ON(slot < 0); - -	if (slot != nritems) { -		unsigned int old_data = btrfs_item_end_nr(leaf, slot); - -		item = btrfs_item_nr(leaf, slot); -		btrfs_item_key_to_cpu(leaf, &found_key, slot); - -		/* figure out how many keys we can insert in here */ -		total_data = data_size[0]; -		for (i = 1; i < nr; i++) { -			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) -				break; -			total_data += data_size[i]; -		} -		nr = i; - -		if (old_data < data_end) { -			btrfs_print_leaf(root, leaf); -			printk(KERN_CRIT "slot %d old_data %d data_end %d\n", -			       slot, old_data, data_end); -			BUG_ON(1); -		} -		/* -		 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size -		 */ -		/* first correct the data pointers */ -		for (i = slot; i < nritems; i++) { -			u32 ioff; - -			item = btrfs_item_nr(leaf, i); -			ioff = btrfs_token_item_offset(leaf, item, &token); -			btrfs_set_token_item_offset(leaf, item, -						    ioff - total_data, &token); -		} -		/* shift the items */ -		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), -			      btrfs_item_nr_offset(slot), -			      (nritems - slot) * sizeof(struct btrfs_item)); - -		/* shift the data */ -		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + -			      data_end - total_data, btrfs_leaf_data(leaf) + -			      data_end, old_data - data_end); -		data_end = old_data; -	} else { -		/* -		 * this sucks but it has to be done, if we are inserting at -		 * the end of the leaf only insert 1 of the items, since we -		 * have no way of knowing whats on the next leaf and we'd have -		 * to drop our current locks to figure it out -		 */ -		nr = 1; -	} - -	/* setup the item for the new data */ -	for (i = 0; i < nr; i++) { -		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); -		btrfs_set_item_key(leaf, &disk_key, slot + i); -		item = btrfs_item_nr(leaf, slot + i); -		btrfs_set_token_item_offset(leaf, item, -					    data_end - data_size[i], &token); -		data_end -= data_size[i]; -		btrfs_set_token_item_size(leaf, item, data_size[i], &token); -	} -	btrfs_set_header_nritems(leaf, nritems + nr); -	btrfs_mark_buffer_dirty(leaf); - -	ret = 0; -	if (slot == 0) { -		btrfs_cpu_key_to_disk(&disk_key, cpu_key); -		fixup_low_keys(trans, root, path, &disk_key, 1); -	} - -	if (btrfs_leaf_free_space(root, leaf) < 0) { -		btrfs_print_leaf(root, leaf); -		BUG(); -	} -out: -	if (!ret) -		ret = nr; -	return ret; -} - -/*   * this is a helper for btrfs_insert_empty_items, the main goal here is   * to save stack depth by doing the bulk of the work in a function   * that doesn't call btrfs_search_slot @@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,  			   struct btrfs_path *path,  			   int *level, int root_level)  { +	BUG_ON(*level == 0);  	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],  					path->slots[*level]);  	path->slots[*level - 1] = 0; @@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,  	path->slots[*level]++; -	while (path->slots[*level] == nritems) { +	while (path->slots[*level] >= nritems) {  		if (*level == root_level)  			return -1; @@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  					goto out;  				advance_right = ADVANCE;  			} else { +				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));  				ret = tree_compare_item(left_root, left_path,  						right_path, tmp_buf);  				if (ret) { +					WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));  					ret = changed_cb(left_root, right_root,  						left_path, right_path,  						&left_key, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9821b672f5a..926c9ffc66d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -154,6 +154,13 @@ struct btrfs_ordered_sum;   */  #define BTRFS_NAME_LEN 255 +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. 
+ */ +#define BTRFS_LINK_MAX 65535U +  /* 32 bytes in various csum fields */  #define BTRFS_CSUM_SIZE 32 @@ -489,6 +496,8 @@ struct btrfs_super_block {   */  #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5) +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6) +  #define BTRFS_FEATURE_COMPAT_SUPP		0ULL  #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL  #define BTRFS_FEATURE_INCOMPAT_SUPP			\ @@ -496,7 +505,8 @@ struct btrfs_super_block {  	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\  	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\  	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\ -	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) +	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\ +	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)  /*   * A leaf is full of items. offset and size tell us where to find @@ -643,6 +653,14 @@ struct btrfs_inode_ref {  	/* name goes here */  } __attribute__ ((__packed__)); +struct btrfs_inode_extref { +	__le64 parent_objectid; +	__le64 index; +	__le16 name_len; +	__u8   name[0]; +	/* name goes here */ +} __attribute__ ((__packed__)); +  struct btrfs_timespec {  	__le64 sec;  	__le32 nsec; @@ -1028,12 +1046,22 @@ struct btrfs_space_info {  	wait_queue_head_t wait;  }; +#define	BTRFS_BLOCK_RSV_GLOBAL		1 +#define	BTRFS_BLOCK_RSV_DELALLOC	2 +#define	BTRFS_BLOCK_RSV_TRANS		3 +#define	BTRFS_BLOCK_RSV_CHUNK		4 +#define	BTRFS_BLOCK_RSV_DELOPS		5 +#define	BTRFS_BLOCK_RSV_EMPTY		6 +#define	BTRFS_BLOCK_RSV_TEMP		7 +  struct btrfs_block_rsv {  	u64 size;  	u64 reserved;  	struct btrfs_space_info *space_info;  	spinlock_t lock; -	unsigned int full; +	unsigned short full; +	unsigned short type; +	unsigned short failfast;  };  /* @@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {  	 * Today it will only have one thing on it, but that may change  	 */  	struct list_head cluster_list; + +	/* For delayed block group creation */ +	struct list_head new_bg_list;  };  /* delayed seq elem */ @@ -1240,7 +1271,6 @@ struct btrfs_fs_info {  	struct mutex reloc_mutex;  	struct list_head trans_list; -	struct list_head hashers;  	struct list_head dead_roots;  	struct list_head caching_block_groups; @@ -1366,9 +1396,6 @@ struct btrfs_fs_info {  	struct rb_root defrag_inodes;  	atomic_t defrag_running; -	spinlock_t ref_cache_lock; -	u64 total_ref_cache_size; -  	/*  	 * these three are in extended format (availability of single  	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1441,6 +1468,8 @@ struct btrfs_fs_info {  	/* next backup root to be overwritten */  	int backup_root_index; + +	int num_tolerated_disk_barrier_failures;  };  /* @@ -1481,9 +1510,9 @@ struct btrfs_root {  	wait_queue_head_t log_commit_wait[2];  	atomic_t log_writers;  	atomic_t log_commit[2]; +	atomic_t log_batch;  	unsigned long log_transid;  	unsigned long last_log_commit; -	unsigned long log_batch;  	pid_t log_start_pid;  	bool log_multiple_pids; @@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {   */  #define BTRFS_INODE_ITEM_KEY		1  #define BTRFS_INODE_REF_KEY		12 +#define BTRFS_INODE_EXTREF_KEY		13  #define BTRFS_XATTR_ITEM_KEY		24  #define BTRFS_ORPHAN_ITEM_KEY		48  /* reserve 2-15 close to the inode for later flexibility */ @@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,  BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);  BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, +		   parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct 
btrfs_inode_extref, +		   name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); +  /* struct btrfs_inode_item */  BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);  BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); @@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  			   u64 size);  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root, u64 group_start); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root);  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);  void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);  void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, +					      unsigned short type);  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv);  int btrfs_block_rsv_add(struct btrfs_root *root, @@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   const char *name, int name_len,  			   u64 inode_objectid, u64 ref_objectid, u64 *index); -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, -			struct btrfs_path *path, -			const char *name, int name_len, -			u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, +			      struct btrfs_root *root, +			      struct btrfs_path *path, +			      const char *name, int name_len, +			      u64 inode_objectid, u64 ref_objectid, int mod, +			      u64 *ret_index);  int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root,  			     struct btrfs_path *path, u64 objectid); @@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root  		       *root, struct btrfs_path *path,  		       struct btrfs_key *location, int mod); +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, +			  struct btrfs_path *path, +			  const char *name, int name_len, +			  u64 inode_objectid, u64 ref_objectid, int ins_len, +			  int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, +				   u64 ref_objectid, const char *name, +				   int name_len, +				   struct btrfs_inode_extref **extref_ret); +  /* file-item.c */  int btrfs_del_csums(struct btrfs_trans_handle *trans,  		    struct btrfs_root *root, u64 bytenr, u64 len); @@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,  			struct btrfs_root *root,  			struct inode *dir, u64 objectid,  			const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, +			int front);  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  			       
struct btrfs_root *root,  			       struct inode *inode, u64 new_size, @@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);  int btrfs_defrag_file(struct inode *inode, struct file *file,  		      struct btrfs_ioctl_defrag_range_args *range,  		      u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, +				struct btrfs_ioctl_space_info *space); +  /* file.c */  int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  			   struct inode *inode);  int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, -			    int skip_pinned); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +			     int skip_pinned); +int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, +			       u64 start, u64 end, int skip_pinned, +			       int modified);  extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, -		       u64 start, u64 end, u64 *hint_byte, int drop_cache); +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, +			 struct btrfs_root *root, struct inode *inode, +			 struct btrfs_path *path, u64 start, u64 end, +			 u64 *drop_end, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, +		       struct btrfs_root *root, struct inode *inode, u64 start, +		       u64 end, int drop_cache);  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,  			      struct inode *inode, u64 start, u64 end);  int btrfs_release_file(struct inode *inode, struct file *file); @@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,  	}  } +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ +  #define btrfs_abort_transaction(trans, root, errno)		\  do {								\  	__btrfs_abort_transaction(trans, root, __func__,	\ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 52c85e2b95d..478f66bdc57 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;  int __init btrfs_delayed_inode_init(void)  { -	delayed_node_cache = kmem_cache_create("delayed_node", +	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",  					sizeof(struct btrfs_delayed_node),  					0,  					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, @@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(  	 * we're accounted for.  	 
*/  	if (!src_rsv || (!trans->bytes_reserved && -	    src_rsv != &root->fs_info->delalloc_block_rsv)) { +			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {  		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);  		/*  		 * Since we're under a transaction reserve_metadata_bytes could @@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(  						      num_bytes, 1);  		}  		return ret; -	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) { +	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {  		spin_lock(&BTRFS_I(inode)->lock);  		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,  				       &BTRFS_I(inode)->runtime_flags)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 22e98e04c2e..7cda51995c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,10 @@  #include "check-integrity.h"  #include "rcu-string.h" +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif +  static struct extent_io_ops btree_extent_io_ops;  static void end_workqueue_fn(struct btrfs_work *work);  static void free_fs_root(struct btrfs_root *root); @@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,  	write_lock(&em_tree->lock);  	ret = add_extent_mapping(em_tree, em);  	if (ret == -EEXIST) { -		u64 failed_start = em->start; -		u64 failed_len = em->len; -  		free_extent_map(em);  		em = lookup_extent_mapping(em_tree, start, len); -		if (em) { -			ret = 0; -		} else { -			em = lookup_extent_mapping(em_tree, failed_start, -						   failed_len); -			ret = -EIO; -		} +		if (!em) +			em = ERR_PTR(-EIO);  	} else if (ret) {  		free_extent_map(em); -		em = NULL; +		em = ERR_PTR(ret);  	}  	write_unlock(&em_tree->lock); -	if (ret) -		em = ERR_PTR(ret);  out:  	return em;  } @@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)  		WARN_ON(1);  		return 0;  	} -	if (eb->pages[0] != page) { -		WARN_ON(1); -		return 0; -	}  	if (!PageUptodate(page)) {  		WARN_ON(1);  		return 0; @@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,  	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);  } +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ +	if (bio_flags & EXTENT_BIO_TREE_LOG) +		return 0; +#ifdef CONFIG_X86 +	if (cpu_has_xmm4_2) +		return 0; +#endif +	return 1; +} +  static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  				 int mirror_num, unsigned long bio_flags,  				 u64 bio_offset)  { +	int async = check_async_write(inode, bio_flags);  	int ret;  	if (!(rw & REQ_WRITE)) { @@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  			return ret;  		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,  				     mirror_num, 0); +	} else if (!async) { +		ret = btree_csum_one_bio(bio); +		if (ret) +			return ret; +		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, +				     mirror_num, 0);  	}  	/* @@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	atomic_set(&root->log_commit[0], 0);  	atomic_set(&root->log_commit[1], 0);  	atomic_set(&root->log_writers, 0); +	atomic_set(&root->log_batch, 0);  	atomic_set(&root->orphan_inodes, 0); -	root->log_batch = 0;  	root->log_transid = 0;  	root->last_log_commit = 0;  	extent_io_tree_init(&root->dirty_log_pages, @@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)  		spin_unlock(&root->fs_info->trans_lock);  		/* If the file system is 
aborted, this will always fail. */ -		trans = btrfs_join_transaction(root); +		trans = btrfs_attach_transaction(root);  		if (IS_ERR(trans)) { -			cannot_commit = true; +			if (PTR_ERR(trans) != -ENOENT) +				cannot_commit = true;  			goto sleep;  		}  		if (transid == trans->transid) { @@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,  	INIT_LIST_HEAD(&fs_info->trans_list);  	INIT_LIST_HEAD(&fs_info->dead_roots);  	INIT_LIST_HEAD(&fs_info->delayed_iputs); -	INIT_LIST_HEAD(&fs_info->hashers);  	INIT_LIST_HEAD(&fs_info->delalloc_inodes);  	INIT_LIST_HEAD(&fs_info->ordered_operations);  	INIT_LIST_HEAD(&fs_info->caching_block_groups);  	spin_lock_init(&fs_info->delalloc_lock);  	spin_lock_init(&fs_info->trans_lock); -	spin_lock_init(&fs_info->ref_cache_lock);  	spin_lock_init(&fs_info->fs_roots_radix_lock);  	spin_lock_init(&fs_info->delayed_iput_lock);  	spin_lock_init(&fs_info->defrag_inodes_lock); @@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,  	INIT_LIST_HEAD(&fs_info->space_info);  	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);  	btrfs_mapping_init(&fs_info->mapping_tree); -	btrfs_init_block_rsv(&fs_info->global_block_rsv); -	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); -	btrfs_init_block_rsv(&fs_info->trans_block_rsv); -	btrfs_init_block_rsv(&fs_info->chunk_block_rsv); -	btrfs_init_block_rsv(&fs_info->empty_block_rsv); -	btrfs_init_block_rsv(&fs_info->delayed_block_rsv); +	btrfs_init_block_rsv(&fs_info->global_block_rsv, +			     BTRFS_BLOCK_RSV_GLOBAL); +	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, +			     BTRFS_BLOCK_RSV_DELALLOC); +	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); +	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); +	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); +	btrfs_init_block_rsv(&fs_info->delayed_block_rsv, +			     BTRFS_BLOCK_RSV_DELOPS);  	atomic_set(&fs_info->nr_async_submits, 0);  	atomic_set(&fs_info->async_delalloc_pages, 0);  	atomic_set(&fs_info->async_submit_draining, 0); @@ -2491,6 +2501,8 @@ retry_root_backup:  		printk(KERN_ERR "Failed to read block groups: %d\n", ret);  		goto fail_block_groups;  	} +	fs_info->num_tolerated_disk_barrier_failures = +		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);  	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,  					       "btrfs-cleaner"); @@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)  			printk_in_rcu("btrfs: disabling barriers on dev %s\n",  				      rcu_str_deref(device->name));  			device->nobarriers = 1; -		} -		if (!bio_flagged(bio, BIO_UPTODATE)) { +		} else if (!bio_flagged(bio, BIO_UPTODATE)) {  			ret = -EIO; -			if (!bio_flagged(bio, BIO_EOPNOTSUPP)) -				btrfs_dev_stat_inc_and_print(device, -					BTRFS_DEV_STAT_FLUSH_ERRS); +			btrfs_dev_stat_inc_and_print(device, +				BTRFS_DEV_STAT_FLUSH_ERRS);  		}  		/* drop the reference from the wait == 0 run */ @@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)  {  	struct list_head *head;  	struct btrfs_device *dev; -	int errors = 0; +	int errors_send = 0; +	int errors_wait = 0;  	int ret;  	/* send down all the barriers */  	head = &info->fs_devices->devices;  	list_for_each_entry_rcu(dev, head, dev_list) {  		if (!dev->bdev) { -			errors++; +			errors_send++;  			continue;  		}  		if (!dev->in_fs_metadata || !dev->writeable) @@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)  		ret = write_dev_flush(dev, 0); 
 		if (ret) -			errors++; +			errors_send++;  	}  	/* wait for all the barriers */  	list_for_each_entry_rcu(dev, head, dev_list) {  		if (!dev->bdev) { -			errors++; +			errors_wait++;  			continue;  		}  		if (!dev->in_fs_metadata || !dev->writeable) @@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)  		ret = write_dev_flush(dev, 1);  		if (ret) -			errors++; +			errors_wait++;  	} -	if (errors) +	if (errors_send > info->num_tolerated_disk_barrier_failures || +	    errors_wait > info->num_tolerated_disk_barrier_failures)  		return -EIO;  	return 0;  } +int btrfs_calc_num_tolerated_disk_barrier_failures( +	struct btrfs_fs_info *fs_info) +{ +	struct btrfs_ioctl_space_info space; +	struct btrfs_space_info *sinfo; +	u64 types[] = {BTRFS_BLOCK_GROUP_DATA, +		       BTRFS_BLOCK_GROUP_SYSTEM, +		       BTRFS_BLOCK_GROUP_METADATA, +		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; +	int num_types = 4; +	int i; +	int c; +	int num_tolerated_disk_barrier_failures = +		(int)fs_info->fs_devices->num_devices; + +	for (i = 0; i < num_types; i++) { +		struct btrfs_space_info *tmp; + +		sinfo = NULL; +		rcu_read_lock(); +		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { +			if (tmp->flags == types[i]) { +				sinfo = tmp; +				break; +			} +		} +		rcu_read_unlock(); + +		if (!sinfo) +			continue; + +		down_read(&sinfo->groups_sem); +		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { +			if (!list_empty(&sinfo->block_groups[c])) { +				u64 flags; + +				btrfs_get_block_group_info( +					&sinfo->block_groups[c], &space); +				if (space.total_bytes == 0 || +				    space.used_bytes == 0) +					continue; +				flags = space.flags; +				/* +				 * return +				 * 0: if dup, single or RAID0 is configured for +				 *    any of metadata, system or data, else +				 * 1: if RAID5 is configured, or if RAID1 or +				 *    RAID10 is configured and only two mirrors +				 *    are used, else +				 * 2: if RAID6 is configured, else +				 * num_mirrors - 1: if RAID1 or RAID10 is +				 *                  configured and more than +				 *                  2 mirrors are used. 
+				 */ +				if (num_tolerated_disk_barrier_failures > 0 && +				    ((flags & (BTRFS_BLOCK_GROUP_DUP | +					       BTRFS_BLOCK_GROUP_RAID0)) || +				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) +				      == 0))) +					num_tolerated_disk_barrier_failures = 0; +				else if (num_tolerated_disk_barrier_failures > 1 +					 && +					 (flags & (BTRFS_BLOCK_GROUP_RAID1 | +						   BTRFS_BLOCK_GROUP_RAID10))) +					num_tolerated_disk_barrier_failures = 1; +			} +		} +		up_read(&sinfo->groups_sem); +	} + +	return num_tolerated_disk_barrier_failures; +} +  int write_all_supers(struct btrfs_root *root, int max_mirrors)  {  	struct list_head *head; @@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)  	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);  	head = &root->fs_info->fs_devices->devices; -	if (do_barriers) -		barrier_all_devices(root->fs_info); +	if (do_barriers) { +		ret = barrier_all_devices(root->fs_info); +		if (ret) { +			mutex_unlock( +				&root->fs_info->fs_devices->device_list_mutex); +			btrfs_error(root->fs_info, ret, +				    "errors while submitting device barriers."); +			return ret; +		} +	}  	list_for_each_entry_rcu(dev, head, dev_list) {  		if (!dev->bdev) { @@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)  		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",  		       (unsigned long long)fs_info->delalloc_bytes);  	} -	if (fs_info->total_ref_cache_size) { -		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", -		       (unsigned long long)fs_info->total_ref_cache_size); -	}  	free_extent_buffer(fs_info->extent_root->node);  	free_extent_buffer(fs_info->extent_root->commit_root); @@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)  	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);  } -int btree_lock_page_hook(struct page *page, void *data, -				void (*flush_fn)(void *)) -{ -	struct inode *inode = page->mapping->host; -	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct extent_buffer *eb; - -	/* -	 * We culled this eb but the page is still hanging out on the mapping, -	 * carry on. 
-	 */ -	if (!PagePrivate(page)) -		goto out; - -	eb = (struct extent_buffer *)page->private; -	if (!eb) { -		WARN_ON(1); -		goto out; -	} -	if (page != eb->pages[0]) -		goto out; - -	if (!btrfs_try_tree_write_lock(eb)) { -		flush_fn(data); -		btrfs_tree_lock(eb); -	} -	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - -	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { -		spin_lock(&root->fs_info->delalloc_lock); -		if (root->fs_info->dirty_metadata_bytes >= eb->len) -			root->fs_info->dirty_metadata_bytes -= eb->len; -		else -			WARN_ON(1); -		spin_unlock(&root->fs_info->delalloc_lock); -	} - -	btrfs_tree_unlock(eb); -out: -	if (!trylock_page(page)) { -		flush_fn(data); -		lock_page(page); -	} -	return 0; -} -  static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,  			      int read_only)  { @@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,  	while (1) {  		ret = find_first_extent_bit(dirty_pages, start, &start, &end, -					    mark); +					    mark, NULL);  		if (ret)  			break; @@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,  again:  	while (1) {  		ret = find_first_extent_bit(unpin, 0, &start, &end, -					    EXTENT_DIRTY); +					    EXTENT_DIRTY, NULL);  		if (ret)  			break; @@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)  }  static struct extent_io_ops btree_extent_io_ops = { -	.write_cache_pages_lock_hook = btree_lock_page_hook,  	.readpage_end_io_hook = btree_readpage_end_io_hook,  	.readpage_io_failed_hook = btree_io_failed_hook,  	.submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c5b00a735fe..2025a9132c1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  				     u64 objectid);  int btree_lock_page_hook(struct page *page, void *data,  				void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( +	struct btrfs_fs_info *fs_info);  #ifdef CONFIG_DEBUG_LOCK_ALLOC  void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ba58024d40d..3d3e2c17d8d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     u64 flags, struct btrfs_disk_key *key,  				     int level, struct btrfs_key *ins);  static int do_chunk_alloc(struct btrfs_trans_handle *trans, -			  struct btrfs_root *extent_root, u64 alloc_bytes, -			  u64 flags, int force); +			  struct btrfs_root *extent_root, u64 flags, +			  int force);  static int find_next_key(struct btrfs_path *path, int level,  			 struct btrfs_key *key);  static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,  	while (start < end) {  		ret = find_first_extent_bit(info->pinned_extents, start,  					    &extent_start, &extent_end, -					    EXTENT_DIRTY | EXTENT_UPTODATE); +					    EXTENT_DIRTY | EXTENT_UPTODATE, +					    NULL);  		if (ret)  			break; @@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		}  next: -		do_chunk_alloc(trans, fs_info->extent_root, -			       2 * 1024 * 1024, -			       btrfs_get_alloc_profile(root, 0), -			       CHUNK_ALLOC_NO_FORCE);  		cond_resched();  		spin_lock(&delayed_refs->lock);  	} @@ -2478,10 +2475,6 @@ int 
btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	if (root == root->fs_info->extent_root)  		root = root->fs_info->tree_root; -	do_chunk_alloc(trans, root->fs_info->extent_root, -		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), -		       CHUNK_ALLOC_NO_FORCE); -  	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);  	delayed_refs = &trans->transaction->delayed_refs; @@ -2551,6 +2544,12 @@ again:  	}  	if (run_all) { +		if (!list_empty(&trans->new_bgs)) { +			spin_unlock(&delayed_refs->lock); +			btrfs_create_pending_block_groups(trans, root); +			spin_lock(&delayed_refs->lock); +		} +  		node = rb_first(&delayed_refs->root);  		if (!node)  			goto out; @@ -3406,7 +3405,6 @@ alloc:  				return PTR_ERR(trans);  			ret = do_chunk_alloc(trans, root->fs_info->extent_root, -					     bytes + 2 * 1024 * 1024,  					     alloc_target,  					     CHUNK_ALLOC_NO_FORCE);  			btrfs_end_transaction(trans, root); @@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)  }  static int should_alloc_chunk(struct btrfs_root *root, -			      struct btrfs_space_info *sinfo, u64 alloc_bytes, -			      int force) +			      struct btrfs_space_info *sinfo, int force)  {  	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;  	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,  	 * and purposes it's used space.  Don't worry about locking the  	 * global_rsv, it doesn't change except when the transaction commits.  	 */ -	num_allocated += global_rsv->size; +	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) +		num_allocated += global_rsv->size;  	/*  	 * in limited mode, we want to have some free space up to @@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,  		if (num_bytes - num_allocated < thresh)  			return 1;  	} -	thresh = btrfs_super_total_bytes(root->fs_info->super_copy); -	/* 256MB or 2% of the FS */ -	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); -	/* system chunks need a much small threshold */ -	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) -		thresh = 32 * 1024 * 1024; - -	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) +	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))  		return 0;  	return 1;  } @@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,  }  static int do_chunk_alloc(struct btrfs_trans_handle *trans, -			  struct btrfs_root *extent_root, u64 alloc_bytes, -			  u64 flags, int force) +			  struct btrfs_root *extent_root, u64 flags, int force)  {  	struct btrfs_space_info *space_info;  	struct btrfs_fs_info *fs_info = extent_root->fs_info; @@ -3601,7 +3591,7 @@ again:  		return 0;  	} -	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { +	if (!should_alloc_chunk(extent_root, space_info, force)) {  		spin_unlock(&space_info->lock);  		return 0;  	} else if (space_info->chunk_alloc) { @@ -3669,6 +3659,46 @@ out:  	return ret;  } +static int can_overcommit(struct btrfs_root *root, +			  struct btrfs_space_info *space_info, u64 bytes, +			  int flush) +{ +	u64 profile = btrfs_get_alloc_profile(root, 0); +	u64 avail; +	u64 used; + +	used = space_info->bytes_used + space_info->bytes_reserved + +		space_info->bytes_pinned + space_info->bytes_readonly + +		space_info->bytes_may_use; + +	spin_lock(&root->fs_info->free_chunk_lock); +	avail = root->fs_info->free_chunk_space; +	
spin_unlock(&root->fs_info->free_chunk_lock); + +	/* +	 * If we have dup, raid1 or raid10 then only half of the free +	 * space is actually useable. +	 */ +	if (profile & (BTRFS_BLOCK_GROUP_DUP | +		       BTRFS_BLOCK_GROUP_RAID1 | +		       BTRFS_BLOCK_GROUP_RAID10)) +		avail >>= 1; + +	/* +	 * If we aren't flushing don't let us overcommit too much, say +	 * 1/8th of the space.  If we can flush, let it overcommit up to +	 * 1/2 of the space. +	 */ +	if (flush) +		avail >>= 3; +	else +		avail >>= 1; + +	if (used + bytes < space_info->total_bytes + avail) +		return 1; +	return 0; +} +  /*   * shrink metadata reservation for delalloc   */ @@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  	if (delalloc_bytes == 0) {  		if (trans)  			return; -		btrfs_wait_ordered_extents(root, 0, 0); +		btrfs_wait_ordered_extents(root, 0);  		return;  	} @@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,  					       WB_REASON_FS_FREE_SPACE); +		/* +		 * We need to wait for the async pages to actually start before +		 * we do anything. +		 */ +		wait_event(root->fs_info->async_submit_wait, +			   !atomic_read(&root->fs_info->async_delalloc_pages)); +  		spin_lock(&space_info->lock); -		if (space_info->bytes_used + space_info->bytes_reserved + -		    space_info->bytes_pinned + space_info->bytes_readonly + -		    space_info->bytes_may_use + orig <= -		    space_info->total_bytes) { +		if (can_overcommit(root, space_info, orig, !trans)) {  			spin_unlock(&space_info->lock);  			break;  		} @@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  		loops++;  		if (wait_ordered && !trans) { -			btrfs_wait_ordered_extents(root, 0, 0); +			btrfs_wait_ordered_extents(root, 0);  		} else {  			time_left = schedule_timeout_killable(1);  			if (time_left) @@ -3784,11 +3818,12 @@ commit:  }  enum flush_state { -	FLUSH_DELALLOC		=	1, -	FLUSH_DELALLOC_WAIT	=	2, -	FLUSH_DELAYED_ITEMS_NR	=	3, -	FLUSH_DELAYED_ITEMS	=	4, -	COMMIT_TRANS		=	5, +	FLUSH_DELAYED_ITEMS_NR	=	1, +	FLUSH_DELAYED_ITEMS	=	2, +	FLUSH_DELALLOC		=	3, +	FLUSH_DELALLOC_WAIT	=	4, +	ALLOC_CHUNK		=	5, +	COMMIT_TRANS		=	6,  };  static int flush_space(struct btrfs_root *root, @@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,  	int ret = 0;  	switch (state) { -	case FLUSH_DELALLOC: -	case FLUSH_DELALLOC_WAIT: -		shrink_delalloc(root, num_bytes, orig_bytes, -				state == FLUSH_DELALLOC_WAIT); -		break;  	case FLUSH_DELAYED_ITEMS_NR:  	case FLUSH_DELAYED_ITEMS:  		if (state == FLUSH_DELAYED_ITEMS_NR) { @@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,  		ret = btrfs_run_delayed_items_nr(trans, root, nr);  		btrfs_end_transaction(trans, root);  		break; +	case FLUSH_DELALLOC: +	case FLUSH_DELALLOC_WAIT: +		shrink_delalloc(root, num_bytes, orig_bytes, +				state == FLUSH_DELALLOC_WAIT); +		break; +	case ALLOC_CHUNK: +		trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			break; +		} +		ret = do_chunk_alloc(trans, root->fs_info->extent_root, +				     btrfs_get_alloc_profile(root, 0), +				     CHUNK_ALLOC_NO_FORCE); +		btrfs_end_transaction(trans, root); +		if (ret == -ENOSPC) +			ret = 0; +		break;  	case COMMIT_TRANS:  		ret = may_commit_transaction(root, space_info, orig_bytes, 0);  		break; @@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,  	
struct btrfs_space_info *space_info = block_rsv->space_info;  	u64 used;  	u64 num_bytes = orig_bytes; -	int flush_state = FLUSH_DELALLOC; +	int flush_state = FLUSH_DELAYED_ITEMS_NR;  	int ret = 0;  	bool flushing = false; -	bool committed = false;  again:  	ret = 0; @@ -3922,57 +3969,12 @@ again:  			(orig_bytes * 2);  	} -	if (ret) { -		u64 profile = btrfs_get_alloc_profile(root, 0); -		u64 avail; - -		/* -		 * If we have a lot of space that's pinned, don't bother doing -		 * the overcommit dance yet and just commit the transaction. -		 */ -		avail = (space_info->total_bytes - space_info->bytes_used) * 8; -		do_div(avail, 10); -		if (space_info->bytes_pinned >= avail && flush && !committed) { -			space_info->flush = 1; -			flushing = true; -			spin_unlock(&space_info->lock); -			ret = may_commit_transaction(root, space_info, -						     orig_bytes, 1); -			if (ret) -				goto out; -			committed = true; -			goto again; -		} - -		spin_lock(&root->fs_info->free_chunk_lock); -		avail = root->fs_info->free_chunk_space; - -		/* -		 * If we have dup, raid1 or raid10 then only half of the free -		 * space is actually useable. -		 */ -		if (profile & (BTRFS_BLOCK_GROUP_DUP | -			       BTRFS_BLOCK_GROUP_RAID1 | -			       BTRFS_BLOCK_GROUP_RAID10)) -			avail >>= 1; - -		/* -		 * If we aren't flushing don't let us overcommit too much, say -		 * 1/8th of the space.  If we can flush, let it overcommit up to -		 * 1/2 of the space. -		 */ -		if (flush) -			avail >>= 3; -		else -			avail >>= 1; -		 spin_unlock(&root->fs_info->free_chunk_lock); - -		if (used + num_bytes < space_info->total_bytes + avail) { -			space_info->bytes_may_use += orig_bytes; -			trace_btrfs_space_reservation(root->fs_info, -				"space_info", space_info->flags, orig_bytes, 1); -			ret = 0; -		} +	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { +		space_info->bytes_may_use += orig_bytes; +		trace_btrfs_space_reservation(root->fs_info, "space_info", +					      space_info->flags, orig_bytes, +					      1); +		ret = 0;  	}  	/* @@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,  	return 0;  } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)  {  	memset(rsv, 0, sizeof(*rsv));  	spin_lock_init(&rsv->lock); +	rsv->type = type;  } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, +					      unsigned short type)  {  	struct btrfs_block_rsv *block_rsv;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)  	if (!block_rsv)  		return NULL; -	btrfs_init_block_rsv(block_rsv); +	btrfs_init_block_rsv(block_rsv, type);  	block_rsv->space_info = __find_space_info(fs_info,  						  BTRFS_BLOCK_GROUP_METADATA);  	return block_rsv; @@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv)  { +	if (!rsv) +		return;  	btrfs_block_rsv_release(root, rsv, (u64)-1);  	kfree(rsv);  } @@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,  	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);  	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;  	/* -	 * two for root back/forward refs, two for directory entries -	 * and one for root of the snapshot. 
+	 * two for root back/forward refs, two for directory entries, +	 * one for root of the snapshot and one for parent inode.  	 */ -	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); +	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);  	dst_rsv->space_info = src_rsv->space_info;  	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);  } @@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  	while (1) {  		ret = find_first_extent_bit(unpin, 0, &start, &end, -					    EXTENT_DIRTY); +					    EXTENT_DIRTY, NULL);  		if (ret)  			break; @@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			ret = remove_extent_backref(trans, extent_root, path,  						    NULL, refs_to_drop,  						    is_data); -			if (ret) -				goto abort; +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  			btrfs_release_path(path);  			path->leave_spinning = 1; @@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  					btrfs_print_leaf(extent_root,  							 path->nodes[0]);  			} -			if (ret < 0) -				goto abort; +			if (ret < 0) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  			extent_slot = path->slots[0];  		}  	} else if (ret == -ENOENT) { @@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		       (unsigned long long)owner_objectid,  		       (unsigned long long)owner_offset);  	} else { -		goto abort; +		btrfs_abort_transaction(trans, extent_root, ret); +		goto out;  	}  	leaf = path->nodes[0]; @@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		BUG_ON(found_extent || extent_slot != path->slots[0]);  		ret = convert_extent_item_v0(trans, extent_root, path,  					     owner_objectid, 0); -		if (ret < 0) -			goto abort; +		if (ret < 0) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		}  		btrfs_release_path(path);  		path->leave_spinning = 1; @@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			       (unsigned long long)bytenr);  			btrfs_print_leaf(extent_root, path->nodes[0]);  		} -		if (ret < 0) -			goto abort; +		if (ret < 0) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		} +  		extent_slot = path->slots[0];  		leaf = path->nodes[0];  		item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			ret = remove_extent_backref(trans, extent_root, path,  						    iref, refs_to_drop,  						    is_data); -			if (ret) -				goto abort; +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  		}  	} else {  		if (found_extent) { @@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],  				      num_to_del); -		if (ret) -			goto abort; +		if (ret) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		}  		btrfs_release_path(path);  		if (is_data) {  			ret = btrfs_del_csums(trans, root, bytenr, num_bytes); -			if (ret) -				goto abort; +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  		}  		ret = update_block_group(trans, root, bytenr, num_bytes, 0); -		if (ret) -			goto abort; +		if (ret) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		}  	}  out:  	
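	/*
	 * Editor's note: the shared "abort:" label that used to follow this
	 * function is removed below; every failure site above now calls
	 * btrfs_abort_transaction() itself before jumping here, which keeps
	 * each error path explicit.
	 */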
btrfs_free_path(path);  	return ret; - -abort: -	btrfs_abort_transaction(trans, extent_root, ret); -	goto out;  }  /* @@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	struct btrfs_block_group_cache *used_block_group;  	u64 search_start = 0;  	int empty_cluster = 2 * 1024 * 1024; -	int allowed_chunk_alloc = 0; -	int done_chunk_alloc = 0;  	struct btrfs_space_info *space_info;  	int loop = 0;  	int index = 0; @@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	if (btrfs_mixed_space_info(space_info))  		use_cluster = false; -	if (orig_root->ref_cows || empty_size) -		allowed_chunk_alloc = 1; -  	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {  		last_ptr = &root->fs_info->meta_alloc_cluster;  		if (!btrfs_test_opt(root, SSD)) @@ -5806,10 +5821,6 @@ checks:  		trace_btrfs_reserve_extent(orig_root, block_group,  					   search_start, num_bytes); -		if (offset < search_start) -			btrfs_add_free_space(used_block_group, offset, -					     search_start - offset); -		BUG_ON(offset > search_start);  		if (used_block_group != block_group)  			btrfs_put_block_group(used_block_group);  		btrfs_put_block_group(block_group); @@ -5842,34 +5853,17 @@ loop:  		index = 0;  		loop++;  		if (loop == LOOP_ALLOC_CHUNK) { -		       if (allowed_chunk_alloc) { -				ret = do_chunk_alloc(trans, root, num_bytes + -						     2 * 1024 * 1024, data, -						     CHUNK_ALLOC_LIMITED); -				/* -				 * Do not bail out on ENOSPC since we -				 * can do more things. -				 */ -				if (ret < 0 && ret != -ENOSPC) { -					btrfs_abort_transaction(trans, -								root, ret); -					goto out; -				} -				allowed_chunk_alloc = 0; -				if (ret == 1) -					done_chunk_alloc = 1; -			} else if (!done_chunk_alloc && -				   space_info->force_alloc == -				   CHUNK_ALLOC_NO_FORCE) { -				space_info->force_alloc = CHUNK_ALLOC_LIMITED; +			ret = do_chunk_alloc(trans, root, data, +					     CHUNK_ALLOC_FORCE); +			/* +			 * Do not bail out on ENOSPC since we +			 * can do more things. +			 */ +			if (ret < 0 && ret != -ENOSPC) { +				btrfs_abort_transaction(trans, +							root, ret); +				goto out;  			} - -		       /* -			* We didn't allocate a chunk, go ahead and drop the -			* empty size and loop again. 
-			*/ -		       if (!done_chunk_alloc) -			       loop = LOOP_NO_EMPTY_SIZE;  		}  		if (loop == LOOP_NO_EMPTY_SIZE) { @@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,  	data = btrfs_get_alloc_profile(root, data);  again: -	/* -	 * the only place that sets empty_size is btrfs_realloc_node, which -	 * is not called recursively on allocations -	 */ -	if (empty_size || root->ref_cows) { -		ret = do_chunk_alloc(trans, root->fs_info->extent_root, -				     num_bytes + 2 * 1024 * 1024, data, -				     CHUNK_ALLOC_NO_FORCE); -		if (ret < 0 && ret != -ENOSPC) { -			btrfs_abort_transaction(trans, root, ret); -			return ret; -		} -	} -  	WARN_ON(num_bytes < root->sectorsize);  	ret = find_free_extent(trans, root, num_bytes, empty_size,  			       hint_byte, ins, data); @@ -5967,12 +5947,6 @@ again:  			num_bytes = num_bytes >> 1;  			num_bytes = num_bytes & ~(root->sectorsize - 1);  			num_bytes = max(num_bytes, min_alloc_size); -			ret = do_chunk_alloc(trans, root->fs_info->extent_root, -				       num_bytes, data, CHUNK_ALLOC_FORCE); -			if (ret < 0 && ret != -ENOSPC) { -				btrfs_abort_transaction(trans, root, ret); -				return ret; -			}  			if (num_bytes == min_alloc_size)  				final_tried = true;  			goto again; @@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	ret = block_rsv_use_bytes(block_rsv, blocksize);  	if (!ret)  		return block_rsv; -	if (ret) { +	if (ret && !block_rsv->failfast) {  		static DEFINE_RATELIMIT_STATE(_rs,  				DEFAULT_RATELIMIT_INTERVAL,  				/*DEFAULT_RATELIMIT_BURST*/ 2); @@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,  	alloc_flags = update_block_group_flags(root, cache->flags);  	if (alloc_flags != cache->flags) { -		ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, +		ret = do_chunk_alloc(trans, root, alloc_flags,  				     CHUNK_ALLOC_FORCE);  		if (ret < 0)  			goto out; @@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,  	if (!ret)  		goto out;  	alloc_flags = get_alloc_profile(root, cache->space_info->flags); -	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, +	ret = do_chunk_alloc(trans, root, alloc_flags,  			     CHUNK_ALLOC_FORCE);  	if (ret < 0)  		goto out; @@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,  			    struct btrfs_root *root, u64 type)  {  	u64 alloc_flags = get_alloc_profile(root, type); -	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, +	return do_chunk_alloc(trans, root, alloc_flags,  			      CHUNK_ALLOC_FORCE);  } @@ -7810,6 +7784,34 @@ error:  	return ret;  } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root) +{ +	struct btrfs_block_group_cache *block_group, *tmp; +	struct btrfs_root *extent_root = root->fs_info->extent_root; +	struct btrfs_block_group_item item; +	struct btrfs_key key; +	int ret = 0; + +	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, +				 new_bg_list) { +		list_del_init(&block_group->new_bg_list); + +		if (ret) +			continue; + +		spin_lock(&block_group->lock); +		memcpy(&item, &block_group->item, sizeof(item)); +		memcpy(&key, &block_group->key, sizeof(key)); +		spin_unlock(&block_group->lock); + +		ret = btrfs_insert_item(trans, extent_root, &key, &item, +					sizeof(item)); +		if (ret) +			btrfs_abort_transaction(trans, extent_root, ret); +	} +} +  int btrfs_make_block_group(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, u64 bytes_used,  			 
  u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	spin_lock_init(&cache->lock);  	INIT_LIST_HEAD(&cache->list);  	INIT_LIST_HEAD(&cache->cluster_list); +	INIT_LIST_HEAD(&cache->new_bg_list);  	btrfs_init_free_space_ctl(cache); @@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	ret = btrfs_add_block_group_cache(root->fs_info, cache);  	BUG_ON(ret); /* Logic error */ -	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, -				sizeof(cache->item)); -	if (ret) { -		btrfs_abort_transaction(trans, extent_root, ret); -		return ret; -	} +	list_add_tail(&cache->new_bg_list, &trans->new_bgs);  	set_avail_alloc_bits(extent_root->fs_info, type); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b08ea4717e9..8036d3a8485 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -45,6 +45,7 @@ struct extent_page_data {  	struct bio *bio;  	struct extent_io_tree *tree;  	get_extent_t *get_extent; +	unsigned long bio_flags;  	/* tells writepage not to lock the state bits for this range  	 * it still does the unlocking @@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)  int __init extent_io_init(void)  { -	extent_state_cache = kmem_cache_create("extent_state", +	extent_state_cache = kmem_cache_create("btrfs_extent_state",  			sizeof(struct extent_state), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!extent_state_cache)  		return -ENOMEM; -	extent_buffer_cache = kmem_cache_create("extent_buffers", +	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",  			sizeof(struct extent_buffer), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!extent_buffer_cache) @@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,   * @end:	the end offset in bytes (inclusive)   * @bits:	the bits to set in this range   * @clear_bits:	the bits to clear in this range + * @cached_state:	state that we're going to cache   * @mask:	the allocation mask   *   * This will go through and set bits for the given range.  If any states exist @@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,   * boundary bits like LOCK.   */  int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, -		       int bits, int clear_bits, gfp_t mask) +		       int bits, int clear_bits, +		       struct extent_state **cached_state, gfp_t mask)  {  	struct extent_state *state;  	struct extent_state *prealloc = NULL; @@ -968,6 +971,15 @@ again:  	}  	spin_lock(&tree->lock); +	if (cached_state && *cached_state) { +		state = *cached_state; +		if (state->start <= start && state->end > start && +		    state->tree) { +			node = &state->rb_node; +			goto hit_next; +		} +	} +  	/*  	 * this search will find all the extents that end after  	 * our range starts. 
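/*
 * Editor's sketch (a hypothetical caller, not taken from this hunk): the
 * new cached_state argument lets back-to-back conversions over adjacent
 * ranges skip the rbtree descent, e.g.
 *
 *	struct extent_state *cached_state = NULL;
 *
 *	convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
 *			   mark, &cached_state, GFP_NOFS);
 *	...
 *	free_extent_state(cached_state);
 */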
@@ -998,6 +1010,7 @@ hit_next:  	 */  	if (state->start == start && state->end <= end) {  		set_state_bits(tree, state, &bits); +		cache_state(state, cached_state);  		state = clear_state_bit(tree, state, &clear_bits, 0);  		if (last_end == (u64)-1)  			goto out; @@ -1038,6 +1051,7 @@ hit_next:  			goto out;  		if (state->end <= end) {  			set_state_bits(tree, state, &bits); +			cache_state(state, cached_state);  			state = clear_state_bit(tree, state, &clear_bits, 0);  			if (last_end == (u64)-1)  				goto out; @@ -1076,6 +1090,7 @@ hit_next:  				   &bits);  		if (err)  			extent_io_tree_panic(tree, err); +		cache_state(prealloc, cached_state);  		prealloc = NULL;  		start = this_end + 1;  		goto search_again; @@ -1098,6 +1113,7 @@ hit_next:  			extent_io_tree_panic(tree, err);  		set_state_bits(tree, prealloc, &bits); +		cache_state(prealloc, cached_state);  		clear_state_bit(tree, prealloc, &clear_bits, 0);  		prealloc = NULL;  		goto out; @@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,  			      NULL, cached_state, mask);  } +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, +		      struct extent_state **cached_state, gfp_t mask) +{ +	return set_extent_bit(tree, start, end, +			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, +			      NULL, cached_state, mask); +} +  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		       gfp_t mask)  { @@ -1294,18 +1318,42 @@ out:   * If nothing was found, 1 is returned. If found something, return 0.   */  int find_first_extent_bit(struct extent_io_tree *tree, u64 start, -			  u64 *start_ret, u64 *end_ret, int bits) +			  u64 *start_ret, u64 *end_ret, int bits, +			  struct extent_state **cached_state)  {  	struct extent_state *state; +	struct rb_node *n;  	int ret = 1;  	spin_lock(&tree->lock); +	if (cached_state && *cached_state) { +		state = *cached_state; +		if (state->end == start - 1 && state->tree) { +			n = rb_next(&state->rb_node); +			while (n) { +				state = rb_entry(n, struct extent_state, +						 rb_node); +				if (state->state & bits) +					goto got_it; +				n = rb_next(n); +			} +			free_extent_state(*cached_state); +			*cached_state = NULL; +			goto out; +		} +		free_extent_state(*cached_state); +		*cached_state = NULL; +	} +  	state = find_first_extent_bit_state(tree, start, bits); +got_it:  	if (state) { +		cache_state(state, cached_state);  		*start_ret = state->start;  		*end_ret = state->end;  		ret = 0;  	} +out:  	spin_unlock(&tree->lock);  	return ret;  } @@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,  		}  		read_unlock(&em_tree->lock); -		if (!em || IS_ERR(em)) { +		if (!em) {  			kfree(failrec);  			return -EIO;  		} @@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)  		struct extent_state *cached = NULL;  		struct extent_state *state; -		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " -			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, +		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " +			 "mirror=%ld\n", (u64)bio->bi_sector, err,  			 (long int)bio->bi_bdev);  		tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  					 end_bio_extent_readpage, mirror_num,  					 *bio_flags,  					 this_bio_flag); -			BUG_ON(ret == -ENOMEM); -			nr++; -			*bio_flags = this_bio_flag; +			if (!ret) { +				nr++; +				*bio_flags = this_bio_flag; +	
		}
 		}
 
-		if (ret)
+		if (ret) {
 			SetPageError(page);
+			unlock_extent(tree, cur, cur + iosize - 1);
+		}
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
@@ -3175,7 +3230,8 @@
 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
-					 0, 0, 0);
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 			SetPageError(p);
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		.tree = tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	int ret = 0;
 	int done = 0;
@@ -3254,19 +3311,34 @@ retry:
 				break;
 			}
+			spin_lock(&mapping->private_lock);
+			if (!PagePrivate(page)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
 			eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * Shouldn't happen and normally this would be a BUG_ON
+			 * but no sense in crashing the user's box for something
+			 * we can survive anyway.
+			 */
 			if (!eb) {
+				spin_unlock(&mapping->private_lock);
 				WARN_ON(1);
 				continue;
 			}
 
-			if (eb == prev_eb)
+			if (eb == prev_eb) {
+				spin_unlock(&mapping->private_lock);
 				continue;
+			}
 
-			if (!atomic_inc_not_zero(&eb->refs)) {
-				WARN_ON(1);
+			ret = atomic_inc_not_zero(&eb->refs);
+			spin_unlock(&mapping->private_lock);
+			if (!ret)
 				continue;
-			}
 
 			prev_eb = eb;
 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 		if (epd->sync_io)
 			rw = WRITE_SYNC;
 
-		ret = submit_one_bio(rw, epd->bio, 0, 0);
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
 		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.get_extent = get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
 	return ret;
 }
 
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
-{
-	return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return
((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - -		(start >> PAGE_CACHE_SHIFT); -} -  static void __free_extent_buffer(struct extent_buffer *eb)  {  #if LEAK_DEBUG @@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)  	return eb;  err: -	for (i--; i > 0; i--) +	for (i--; i >= 0; i--)  		__free_page(eb->pages[i]);  	__free_extent_buffer(eb);  	return NULL; @@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  	for (i = 0; i < num_pages; i++, index++) {  		p = find_or_create_page(mapping, index, GFP_NOFS); -		if (!p) { -			WARN_ON(1); +		if (!p)  			goto free_eb; -		}  		spin_lock(&mapping->private_lock);  		if (PagePrivate(p)) { @@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)  		/* Should be safe to release our pages at this point */  		btrfs_release_extent_buffer_page(eb, 0); -  		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);  		return 1;  	} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 25900af5b15..711d12b8002 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -27,6 +27,7 @@   * type for this bio   */  #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_TREE_LOG 2  #define EXTENT_BIO_FLAG_SHIFT 16  /* these are bit numbers for test/set bit */ @@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		       gfp_t mask);  int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, -		       int bits, int clear_bits, gfp_t mask); +		       int bits, int clear_bits, +		       struct extent_state **cached_state, gfp_t mask);  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,  			struct extent_state **cached_state, gfp_t mask); +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, +		      struct extent_state **cached_state, gfp_t mask);  int find_first_extent_bit(struct extent_io_tree *tree, u64 start, -			  u64 *start_ret, u64 *end_ret, int bits); +			  u64 *start_ret, u64 *end_ret, int bits, +			  struct extent_state **cached_state);  struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,  						 u64 start, int bits);  int extent_invalidatepage(struct extent_io_tree *tree, @@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);  int read_extent_buffer_pages(struct extent_io_tree *tree,  			     struct extent_buffer *eb, u64 start, int wait,  			     get_extent_t *get_extent, int mirror_num); -unsigned long num_extent_pages(u64 start, u64 len); -struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ +	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - +		(start >> PAGE_CACHE_SHIFT); +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, +					      unsigned long i) +{ +	return eb->pages[i]; +}  static inline void extent_buffer_get(struct extent_buffer *eb)  { diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7c97b330145..b8cbc8d5c7f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;  int __init extent_map_init(void)  { -	extent_map_cache = kmem_cache_create("extent_map", +	extent_map_cache = kmem_cache_create("btrfs_extent_map",  			sizeof(struct extent_map), 0,  			SLAB_RECLAIM_ACCOUNT | 
SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
 	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
 	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
 	return em;
 }
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
+		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
+			if (merge->generation > em->generation) {
+				em->mod_start = em->start;
+				em->mod_len = em->len;
+				em->generation = merge->generation;
+				list_move(&em->list, &tree->modified_extents);
+			}
+
+			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
+		if (merge->generation > em->generation) {
+			em->mod_len = em->len;
+			em->generation = merge->generation;
+			list_move(&em->list, &tree->modified_extents);
+		}
+		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
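+ *
+ * (Editor's sketch, not part of the original comment: a caller finishing
+ * an ordered extent would do roughly
+ *
+ *	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ *			   ordered->file_offset, ordered->len,
+ *			   trans->transid);
+ *
+ * so a later fsync finds the extent on the modified list with the right
+ * generation.)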
+ */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, +		       u64 gen)  {  	int ret = 0;  	struct extent_map *em; +	bool prealloc = false;  	write_lock(&tree->lock);  	em = lookup_extent_mapping(tree, start, len); @@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)  	if (!em)  		goto out; +	list_move(&em->list, &tree->modified_extents); +	em->generation = gen;  	clear_bit(EXTENT_FLAG_PINNED, &em->flags); +	em->mod_start = em->start; +	em->mod_len = em->len; + +	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { +		prealloc = true; +		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); +	}  	try_merge_map(tree, em); +	if (prealloc) { +		em->mod_start = em->start; +		em->mod_len = em->len; +	} +  	free_extent_map(em);  out:  	write_unlock(&tree->lock); @@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,  	}  	atomic_inc(&em->refs); +	em->mod_start = em->start; +	em->mod_len = em->len; +  	try_merge_map(tree, em);  out:  	return ret; @@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)  	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));  	rb_erase(&em->rb_node, &tree->map); +	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) +		list_del_init(&em->list);  	em->in_tree = 0;  	return ret;  } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 1195f09761f..679225555f7 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,6 +13,7 @@  #define EXTENT_FLAG_COMPRESSED 1  #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */  #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */  struct extent_map {  	struct rb_node rb_node; @@ -20,18 +21,23 @@ struct extent_map {  	/* all of these are in bytes */  	u64 start;  	u64 len; +	u64 mod_start; +	u64 mod_len;  	u64 orig_start;  	u64 block_start;  	u64 block_len; +	u64 generation;  	unsigned long flags;  	struct block_device *bdev;  	atomic_t refs;  	unsigned int in_tree;  	unsigned int compress_type; +	struct list_head list;  };  struct extent_map_tree {  	struct rb_root map; +	struct list_head modified_extents;  	rwlock_t lock;  }; @@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);  void free_extent_map(struct extent_map *em);  int __init extent_map_init(void);  void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);  struct extent_map *search_extent_mapping(struct extent_map_tree *tree,  					 u64 start, u64 len);  #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 857d93cd01d..1ad08e4e4a1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,11 +25,12 @@  #include "transaction.h"  #include "print-tree.h" -#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \  				   sizeof(struct btrfs_item) * 2) / \  				  size) - 1)) -#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ +				       PAGE_CACHE_SIZE))  #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \  				   sizeof(struct btrfs_ordered_sum)) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f6b40e86121..9ab1bed8811 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,6 +39,7 @@  #include "tree-log.h"  
#include "locking.h"  #include "compat.h" +#include "volumes.h"  /*   * when auto defrag is enabled we @@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,   * this drops all the extents in the cache that intersect the range   * [start, end].  Existing extents are split as required.   */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, -			    int skip_pinned) +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +			     int skip_pinned)  {  	struct extent_map *em;  	struct extent_map *split = NULL;  	struct extent_map *split2 = NULL;  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	u64 len = end - start + 1; +	u64 gen;  	int ret;  	int testend = 1;  	unsigned long flags; @@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  		testend = 0;  	}  	while (1) { +		int no_splits = 0; +  		if (!split)  			split = alloc_extent_map();  		if (!split2)  			split2 = alloc_extent_map(); -		BUG_ON(!split || !split2); /* -ENOMEM */ +		if (!split || !split2) +			no_splits = 1;  		write_lock(&em_tree->lock);  		em = lookup_extent_mapping(em_tree, start, len); @@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			break;  		}  		flags = em->flags; +		gen = em->generation;  		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {  			if (testend && em->start + em->len >= start + len) {  				free_extent_map(em); @@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		clear_bit(EXTENT_FLAG_PINNED, &em->flags);  		remove_extent_mapping(em_tree, em); +		if (no_splits) +			goto next;  		if (em->block_start < EXTENT_MAP_LAST_BYTE &&  		    em->start < start) { @@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  				split->block_len = em->block_len;  			else  				split->block_len = split->len; - +			split->generation = gen;  			split->bdev = em->bdev;  			split->flags = flags;  			split->compress_type = em->compress_type;  			ret = add_extent_mapping(em_tree, split);  			BUG_ON(ret); /* Logic error */ +			list_move(&split->list, &em_tree->modified_extents);  			free_extent_map(split);  			split = split2;  			split2 = NULL; @@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			split->bdev = em->bdev;  			split->flags = flags;  			split->compress_type = em->compress_type; +			split->generation = gen;  			if (compressed) {  				split->block_len = em->block_len; @@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			ret = add_extent_mapping(em_tree, split);  			BUG_ON(ret); /* Logic error */ +			list_move(&split->list, &em_tree->modified_extents);  			free_extent_map(split);  			split = NULL;  		} +next:  		write_unlock(&em_tree->lock);  		/* once for us */ @@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  		free_extent_map(split);  	if (split2)  		free_extent_map(split2); -	return 0;  }  /* @@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,   * it is either truncated or split.  Anything entirely inside the range   * is deleted from the tree.   
*/ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, -		       u64 start, u64 end, u64 *hint_byte, int drop_cache) +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, +			 struct btrfs_root *root, struct inode *inode, +			 struct btrfs_path *path, u64 start, u64 end, +			 u64 *drop_end, int drop_cache)  { -	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_buffer *leaf;  	struct btrfs_file_extent_item *fi; -	struct btrfs_path *path;  	struct btrfs_key key;  	struct btrfs_key new_key;  	u64 ino = btrfs_ino(inode); @@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,  	int recow;  	int ret;  	int modify_tree = -1; +	int update_refs = (root->ref_cows || root == root->fs_info->tree_root); +	int found = 0;  	if (drop_cache)  		btrfs_drop_extent_cache(inode, start, end - 1, 0); -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; -  	if (start >= BTRFS_I(inode)->disk_i_size)  		modify_tree = 0; @@ -666,6 +675,7 @@ next_slot:  			goto next_slot;  		} +		found = 1;  		search_start = max(key.offset, start);  		if (recow || !modify_tree) {  			modify_tree = -1; @@ -707,14 +717,13 @@ next_slot:  							extent_end - start);  			btrfs_mark_buffer_dirty(leaf); -			if (disk_bytenr > 0) { +			if (update_refs && disk_bytenr > 0) {  				ret = btrfs_inc_extent_ref(trans, root,  						disk_bytenr, num_bytes, 0,  						root->root_key.objectid,  						new_key.objectid,  						start - extent_offset, 0);  				BUG_ON(ret); /* -ENOMEM */ -				*hint_byte = disk_bytenr;  			}  			key.offset = start;  		} @@ -734,10 +743,8 @@ next_slot:  			btrfs_set_file_extent_num_bytes(leaf, fi,  							extent_end - end);  			btrfs_mark_buffer_dirty(leaf); -			if (disk_bytenr > 0) { +			if (update_refs && disk_bytenr > 0)  				inode_sub_bytes(inode, end - key.offset); -				*hint_byte = disk_bytenr; -			}  			break;  		} @@ -753,10 +760,8 @@ next_slot:  			btrfs_set_file_extent_num_bytes(leaf, fi,  							start - key.offset);  			btrfs_mark_buffer_dirty(leaf); -			if (disk_bytenr > 0) { +			if (update_refs && disk_bytenr > 0)  				inode_sub_bytes(inode, extent_end - start); -				*hint_byte = disk_bytenr; -			}  			if (end == extent_end)  				break; @@ -777,12 +782,13 @@ next_slot:  				del_nr++;  			} -			if (extent_type == BTRFS_FILE_EXTENT_INLINE) { +			if (update_refs && +			    extent_type == BTRFS_FILE_EXTENT_INLINE) {  				inode_sub_bytes(inode,  						extent_end - key.offset);  				extent_end = ALIGN(extent_end,  						   root->sectorsize); -			} else if (disk_bytenr > 0) { +			} else if (update_refs && disk_bytenr > 0) {  				ret = btrfs_free_extent(trans, root,  						disk_bytenr, num_bytes, 0,  						root->root_key.objectid, @@ -791,7 +797,6 @@ next_slot:  				BUG_ON(ret); /* -ENOMEM */  				inode_sub_bytes(inode,  						extent_end - key.offset); -				*hint_byte = disk_bytenr;  			}  			if (end == extent_end) @@ -806,7 +811,7 @@ next_slot:  					      del_nr);  			if (ret) {  				btrfs_abort_transaction(trans, root, ret); -				goto out; +				break;  			}  			del_nr = 0; @@ -825,7 +830,24 @@ next_slot:  			btrfs_abort_transaction(trans, root, ret);  	} -out: +	if (drop_end) +		*drop_end = found ? 
min(end, extent_end) : end; +	btrfs_release_path(path); +	return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, +		       struct btrfs_root *root, struct inode *inode, u64 start, +		       u64 end, int drop_cache) +{ +	struct btrfs_path *path; +	int ret; + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; +	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, +				   drop_cache);  	btrfs_free_path(path);  	return ret;  } @@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,  	int ret;  	u64 ino = btrfs_ino(inode); -	btrfs_drop_extent_cache(inode, start, end - 1, 0); -  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -935,12 +955,16 @@ again:  			btrfs_set_item_key_safe(trans, root, path, &new_key);  			fi = btrfs_item_ptr(leaf, path->slots[0],  					    struct btrfs_file_extent_item); +			btrfs_set_file_extent_generation(leaf, fi, +							 trans->transid);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							extent_end - end);  			btrfs_set_file_extent_offset(leaf, fi,  						     end - orig_offset);  			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,  					    struct btrfs_file_extent_item); +			btrfs_set_file_extent_generation(leaf, fi, +							 trans->transid);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							end - other_start);  			btrfs_mark_buffer_dirty(leaf); @@ -958,12 +982,16 @@ again:  					    struct btrfs_file_extent_item);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							start - key.offset); +			btrfs_set_file_extent_generation(leaf, fi, +							 trans->transid);  			path->slots[0]++;  			new_key.offset = start;  			btrfs_set_item_key_safe(trans, root, path, &new_key);  			fi = btrfs_item_ptr(leaf, path->slots[0],  					    struct btrfs_file_extent_item); +			btrfs_set_file_extent_generation(leaf, fi, +							 trans->transid);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							other_end - start);  			btrfs_set_file_extent_offset(leaf, fi, @@ -991,12 +1019,14 @@ again:  		leaf = path->nodes[0];  		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,  				    struct btrfs_file_extent_item); +		btrfs_set_file_extent_generation(leaf, fi, trans->transid);  		btrfs_set_file_extent_num_bytes(leaf, fi,  						split - key.offset);  		fi = btrfs_item_ptr(leaf, path->slots[0],  				    struct btrfs_file_extent_item); +		btrfs_set_file_extent_generation(leaf, fi, trans->transid);  		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);  		btrfs_set_file_extent_num_bytes(leaf, fi,  						extent_end - split); @@ -1056,12 +1086,14 @@ again:  			   struct btrfs_file_extent_item);  		btrfs_set_file_extent_type(leaf, fi,  					   BTRFS_FILE_EXTENT_REG); +		btrfs_set_file_extent_generation(leaf, fi, trans->transid);  		btrfs_mark_buffer_dirty(leaf);  	} else {  		fi = btrfs_item_ptr(leaf, del_slot - 1,  			   struct btrfs_file_extent_item);  		btrfs_set_file_extent_type(leaf, fi,  					   BTRFS_FILE_EXTENT_REG); +		btrfs_set_file_extent_generation(leaf, fi, trans->transid);  		btrfs_set_file_extent_num_bytes(leaf, fi,  						extent_end - key.offset);  		btrfs_mark_buffer_dirty(leaf); @@ -1173,8 +1205,8 @@ again:  		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,  				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | -				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, -				  GFP_NOFS); +				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, +				  0, 0, &cached_state, GFP_NOFS);  		unlock_extent_cached(&BTRFS_I(inode)->io_tree,  				     start_pos, last_pos - 1, &cached_state,  				     GFP_NOFS); 
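/*
 * Editor's sketch (a hypothetical caller, not from this hunk): the new
 * EXTENT_DEFRAG bit is set alongside delalloc when a range is queued for
 * defrag, e.g.
 *
 *	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end,
 *			  &cached_state, GFP_NOFS);
 *
 * which is why paths that clear EXTENT_DELALLOC, like the one above, must
 * clear EXTENT_DEFRAG as well.
 */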
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_btrfs_sync_file(file, datasync);
 
+	/*
+	 * We write the dirty pages in the range and wait until they complete
+	 * outside of the ->i_mutex.  That way the dirty pages can be flushed
+	 * by multiple tasks and the performance improves.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
 	mutex_lock(&inode->i_mutex);
 
 	/*
-	 * we wait first, since the writeback may change the inode, also wait
-	 * ordered range does a filemape_write_and_wait_range which is why we
-	 * don't do it above like other file systems.
+	 * We flush the dirty pages again so that no dirty pages in the
+	 * range are left behind.
 	 */
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 
 	/*
 	 * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@
 	    BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
+
+		/*
+		 * We've had everything committed since the last time we were
+		 * modified so clear this flag in case it was set for whatever
+		 * reason; it's no longer relevant.
+		 */
+		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			  &BTRFS_I(inode)->runtime_flags);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+			  int slot, u64 start, u64 end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi))
+		return 0;
+
+	if (key.offset == end)
+		return 1;
+	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+		return 1;
+	return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+		      struct btrfs_path *path, u64 offset, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct extent_map *hole_em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(!ret);
+
+	leaf = path->nodes[0];
+	if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]--;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+			end - offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+
+	if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+		u64 num_bytes;
+
path->slots[0]++; +		key.offset = offset; +		btrfs_set_item_key_safe(trans, root, path, &key); +		fi = btrfs_item_ptr(leaf, path->slots[0], +				    struct btrfs_file_extent_item); +		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - +			offset; +		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); +		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); +		btrfs_set_file_extent_offset(leaf, fi, 0); +		btrfs_mark_buffer_dirty(leaf); +		goto out; +	} +	btrfs_release_path(path); + +	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, +				       0, 0, end - offset, 0, end - offset, +				       0, 0, 0); +	if (ret) +		return ret; + +out: +	btrfs_release_path(path); + +	hole_em = alloc_extent_map(); +	if (!hole_em) { +		btrfs_drop_extent_cache(inode, offset, end - 1, 0); +		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			&BTRFS_I(inode)->runtime_flags); +	} else { +		hole_em->start = offset; +		hole_em->len = end - offset; +		hole_em->orig_start = offset; + +		hole_em->block_start = EXTENT_MAP_HOLE; +		hole_em->block_len = 0; +		hole_em->bdev = root->fs_info->fs_devices->latest_bdev; +		hole_em->compress_type = BTRFS_COMPRESS_NONE; +		hole_em->generation = trans->transid; + +		do { +			btrfs_drop_extent_cache(inode, offset, end - 1, 0); +			write_lock(&em_tree->lock); +			ret = add_extent_mapping(em_tree, hole_em); +			if (!ret) +				list_move(&hole_em->list, +					  &em_tree->modified_extents); +			write_unlock(&em_tree->lock); +		} while (ret == -EEXIST); +		free_extent_map(hole_em); +		if (ret) +			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +				&BTRFS_I(inode)->runtime_flags); +	} + +	return 0; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct extent_state *cached_state = NULL; +	struct btrfs_path *path; +	struct btrfs_block_rsv *rsv; +	struct btrfs_trans_handle *trans; +	u64 mask = BTRFS_I(inode)->root->sectorsize - 1; +	u64 lockstart = (offset + mask) & ~mask; +	u64 lockend = ((offset + len) & ~mask) - 1; +	u64 cur_offset = lockstart; +	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); +	u64 drop_end; +	unsigned long nr; +	int ret = 0; +	int err = 0; +	bool same_page = (offset >> PAGE_CACHE_SHIFT) == +		((offset + len) >> PAGE_CACHE_SHIFT); + +	btrfs_wait_ordered_range(inode, offset, len); + +	mutex_lock(&inode->i_mutex); +	if (offset >= inode->i_size) { +		mutex_unlock(&inode->i_mutex); +		return 0; +	} + +	/* +	 * Only do this if we are in the same page and we aren't doing the +	 * entire page. 
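+	 *
+	 * (Editor's example: with 4K pages, punching offset 100, len 200
+	 * stays inside one page, so btrfs_truncate_page() zeroes the range
+	 * and we return without touching the extent tree.)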
+	 */ +	if (same_page && len < PAGE_CACHE_SIZE) { +		ret = btrfs_truncate_page(inode, offset, len, 0); +		mutex_unlock(&inode->i_mutex); +		return ret; +	} + +	/* zero back part of the first page */ +	ret = btrfs_truncate_page(inode, offset, 0, 0); +	if (ret) { +		mutex_unlock(&inode->i_mutex); +		return ret; +	} + +	/* zero the front end of the last page */ +	ret = btrfs_truncate_page(inode, offset + len, 0, 1); +	if (ret) { +		mutex_unlock(&inode->i_mutex); +		return ret; +	} + +	if (lockend < lockstart) { +		mutex_unlock(&inode->i_mutex); +		return 0; +	} + +	while (1) { +		struct btrfs_ordered_extent *ordered; + +		truncate_pagecache_range(inode, lockstart, lockend); + +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, +				 0, &cached_state); +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + +		/* +		 * We need to make sure we have no ordered extents in this range +		 * and nobody raced in and read a page in this range, if we did +		 * we need to try again. +		 */ +		if ((!ordered || +		    (ordered->file_offset + ordered->len < lockstart || +		     ordered->file_offset > lockend)) && +		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, +				     lockend, EXTENT_UPTODATE, 0, +				     cached_state)) { +			if (ordered) +				btrfs_put_ordered_extent(ordered); +			break; +		} +		if (ordered) +			btrfs_put_ordered_extent(ordered); +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, +				     lockend, &cached_state, GFP_NOFS); +		btrfs_wait_ordered_range(inode, lockstart, +					 lockend - lockstart + 1); +	} + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); +	if (!rsv) { +		ret = -ENOMEM; +		goto out_free; +	} +	rsv->size = btrfs_calc_trunc_metadata_size(root, 1); +	rsv->failfast = 1; + +	/* +	 * 1 - update the inode +	 * 1 - removing the extents in the range +	 * 1 - adding the hole extent +	 */ +	trans = btrfs_start_transaction(root, 3); +	if (IS_ERR(trans)) { +		err = PTR_ERR(trans); +		goto out_free; +	} + +	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, +				      min_size); +	BUG_ON(ret); +	trans->block_rsv = rsv; + +	while (cur_offset < lockend) { +		ret = __btrfs_drop_extents(trans, root, inode, path, +					   cur_offset, lockend + 1, +					   &drop_end, 1); +		if (ret != -ENOSPC) +			break; + +		trans->block_rsv = &root->fs_info->trans_block_rsv; + +		ret = fill_holes(trans, inode, path, cur_offset, drop_end); +		if (ret) { +			err = ret; +			break; +		} + +		cur_offset = drop_end; + +		ret = btrfs_update_inode(trans, root, inode); +		if (ret) { +			err = ret; +			break; +		} + +		nr = trans->blocks_used; +		btrfs_end_transaction(trans, root); +		btrfs_btree_balance_dirty(root, nr); + +		trans = btrfs_start_transaction(root, 3); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			trans = NULL; +			break; +		} + +		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, +					      rsv, min_size); +		BUG_ON(ret);	/* shouldn't happen */ +		trans->block_rsv = rsv; +	} + +	if (ret) { +		err = ret; +		goto out_trans; +	} + +	trans->block_rsv = &root->fs_info->trans_block_rsv; +	ret = fill_holes(trans, inode, path, cur_offset, drop_end); +	if (ret) { +		err = ret; +		goto out_trans; +	} + +out_trans: +	if (!trans) +		goto out_free; + +	trans->block_rsv = &root->fs_info->trans_block_rsv; +	ret = btrfs_update_inode(trans, root, inode); +	nr = trans->blocks_used; +	btrfs_end_transaction(trans, root); +	
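+	/*
+	 * Editor's note: each pass of the loop above ends its transaction
+	 * and rebalances dirty metadata before starting the next one with a
+	 * fresh 3-item reservation migrated from trans_block_rsv, so a large
+	 * punch cannot pin an unbounded amount of dirty tree blocks.
+	 */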
btrfs_btree_balance_dirty(root, nr);
+out_free:
+	btrfs_free_path(path);
+	btrfs_free_block_rsv(root, rsv);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+	mutex_unlock(&inode->i_mutex);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
 	alloc_start = offset & ~mask;
 	alloc_end =  (offset + len + mask) & ~mask;
 
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	/* Make sure we aren't being given some bogus mode */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return btrfs_punch_hole(inode, offset, len);
+
 	/*
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, len);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
 	if (ret)
 		return ret;
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, len);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
 	return ret;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5..1027b854b90 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			       block_group->key.offset)) {
 		ret = find_first_extent_bit(unpin, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret) {
 			ret = 0;
 			break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			  max_t(u64, *offset, bitmap_info->offset));
 	bits = bytes_to_bits(*bytes, ctl->unit);
-	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
 		if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 again:
 	found_bits = 0;
-	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(entry->bitmap,
 					       BITS_PER_BITMAP, i);
 		if (next_zero - i >= min_bits) {
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b9..1d982812ab6 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 {
 	return crc32c((u32)~1, name, len);
 }
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+				    int len)
+{
+	return (u64) crc32c(parent_objectid, name, len);
+}
+
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c7..48b8fda9313 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
 #include "ctree.h"
 #include "disk-io.h"
+#include "hash.h"  #include "transaction.h"  #include "print-tree.h" @@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,  	return 0;  } -struct btrfs_inode_ref * +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, +				   const char *name, int name_len, +				   struct btrfs_inode_extref **extref_ret) +{ +	struct extent_buffer *leaf; +	struct btrfs_inode_extref *extref; +	unsigned long ptr; +	unsigned long name_ptr; +	u32 item_size; +	u32 cur_offset = 0; +	int ref_name_len; + +	leaf = path->nodes[0]; +	item_size = btrfs_item_size_nr(leaf, path->slots[0]); +	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + +	/* +	 * Search all extended backrefs in this item. We're only +	 * looking through any collisions so most of the time this is +	 * just going to compare against one buffer. If all is well, +	 * we'll return success and the inode ref object. +	 */ +	while (cur_offset < item_size) { +		extref = (struct btrfs_inode_extref *) (ptr + cur_offset); +		name_ptr = (unsigned long)(&extref->name); +		ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + +		if (ref_name_len == name_len && +		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid && +		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { +			if (extref_ret) +				*extref_ret = extref; +			return 1; +		} + +		cur_offset += ref_name_len + sizeof(*extref); +	} +	return 0; +} + +static struct btrfs_inode_ref *  btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, -			struct btrfs_path *path, -			const char *name, int name_len, -			u64 inode_objectid, u64 ref_objectid, int mod) +		       struct btrfs_root *root, +		       struct btrfs_path *path, +		       const char *name, int name_len, +		       u64 inode_objectid, u64 ref_objectid, int ins_len, +		       int cow)  { +	int ret;  	struct btrfs_key key;  	struct btrfs_inode_ref *ref; -	int ins_len = mod < 0 ? -1 : 0; -	int cow = mod != 0; -	int ret;  	key.objectid = inode_objectid;  	key.type = BTRFS_INODE_REF_KEY; @@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,  	return ref;  } -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, +			  struct btrfs_path *path, +			  const char *name, int name_len, +			  u64 inode_objectid, u64 ref_objectid, int ins_len, +			  int cow) +{ +	int ret; +	struct btrfs_key key; +	struct btrfs_inode_extref *extref; + +	key.objectid = inode_objectid; +	key.type = BTRFS_INODE_EXTREF_KEY; +	key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + +	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); +	if (ret < 0) +		return ERR_PTR(ret); +	if (ret > 0) +		return NULL; +	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) +		return NULL; +	return extref; +} + +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, +			      struct btrfs_root *root, +			      struct btrfs_path *path, +			      const char *name, int name_len, +			      u64 inode_objectid, u64 ref_objectid, int mod, +			      u64 *ret_index) +{ +	struct btrfs_inode_ref *ref; +	struct btrfs_inode_extref *extref; +	int ins_len = mod < 0 ? 
-1 : 0; +	int cow = mod != 0; + +	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, +				     inode_objectid, ref_objectid, ins_len, +				     cow); +	if (IS_ERR(ref)) +		return PTR_ERR(ref); + +	if (ref != NULL) { +		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref); +		return 0; +	} + +	btrfs_release_path(path); + +	extref = btrfs_lookup_inode_extref(trans, root, path, name, +					   name_len, inode_objectid, +					   ref_objectid, ins_len, cow); +	if (IS_ERR(extref)) +		return PTR_ERR(extref); + +	if (extref) { +		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref); +		return 0; +	} + +	return -ENOENT; +} + +int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   const char *name, int name_len,  			   u64 inode_objectid, u64 ref_objectid, u64 *index)  {  	struct btrfs_path *path;  	struct btrfs_key key; +	struct btrfs_inode_extref *extref; +	struct extent_buffer *leaf; +	int ret; +	int del_len = name_len + sizeof(*extref); +	unsigned long ptr; +	unsigned long item_start; +	u32 item_size; + +	key.objectid = inode_objectid; +	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); +	key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; + +	path->leave_spinning = 1; + +	ret = btrfs_search_slot(trans, root, &key, path, -1, 1); +	if (ret > 0) +		ret = -ENOENT; +	if (ret < 0) +		goto out; + +	/* +	 * Sanity check - did we find the right item for this name? +	 * This should always succeed so error here will make the FS +	 * readonly. +	 */ +	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, +					    name, name_len, &extref)) { +		btrfs_std_error(root->fs_info, -ENOENT); +		ret = -EROFS; +		goto out; +	} + +	leaf = path->nodes[0]; +	item_size = btrfs_item_size_nr(leaf, path->slots[0]); +	if (index) +		*index = btrfs_inode_extref_index(leaf, extref); + +	if (del_len == item_size) { +		/* +		 * Common case only one ref in the item, remove the +		 * whole item. 
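+		 * When several names hash to the same key, the item
+		 * instead packs the refs back to back:
+		 *
+		 *   [extref 0][name 0][extref 1][name 1]...
+		 *
+		 * with each entry taking sizeof(*extref) + name_len
+		 * bytes; the memmove/truncate path below handles that
+		 * packed case.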
+		 */ +		ret = btrfs_del_item(trans, root, path); +		goto out; +	} + +	ptr = (unsigned long)extref; +	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + +	memmove_extent_buffer(leaf, ptr, ptr + del_len, +			      item_size - (ptr + del_len - item_start)); + +	btrfs_truncate_item(trans, root, path, item_size - del_len, 1); + +out: +	btrfs_free_path(path); + +	return ret; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, +			struct btrfs_root *root, +			const char *name, int name_len, +			u64 inode_objectid, u64 ref_objectid, u64 *index) +{ +	struct btrfs_path *path; +	struct btrfs_key key;  	struct btrfs_inode_ref *ref;  	struct extent_buffer *leaf;  	unsigned long ptr; @@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,  	u32 item_size;  	u32 sub_item_len;  	int ret; +	int search_ext_refs = 0;  	int del_len = name_len + sizeof(*ref);  	key.objectid = inode_objectid; @@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);  	if (ret > 0) {  		ret = -ENOENT; +		search_ext_refs = 1;  		goto out;  	} else if (ret < 0) {  		goto out;  	}  	if (!find_name_in_backref(path, name, name_len, &ref)) {  		ret = -ENOENT; +		search_ext_refs = 1;  		goto out;  	}  	leaf = path->nodes[0]; @@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,  	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);  	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,  			      item_size - (ptr + sub_item_len - item_start)); -	btrfs_truncate_item(trans, root, path, -				  item_size - sub_item_len, 1); +	btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); +out: +	btrfs_free_path(path); + +	if (search_ext_refs) { +		/* +		 * No refs were found, or we could not find the +		 * name in our ref array. Find and remove the extended +		 * inode ref then. +		 */ +		return btrfs_del_inode_extref(trans, root, name, name_len, +					      inode_objectid, ref_objectid, index); +	} + +	return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. 
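+ *
+ * For reference, a sketch of the on-disk entry this function appends
+ * (assumed to match the definition in ctree.h):
+ *
+ *	struct btrfs_inode_extref {
+ *		__le64 parent_objectid;
+ *		__le64 index;
+ *		__le16 name_len;
+ *		__u8   name[0];		(name_len bytes follow inline)
+ *	} __attribute__ ((__packed__));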
+ */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, +				     struct btrfs_root *root, +				     const char *name, int name_len, +				     u64 inode_objectid, u64 ref_objectid, u64 index) +{ +	struct btrfs_inode_extref *extref; +	int ret; +	int ins_len = name_len + sizeof(*extref); +	unsigned long ptr; +	struct btrfs_path *path; +	struct btrfs_key key; +	struct extent_buffer *leaf; +	struct btrfs_item *item; + +	key.objectid = inode_objectid; +	key.type = BTRFS_INODE_EXTREF_KEY; +	key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; + +	path->leave_spinning = 1; +	ret = btrfs_insert_empty_item(trans, root, path, &key, +				      ins_len); +	if (ret == -EEXIST) { +		if (btrfs_find_name_in_ext_backref(path, ref_objectid, +						   name, name_len, NULL)) +			goto out; + +		btrfs_extend_item(trans, root, path, ins_len); +		ret = 0; +	} +	if (ret < 0) +		goto out; + +	leaf = path->nodes[0]; +	item = btrfs_item_nr(leaf, path->slots[0]); +	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); +	ptr += btrfs_item_size(leaf, item) - ins_len; +	extref = (struct btrfs_inode_extref *)ptr; + +	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); +	btrfs_set_inode_extref_index(path->nodes[0], extref, index); +	btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + +	ptr = (unsigned long)&extref->name; +	write_extent_buffer(path->nodes[0], name, ptr, name_len); +	btrfs_mark_buffer_dirty(path->nodes[0]); +  out:  	btrfs_free_path(path);  	return ret; @@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,  out:  	btrfs_free_path(path); + +	if (ret == -EMLINK) { +		struct btrfs_super_block *disk_super = root->fs_info->super_copy; +		/* We ran out of space in the ref array. Need to +		 * add an extended ref. 
*/ +		if (btrfs_super_incompat_flags(disk_super) +		    & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +			ret = btrfs_insert_inode_extref(trans, root, name, +							name_len, +							inode_objectid, +							ref_objectid, index); +	} +  	return ret;  } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a6ed6944e50..85a1e5053fe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  	u64 inline_len = actual_end - start;  	u64 aligned_end = (end + root->sectorsize - 1) &  			~((u64)root->sectorsize - 1); -	u64 hint_byte;  	u64 data_len = inline_len;  	int ret; @@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  		return 1;  	} -	ret = btrfs_drop_extents(trans, inode, start, aligned_end, -				 &hint_byte, 1); +	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);  	if (ret)  		return ret; @@ -664,7 +662,7 @@ retry:  					   async_extent->compressed_size,  					   async_extent->compressed_size,  					   0, alloc_hint, &ins, 1); -			if (ret) +			if (ret && ret != -ENOSPC)  				btrfs_abort_transaction(trans, root, ret);  			btrfs_end_transaction(trans, root);  		} @@ -1308,6 +1306,7 @@ out_check:  			em->block_start = disk_bytenr;  			em->bdev = root->fs_info->fs_devices->latest_bdev;  			set_bit(EXTENT_FLAG_PINNED, &em->flags); +			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);  			while (1) {  				write_lock(&em_tree->lock);  				ret = add_extent_mapping(em_tree, em); @@ -1364,11 +1363,7 @@ out_check:  	}  error: -	if (nolock) { -		err = btrfs_end_transaction_nolock(trans, root); -	} else { -		err = btrfs_end_transaction(trans, root); -	} +	err = btrfs_end_transaction(trans, root);  	if (!ret)  		ret = err; @@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_key ins; -	u64 hint;  	int ret;  	path = btrfs_alloc_path(); @@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	 * the caller is expected to unpin it and allow it to be merged  	 * with the others.  	 
*/ -	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, -				 &hint, 0); +	ret = btrfs_drop_extents(trans, root, inode, file_pos, +				 file_pos + num_bytes, 0);  	if (ret)  		goto out; @@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_set_file_extent_encryption(leaf, fi, encryption);  	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); -	btrfs_unlock_up_safe(path, 1); -	btrfs_set_lock_blocking(leaf); -  	btrfs_mark_buffer_dirty(leaf); +	btrfs_release_path(path);  	inode_add_bytes(inode, num_bytes); @@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  						ordered_extent->len,  						compress_type, 0, 0,  						BTRFS_FILE_EXTENT_REG); -		unpin_extent_cache(&BTRFS_I(inode)->extent_tree, -				   ordered_extent->file_offset, -				   ordered_extent->len);  	} - +	unpin_extent_cache(&BTRFS_I(inode)->extent_tree, +			   ordered_extent->file_offset, ordered_extent->len, +			   trans->transid);  	if (ret < 0) {  		btrfs_abort_transaction(trans, root, ret);  		goto out_unlock; @@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  			btrfs_abort_transaction(trans, root, ret);  			goto out_unlock;  		} +	} else { +		btrfs_set_inode_last_trans(trans, inode);  	}  	ret = 0;  out_unlock: @@ -1958,12 +1951,8 @@ out_unlock:  out:  	if (root != root->fs_info->tree_root)  		btrfs_delalloc_release_metadata(inode, ordered_extent->len); -	if (trans) { -		if (nolock) -			btrfs_end_transaction_nolock(trans, root); -		else -			btrfs_end_transaction(trans, root); -	} +	if (trans) +		btrfs_end_transaction(trans, root);  	if (ret)  		clear_extent_uptodate(io_tree, ordered_extent->file_offset, @@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  	if (empty)  		return; -	down_read(&root->fs_info->cleanup_work_sem);  	spin_lock(&fs_info->delayed_iput_lock);  	list_splice_init(&fs_info->delayed_iputs, &list);  	spin_unlock(&fs_info->delayed_iput_lock); @@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  		iput(delayed->inode);  		kfree(delayed);  	} -	up_read(&root->fs_info->cleanup_work_sem);  }  enum btrfs_orphan_cleanup_state { @@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  	int ret;  	if (!root->orphan_block_rsv) { -		block_rsv = btrfs_alloc_block_rsv(root); +		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);  		if (!block_rsv)  			return -ENOMEM;  	} @@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  			insert = 1;  #endif  		insert = 1; -		atomic_dec(&root->orphan_inodes); +		atomic_inc(&root->orphan_inodes);  	}  	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, @@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); +	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + +	/* +	 * If we were modified in the current generation and evicted from memory +	 * and then re-read we need to do a full sync since we don't have any +	 * idea about which extents were modified before we were evicted from +	 * cache. 
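+	 *
+	 * Note: last_trans was just read back from the inode item's
+	 * transid field above, so this comparison works even for a
+	 * freshly loaded inode.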
+	 */ +	if (BTRFS_I(inode)->last_trans == root->fs_info->generation) +		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			&BTRFS_I(inode)->runtime_flags); +  	inode->i_version = btrfs_inode_sequence(leaf, inode_item);  	inode->i_generation = BTRFS_I(inode)->generation;  	inode->i_rdev = 0; @@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_path *path; -	struct btrfs_inode_ref *ref;  	struct btrfs_dir_item *di;  	struct inode *inode = dentry->d_inode;  	u64 index; @@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,  	}  	btrfs_release_path(path); -	ref = btrfs_lookup_inode_ref(trans, root, path, -				dentry->d_name.name, dentry->d_name.len, -				ino, dir_ino, 0); -	if (IS_ERR(ref)) { -		err = PTR_ERR(ref); +	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, +					dentry->d_name.len, ino, dir_ino, 0, +					&index); +	if (ret) { +		err = ret;  		goto out;  	} -	BUG_ON(!ref); /* Logic error */ +  	if (check_path_shared(root, path))  		goto out; -	index = btrfs_inode_ref_index(path->nodes[0], ref); +  	btrfs_release_path(path);  	/* @@ -3061,7 +3059,7 @@ out:  static void __unlink_end_trans(struct btrfs_trans_handle *trans,  			       struct btrfs_root *root)  { -	if (trans->block_rsv == &root->fs_info->global_block_rsv) { +	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {  		btrfs_block_rsv_release(root, trans->block_rsv,  					trans->bytes_reserved);  		trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  	struct btrfs_trans_handle *trans;  	unsigned long nr = 0; -	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || -	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) +	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)  		return -ENOTEMPTY; +	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) +		return -EPERM;  	trans = __unlink_start_trans(dir, dentry);  	if (IS_ERR(trans)) @@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	path->reada = -1; +	/* +	 * We want to drop from the next block forward in case this new size is +	 * not block aligned since we will be keeping the last block of the +	 * extent just the way it is. +	 */  	if (root->ref_cows || root == root->fs_info->tree_root) -		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); +		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);  	/*  	 * This function is also used to drop the items in the log tree before @@ -3429,12 +3433,6 @@ delete:  		if (path->slots[0] == 0 ||  		    path->slots[0] != pending_del_slot) { -			if (root->ref_cows && -			    BTRFS_I(inode)->location.objectid != -						BTRFS_FREE_INO_OBJECTID) { -				err = -EAGAIN; -				goto out; -			}  			if (pending_del_nr) {  				ret = btrfs_del_items(trans, root, path,  						pending_del_slot, @@ -3465,12 +3463,20 @@ error:  }  /* - * taken from block_truncate_page, but does cow as it zeros out - * any bytes left in the last page in the file. 
+ * btrfs_truncate_page - read, zero a chunk and write a page + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range relative to the + *	offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the page for the "from" offset, cow the page and zero the + * part we want to zero.  This is used with truncate and hole punching.   */ -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, +			int front)  { -	struct inode *inode = mapping->host; +	struct address_space *mapping = inode->i_mapping;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_ordered_extent *ordered; @@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)  	u64 page_start;  	u64 page_end; -	if ((offset & (blocksize - 1)) == 0) +	if ((offset & (blocksize - 1)) == 0 && +	    (!len || ((len & (blocksize - 1)) == 0)))  		goto out;  	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);  	if (ret) @@ -3532,7 +3539,8 @@ again:  	}  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, -			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, +			  EXTENT_DIRTY | EXTENT_DELALLOC | +			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,  			  0, 0, &cached_state, GFP_NOFS);  	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -3545,8 +3553,13 @@ again:  	ret = 0;  	if (offset != PAGE_CACHE_SIZE) { +		if (!len) +			len = PAGE_CACHE_SIZE - offset;  		kaddr = kmap(page); -		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); +		if (front) +			memset(kaddr, 0, offset); +		else +			memset(kaddr + offset, 0, len);  		flush_dcache_page(page);  		kunmap(page);  	} @@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct extent_map *em = NULL;  	struct extent_state *cached_state = NULL; +	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	u64 mask = root->sectorsize - 1;  	u64 hole_start = (oldsize + mask) & ~mask;  	u64 block_end = (size + mask) & ~mask; @@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  		last_byte = min(extent_map_end(em), block_end);  		last_byte = (last_byte + mask) & ~mask;  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { -			u64 hint_byte = 0; +			struct extent_map *hole_em;  			hole_size = last_byte - cur_offset;  			trans = btrfs_start_transaction(root, 3);  			if (IS_ERR(trans)) {  				err = PTR_ERR(trans);  				break;  			} -			err = btrfs_drop_extents(trans, inode, cur_offset, -						 cur_offset + hole_size, -						 &hint_byte, 1); +			err = btrfs_drop_extents(trans, root, inode, +						 cur_offset, +						 cur_offset + hole_size, 1);  			if (err) {  				btrfs_abort_transaction(trans, root, err);  				btrfs_end_transaction(trans, root);  				break;  			} -			btrfs_drop_extent_cache(inode, hole_start, -					last_byte - 1, 0); +			btrfs_drop_extent_cache(inode, cur_offset, +						cur_offset + hole_size - 1, 0); +			hole_em = alloc_extent_map(); +			if (!hole_em) { +				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +					&BTRFS_I(inode)->runtime_flags); +				goto next; +			} +	
		hole_em->start = cur_offset; +			hole_em->len = hole_size; +			hole_em->orig_start = cur_offset; +			hole_em->block_start = EXTENT_MAP_HOLE; +			hole_em->block_len = 0; +			hole_em->bdev = root->fs_info->fs_devices->latest_bdev; +			hole_em->compress_type = BTRFS_COMPRESS_NONE; +			hole_em->generation = trans->transid; + +			while (1) { +				write_lock(&em_tree->lock); +				err = add_extent_mapping(em_tree, hole_em); +				if (!err) +					list_move(&hole_em->list, +						  &em_tree->modified_extents); +				write_unlock(&em_tree->lock); +				if (err != -EEXIST) +					break; +				btrfs_drop_extent_cache(inode, cur_offset, +							cur_offset + +							hole_size - 1, 0); +			} +			free_extent_map(hole_em); +next:  			btrfs_update_inode(trans, root, inode);  			btrfs_end_transaction(trans, root);  		} @@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)  		goto no_delete;  	} -	rsv = btrfs_alloc_block_rsv(root); +	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);  	if (!rsv) {  		btrfs_orphan_del(NULL, inode);  		goto no_delete;  	}  	rsv->size = min_size; +	rsv->failfast = 1;  	global_rsv = &root->fs_info->global_block_rsv;  	btrfs_i_size_write(inode, 0);  	/* -	 * This is a bit simpler than btrfs_truncate since -	 * -	 * 1) We've already reserved our space for our orphan item in the -	 *    unlink. -	 * 2) We're going to delete the inode item, so we don't need to update -	 *    it at all. -	 * -	 * So we just need to reserve some slack space in case we add bytes when -	 * doing the truncate. +	 * This is a bit simpler than btrfs_truncate since we've already +	 * reserved our space for our orphan item in the unlink, so we just +	 * need to reserve some slack space in case we add bytes and update +	 * inode item when doing the truncate.  	 */  	while (1) {  		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); @@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)  			goto no_delete;  		} -		trans = btrfs_start_transaction(root, 0); +		trans = btrfs_start_transaction_noflush(root, 1);  		if (IS_ERR(trans)) {  			btrfs_orphan_del(NULL, inode);  			btrfs_free_block_rsv(root, rsv); @@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)  		trans->block_rsv = rsv;  		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); -		if (ret != -EAGAIN) +		if (ret != -ENOSPC)  			break; +		trans->block_rsv = &root->fs_info->trans_block_rsv; +		ret = btrfs_update_inode(trans, root, inode); +		BUG_ON(ret); +  		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root);  		trans = NULL; @@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)  			trans = btrfs_join_transaction(root);  		if (IS_ERR(trans))  			return PTR_ERR(trans); -		if (nolock) -			ret = btrfs_end_transaction_nolock(trans, root); -		else -			ret = btrfs_commit_transaction(trans, root); +		ret = btrfs_commit_transaction(trans, root);  	}  	return ret;  } @@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	BTRFS_I(inode)->generation = trans->transid;  	inode->i_generation = BTRFS_I(inode)->generation; +	/* +	 * We could have gotten an inode number from somebody who was fsynced +	 * and then removed in this same transaction, so let's just set full +	 * sync since it will be a full sync anyway and this will blow away the +	 * old info in the log. 
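+	 *
+	 * (Assumed behaviour, for illustration: the flag is consumed at
+	 * fsync time and simply makes the first fsync log the whole
+	 * inode instead of an incremental set of modified extents.)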
+	 */ +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); +  	if (S_ISDIR(mode))  		owner = 0;  	else @@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);  	key[0].offset = 0; +	/* +	 * Start new inodes with an inode_ref. This is slightly more +	 * efficient for small numbers of hard links since they will +	 * be packed into one item. Extended refs will kick in if we +	 * add more hard links than can fit in the ref item. +	 */  	key[1].objectid = objectid;  	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);  	key[1].offset = ref_objectid; @@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  	if (root->objectid != BTRFS_I(inode)->root->objectid)  		return -EXDEV; -	if (inode->i_nlink == ~0U) +	if (inode->i_nlink >= BTRFS_LINK_MAX)  		return -EMLINK;  	err = btrfs_set_inode_index(dir, &index); @@ -5450,7 +5505,8 @@ insert:  	write_unlock(&em_tree->lock);  out: -	trace_btrfs_get_extent(root, em); +	if (em) +		trace_btrfs_get_extent(root, em);  	if (path)  		btrfs_free_path(path); @@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  	return ret;  } +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, +					   u64 len, u64 orig_start, +					   u64 block_start, u64 block_len, +					   int type) +{ +	struct extent_map_tree *em_tree; +	struct extent_map *em; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	em_tree = &BTRFS_I(inode)->extent_tree; +	em = alloc_extent_map(); +	if (!em) +		return ERR_PTR(-ENOMEM); + +	em->start = start; +	em->orig_start = orig_start; +	em->len = len; +	em->block_len = block_len; +	em->block_start = block_start; +	em->bdev = root->fs_info->fs_devices->latest_bdev; +	set_bit(EXTENT_FLAG_PINNED, &em->flags); +	if (type == BTRFS_ORDERED_PREALLOC) +		set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + +	do { +		btrfs_drop_extent_cache(inode, em->start, +				em->start + em->len - 1, 0); +		write_lock(&em_tree->lock); +		ret = add_extent_mapping(em_tree, em); +		write_unlock(&em_tree->lock); +	} while (ret == -EEXIST); + +	if (ret) { +		free_extent_map(em); +		return ERR_PTR(ret); +	} + +	return em; +} + +  static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  				   struct buffer_head *bh_result, int create)  { @@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  			goto must_cow;  		if (can_nocow_odirect(trans, inode, start, len) == 1) { +			u64 orig_start = em->start; + +			if (type == BTRFS_ORDERED_PREALLOC) { +				free_extent_map(em); +				em = create_pinned_em(inode, start, len, +						       orig_start, +						       block_start, len, type); +				if (IS_ERR(em)) { +					btrfs_end_transaction(trans, root); +					goto unlock_err; +				} +			} +  			ret = btrfs_add_ordered_extent_dio(inode, start,  					   block_start, len, len, type);  			btrfs_end_transaction(trans, root); @@ -5999,7 +6110,8 @@ unlock:  	if (lockstart < lockend) {  		if (create && len < lockend - lockstart) {  			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -					 lockstart + len - 1, unlock_bits, 1, 0, +					 lockstart + len - 1, +					 unlock_bits | EXTENT_DEFRAG, 1, 0,  					 &cached_state, GFP_NOFS);  			/*  			 * Beside unlock, we also need to cleanup reserved space @@ -6007,8 +6119,8 @@ unlock:  			 */  			clear_extent_bit(&BTRFS_I(inode)->io_tree,  					 lockstart + len, lockend, -					 
unlock_bits | EXTENT_DO_ACCOUNTING, -					 1, 0, NULL, GFP_NOFS); +					 unlock_bits | EXTENT_DO_ACCOUNTING | +					 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);  		} else {  			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,  					 lockend, unlock_bits, 1, 0, @@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  		 */  		clear_extent_bit(tree, page_start, page_end,  				 EXTENT_DIRTY | EXTENT_DELALLOC | -				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, -				 &cached_state, GFP_NOFS); +				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | +				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);  		/*  		 * whoever cleared the private bit is responsible  		 * for the finish_ordered_io @@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  	}  	clear_extent_bit(tree, page_start, page_end,  		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | -		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); +		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, +		 &cached_state, GFP_NOFS);  	__btrfs_releasepage(page, GFP_NOFS);  	ClearPageChecked(page); @@ -6687,7 +6800,8 @@ again:  	 * prepare_pages in the normal write path.  	 */  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, -			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, +			  EXTENT_DIRTY | EXTENT_DELALLOC | +			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,  			  0, 0, &cached_state, GFP_NOFS);  	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -6718,6 +6832,7 @@ again:  	BTRFS_I(inode)->last_trans = root->fs_info->generation;  	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; +	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;  	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)  	u64 mask = root->sectorsize - 1;  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); -	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); +	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);  	if (ret)  		return ret; @@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)  	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for  	 * updating the inode.  	 */ -	rsv = btrfs_alloc_block_rsv(root); +	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);  	if (!rsv)  		return -ENOMEM;  	rsv->size = min_size; +	rsv->failfast = 1;  	/*  	 * 1 for the truncate slack space @@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)  					   &BTRFS_I(inode)->runtime_flags))  		btrfs_add_ordered_operation(trans, root, inode); -	while (1) { -		ret = btrfs_block_rsv_refill(root, rsv, min_size); -		if (ret) { -			/* -			 * This can only happen with the original transaction we -			 * started above, every other time we shouldn't have a -			 * transaction started yet. -			 */ -			if (ret == -EAGAIN) -				goto end_trans; -			err = ret; -			break; -		} - -		if (!trans) { -			/* Just need the 1 for updating the inode */ -			trans = btrfs_start_transaction(root, 1); -			if (IS_ERR(trans)) { -				ret = err = PTR_ERR(trans); -				trans = NULL; -				break; -			} -		} - -		trans->block_rsv = rsv; +	/* +	 * So if we truncate and then write and fsync we normally would just +	 * write the extents that changed, which is a problem if we need to +	 * first truncate that entire inode.  
So set this flag so we write out +	 * all of the extents in the inode to the sync log so we're completely +	 * safe. +	 */ +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); +	trans->block_rsv = rsv; +	while (1) {  		ret = btrfs_truncate_inode_items(trans, root, inode,  						 inode->i_size,  						 BTRFS_EXTENT_DATA_KEY); -		if (ret != -EAGAIN) { +		if (ret != -ENOSPC) {  			err = ret;  			break;  		} @@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)  			err = ret;  			break;  		} -end_trans: +  		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root); -		trans = NULL;  		btrfs_btree_balance_dirty(root, nr); + +		trans = btrfs_start_transaction(root, 2); +		if (IS_ERR(trans)) { +			ret = err = PTR_ERR(trans); +			trans = NULL; +			break; +		} + +		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, +					      rsv, min_size); +		BUG_ON(ret);	/* shouldn't happen */ +		trans->block_rsv = rsv;  	}  	if (ret == 0 && inode->i_nlink > 0) { @@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->csum_bytes = 0;  	ei->index_cnt = (u64)-1;  	ei->last_unlink_trans = 0; +	ei->last_log_commit = 0;  	spin_lock_init(&ei->lock);  	ei->outstanding_extents = 0; @@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)  int btrfs_init_cachep(void)  { -	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", +	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",  			sizeof(struct btrfs_inode), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);  	if (!btrfs_inode_cachep)  		goto fail; -	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", +	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",  			sizeof(struct btrfs_trans_handle), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_trans_handle_cachep)  		goto fail; -	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", +	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",  			sizeof(struct btrfs_transaction), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_transaction_cachep)  		goto fail; -	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", +	btrfs_path_cachep = kmem_cache_create("btrfs_path",  			sizeof(struct btrfs_path), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_path_cachep)  		goto fail; -	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", +	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",  			sizeof(struct btrfs_free_space), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_free_space_cachep) @@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  				       loff_t actual_len, u64 *alloc_hint,  				       struct btrfs_trans_handle *trans)  { +	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct extent_map *em;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_key ins;  	u64 cur_offset = start; @@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  		btrfs_drop_extent_cache(inode, cur_offset,  					cur_offset + ins.offset -1, 0); +		em = alloc_extent_map(); +		if (!em) { +			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +				&BTRFS_I(inode)->runtime_flags); +			goto next; +		} + +		em->start = cur_offset; +		em->orig_start = cur_offset; +		em->len = ins.offset; +		em->block_start = ins.objectid; +		em->block_len = ins.offset; +		em->bdev = 
root->fs_info->fs_devices->latest_bdev; +		set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		em->generation = trans->transid; + +		while (1) { +			write_lock(&em_tree->lock); +			ret = add_extent_mapping(em_tree, em); +			if (!ret) +				list_move(&em->list, +					  &em_tree->modified_extents); +			write_unlock(&em_tree->lock); +			if (ret != -EEXIST) +				break; +			btrfs_drop_extent_cache(inode, cur_offset, +						cur_offset + ins.offset - 1, +						0); +		} +		free_extent_map(em); +next:  		num_bytes -= ins.offset;  		cur_offset += ins.offset;  		*alloc_hint = ins.objectid + ins.offset; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47127c1bd29..e568c472f80 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  	int ret;  	u64 ip_oldflags;  	unsigned int i_oldflags; +	umode_t mode;  	if (btrfs_root_readonly(root))  		return -EROFS; @@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  	ip_oldflags = ip->flags;  	i_oldflags = inode->i_flags; +	mode = inode->i_mode;  	flags = btrfs_mask_flags(inode->i_mode, flags);  	oldflags = btrfs_flags_to_ioctl(ip->flags); @@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  		ip->flags |= BTRFS_INODE_DIRSYNC;  	else  		ip->flags &= ~BTRFS_INODE_DIRSYNC; -	if (flags & FS_NOCOW_FL) -		ip->flags |= BTRFS_INODE_NODATACOW; -	else -		ip->flags &= ~BTRFS_INODE_NODATACOW; +	if (flags & FS_NOCOW_FL) { +		if (S_ISREG(mode)) { +			/* +			 * It's safe to turn csums off here, no extents exist. +			 * Otherwise we want the flag to reflect the real COW +			 * status of the file and will not set it. +			 */ +			if (inode->i_size == 0) +				ip->flags |= BTRFS_INODE_NODATACOW +					   | BTRFS_INODE_NODATASUM; +		} else { +			ip->flags |= BTRFS_INODE_NODATACOW; +		} +	} else { +		/* +		 * Revert under the same assumptions as above +		 */ +		if (S_ISREG(mode)) { +			if (inode->i_size == 0) +				ip->flags &= ~(BTRFS_INODE_NODATACOW +				             | BTRFS_INODE_NODATASUM); +		} else { +			ip->flags &= ~BTRFS_INODE_NODATACOW; +		} +	}  	/*  	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS @@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,  	if (!pending_snapshot)  		return -ENOMEM; -	btrfs_init_block_rsv(&pending_snapshot->block_rsv); +	btrfs_init_block_rsv(&pending_snapshot->block_rsv, +			     BTRFS_BLOCK_RSV_TEMP);  	pending_snapshot->dentry = dentry;  	pending_snapshot->root = root;  	pending_snapshot->readonly = readonly;  	if (inherit) {  		*inherit = NULL;	/* take responsibility to free it */  	} -	trans = btrfs_start_transaction(root->fs_info->extent_root, 5); +	trans = btrfs_start_transaction(root->fs_info->extent_root, 6);  	if (IS_ERR(trans)) {  		ret = PTR_ERR(trans);  		goto fail; @@ -1022,8 +1046,8 @@ again:  			 page_start, page_end - 1, 0, &cached_state);  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,  			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | -			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, -			  GFP_NOFS); +			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, +			  &cached_state, GFP_NOFS);  	if (i_done != page_cnt) {  		spin_lock(&BTRFS_I(inode)->lock);  	} -	btrfs_set_extent_delalloc(inode, page_start, page_end - 1, -				  &cached_state); +	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, 
page_end - 1, +			  &cached_state, GFP_NOFS);  	unlock_extent_cached(&BTRFS_I(inode)->io_tree,  			     page_start, page_end - 1, &cached_state, @@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  	int ret;  	u64 len = olen;  	u64 bs = root->fs_info->sb->s_blocksize; -	u64 hint_byte;  	/*  	 * TODO: @@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  	   another, and lock file content */  	while (1) {  		struct btrfs_ordered_extent *ordered; -		lock_extent(&BTRFS_I(src)->io_tree, off, off+len); -		ordered = btrfs_lookup_first_ordered_extent(src, off+len); +		lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); +		ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);  		if (!ordered && -		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, -				   EXTENT_DELALLOC, 0, NULL)) +		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1, +				    EXTENT_DELALLOC, 0, NULL))  			break; -		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); +		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);  		if (ordered)  			btrfs_put_ordered_extent(ordered);  		btrfs_wait_ordered_range(src, off, len); @@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  			btrfs_release_path(path);  			if (key.offset + datal <= off || -			    key.offset >= off+len) +			    key.offset >= off + len - 1)  				goto next;  			memcpy(&new_key, &key, sizeof(new_key)); @@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  					datal -= off - key.offset;  				} -				ret = btrfs_drop_extents(trans, inode, +				ret = btrfs_drop_extents(trans, root, inode,  							 new_key.offset,  							 new_key.offset + datal, -							 &hint_byte, 1); +							 1);  				if (ret) {  					btrfs_abort_transaction(trans, root,  								ret); @@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  					new_key.offset += skip;  				} -				if (key.offset + datal > off+len) -					trim = key.offset + datal - (off+len); +				if (key.offset + datal > off + len) +					trim = key.offset + datal - (off + len);  				if (comp && (skip || trim)) {  					ret = -EINVAL; @@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  				size -= skip + trim;  				datal -= skip + trim; -				ret = btrfs_drop_extents(trans, inode, +				ret = btrfs_drop_extents(trans, root, inode,  							 new_key.offset,  							 new_key.offset + datal, -							 &hint_byte, 1); +							 1);  				if (ret) {  					btrfs_abort_transaction(trans, root,  								ret); @@ -2715,7 +2738,7 @@ next:  	ret = 0;  out:  	btrfs_release_path(path); -	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); +	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);  out_unlock:  	mutex_unlock(&src->i_mutex);  	mutex_unlock(&inode->i_mutex); @@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	return 0;  } -static void get_block_group_info(struct list_head *groups_list, -				 struct btrfs_ioctl_space_info *space) +void btrfs_get_block_group_info(struct list_head *groups_list, +				struct btrfs_ioctl_space_info *space)  {  	struct btrfs_block_group_cache *block_group; @@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)  		down_read(&info->groups_sem);  		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {  			if 
(!list_empty(&info->block_groups[c])) { -				get_block_group_info(&info->block_groups[c], -						     &space); +				btrfs_get_block_group_info( +					&info->block_groups[c], &space);  				memcpy(dest, &space, sizeof(space));  				dest++;  				space_args.total_spaces++; @@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,  {  	int ret = 0;  	int size; -	u64 extent_item_pos;  	struct btrfs_ioctl_logical_ino_args *loi;  	struct btrfs_data_container *inodes = NULL;  	struct btrfs_path *path = NULL; -	struct btrfs_key key;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,  		goto out;  	} -	size = min_t(u32, loi->size, 4096); +	size = min_t(u32, loi->size, 64 * 1024);  	inodes = init_data_container(size);  	if (IS_ERR(inodes)) {  		ret = PTR_ERR(inodes); @@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,  		goto out;  	} -	ret = extent_from_logical(root->fs_info, loi->logical, path, &key); -	btrfs_release_path(path); - -	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) +	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, +					  build_ino_list, inodes); +	if (ret == -EINVAL)  		ret = -ENOENT;  	if (ret < 0)  		goto out; -	extent_item_pos = loi->logical - key.objectid; -	ret = iterate_extent_inodes(root->fs_info, key.objectid, -					extent_item_pos, 0, build_ino_list, -					inodes); - -	if (ret < 0) -		goto out; -  	ret = copy_to_user((void *)(unsigned long)loi->inodes,  			   (void *)(unsigned long)inodes, size);  	if (ret) @@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,  out:  	btrfs_free_path(path); -	kfree(inodes); +	vfree(inodes);  	kfree(loi);  	return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 051c7fe551d..7772f02ba28 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -25,6 +25,8 @@  #include "btrfs_inode.h"  #include "extent_io.h" +static struct kmem_cache *btrfs_ordered_extent_cache; +  static u64 entry_end(struct btrfs_ordered_extent *entry)  {  	if (entry->file_offset + entry->len < entry->file_offset) @@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	struct btrfs_ordered_extent *entry;  	tree = &BTRFS_I(inode)->ordered_tree; -	entry = kzalloc(sizeof(*entry), GFP_NOFS); +	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);  	if (!entry)  		return -ENOMEM; @@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)  			list_del(&sum->list);  			kfree(sum);  		} -		kfree(entry); +		kmem_cache_free(btrfs_ordered_extent_cache, entry);  	}  } @@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,   * wait for all the ordered extents in a root.  This is done when balancing   * space between drives.   
*/ -void btrfs_wait_ordered_extents(struct btrfs_root *root, -				int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)  {  	struct list_head splice;  	struct list_head *cur; @@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,  		cur = splice.next;  		ordered = list_entry(cur, struct btrfs_ordered_extent,  				     root_extent_list); -		if (nocow_only && -		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && -		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { -			list_move(&ordered->root_extent_list, -				  &root->fs_info->ordered_extents); -			cond_resched_lock(&root->fs_info->ordered_extent_lock); -			continue; -		} -  		list_del_init(&ordered->root_extent_list);  		atomic_inc(&ordered->refs); @@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;  	u64 disk_i_size;  	u64 new_i_size; -	u64 i_size_test;  	u64 i_size = i_size_read(inode);  	struct rb_node *node;  	struct rb_node *prev = NULL; @@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  			break;  		if (test->file_offset >= i_size)  			break; -		if (test->file_offset >= disk_i_size) +		if (test->file_offset >= disk_i_size) { +			/* +			 * we don't update disk_i_size now, so record this +			 * pending i_size; otherwise we will not know the +			 * real i_size. +			 */ +			if (test->outstanding_isize < offset) +				test->outstanding_isize = offset; +			if (ordered && +			    ordered->outstanding_isize > +			    test->outstanding_isize) +				test->outstanding_isize = +						ordered->outstanding_isize;  			goto out; -	} -	new_i_size = min_t(u64, offset, i_size); - -	/* -	 * at this point, we know we can safely update i_size to at least -	 * the offset from this ordered extent.  But, we need to -	 * walk forward and see if ios from higher up in the file have -	 * finished. -	 */ -	if (ordered) { -		node = rb_next(&ordered->rb_node); -	} else { -		if (prev) -			node = rb_next(prev); -		else -			node = rb_first(&tree->tree); -	} - -	/* -	 * We are looking for an area between our current extent and the next -	 * ordered extent to update the i_size to.  There are 3 cases here -	 * -	 * 1) We don't actually have anything and we can update to i_size. -	 * 2) We have stuff but they already did their i_size update so again we -	 * can just update to i_size. -	 * 3) We have an outstanding ordered extent so the most we can update -	 * our disk_i_size to is the start of the next offset. -	 */ -	i_size_test = i_size; -	for (; node; node = rb_next(node)) { -		test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - -		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) -			continue; -		if (test->file_offset > offset) { -			i_size_test = test->file_offset; -			break;  		}  	} +	new_i_size = min_t(u64, offset, i_size);  	/* -	 * i_size_test is the end of a region after this ordered -	 * extent where there are no ordered extents, we can safely set -	 * disk_i_size to this. +	 * Some ordered extents may have completed before the current one, and +	 * we hold the real i_size in ->outstanding_isize.  	 
*/ -	if (i_size_test > offset) -		new_i_size = min_t(u64, i_size_test, i_size); +	if (ordered && ordered->outstanding_isize > new_i_size) +		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);  	BTRFS_I(inode)->disk_i_size = new_i_size;  	ret = 0;  out: @@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  	}  	spin_unlock(&root->fs_info->ordered_extent_lock);  } + +int __init ordered_data_init(void) +{ +	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", +				     sizeof(struct btrfs_ordered_extent), 0, +				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, +				     NULL); +	if (!btrfs_ordered_extent_cache) +		return -ENOMEM; +	return 0; +} + +void ordered_data_exit(void) +{ +	if (btrfs_ordered_extent_cache) +		kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e03c560d299..dd27a0b46a3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -96,6 +96,13 @@ struct btrfs_ordered_extent {  	/* number of bytes that still need writing */  	u64 bytes_left; +	/* +	 * the end of the ordered extent which is behind it but +	 * didn't update disk_i_size. Please see the comment of +	 * btrfs_ordered_update_i_size(); +	 */ +	u64 outstanding_isize; +  	/* flags (described above) */  	unsigned long flags; @@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root,  				 struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, -				int nocow_only, int delay_iput); +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +int __init ordered_data_init(void); +void ordered_data_exit(void);  #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b6501558174..5039686df6a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  		ulist_reinit(tmp);  						/* XXX id not needed */ -		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); +		ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);  		ULIST_ITER_INIT(&tmp_uiter);  		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {  			struct btrfs_qgroup_list *glist; -			qg = (struct btrfs_qgroup *)tmp_unode->aux; +			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;  			if (qg->refcnt < seq)  				qg->refcnt = seq + 1;  			else @@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  			list_for_each_entry(glist, &qg->groups, next_group) {  				ulist_add(tmp, glist->group->qgroupid, -					  (unsigned long)glist->group, +					  (u64)(uintptr_t)glist->group,  					  GFP_ATOMIC);  			}  		} @@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  	 * step 2: walk from the new root  	 */  	ulist_reinit(tmp); -	ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); +	ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);  	ULIST_ITER_INIT(&uiter);  	while ((unode = ulist_next(tmp, &uiter))) {  		struct btrfs_qgroup *qg;  		struct btrfs_qgroup_list *glist; -		qg = (struct btrfs_qgroup *)unode->aux; +		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;  		if (qg->refcnt < seq) {  			/* not visited by step 1 */  			qg->rfer += sgn * node->num_bytes; @@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  		list_for_each_entry(glist, 
&qg->groups, next_group) {  			ulist_add(tmp, glist->group->qgroupid, -				  (unsigned long)glist->group, GFP_ATOMIC); +				  (uintptr_t)glist->group, GFP_ATOMIC);  		}  	} @@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  			continue;  		ulist_reinit(tmp); -		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); +		ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);  		ULIST_ITER_INIT(&tmp_uiter);  		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {  			struct btrfs_qgroup_list *glist; -			qg = (struct btrfs_qgroup *)tmp_unode->aux; +			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;  			if (qg->tag == seq)  				continue; @@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,  			list_for_each_entry(glist, &qg->groups, next_group) {  				ulist_add(tmp, glist->group->qgroupid, -					  (unsigned long)glist->group, +					  (uintptr_t)glist->group,  					  GFP_ATOMIC);  			}  		} @@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  	 * be exceeded  	 */  	ulist = ulist_alloc(GFP_ATOMIC); -	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); +	if (!ulist) { +		ret = -ENOMEM; +		goto out; +	} +	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);  	ULIST_ITER_INIT(&uiter);  	while ((unode = ulist_next(ulist, &uiter))) {  		struct btrfs_qgroup *qg;  		struct btrfs_qgroup_list *glist; -		qg = (struct btrfs_qgroup *)unode->aux; +		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;  		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&  		    qg->reserved + qg->rfer + num_bytes > @@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  		list_for_each_entry(glist, &qg->groups, next_group) {  			ulist_add(ulist, glist->group->qgroupid, -				  (unsigned long)glist->group, GFP_ATOMIC); +				  (uintptr_t)glist->group, GFP_ATOMIC);  		}  	}  	if (ret) @@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  	while ((unode = ulist_next(ulist, &uiter))) {  		struct btrfs_qgroup *qg; -		qg = (struct btrfs_qgroup *)unode->aux; +		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;  		qg->reserved += num_bytes;  	} @@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)  		goto out;  	ulist = ulist_alloc(GFP_ATOMIC); -	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); +	if (!ulist) { +		btrfs_std_error(fs_info, -ENOMEM); +		goto out; +	} +	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);  	ULIST_ITER_INIT(&uiter);  	while ((unode = ulist_next(ulist, &uiter))) {  		struct btrfs_qgroup *qg;  		struct btrfs_qgroup_list *glist; -		qg = (struct btrfs_qgroup *)unode->aux; +		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;  		qg->reserved -= num_bytes;  		list_for_each_entry(glist, &qg->groups, next_group) {  			ulist_add(ulist, glist->group->qgroupid, -				  (unsigned long)glist->group, GFP_ATOMIC); +				  (uintptr_t)glist->group, GFP_ATOMIC);  		}  	} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4da08652004..776f0aa128f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,  	key.offset = 0;  	inode = btrfs_iget(fs_info->sb, &key, root, NULL); -	if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { -		if (inode && !IS_ERR(inode)) +	if (IS_ERR(inode) || is_bad_inode(inode)) { +		if (!IS_ERR(inode))  			
iput(inode);  		return -ENOENT;  	} @@ -3621,7 +3621,7 @@ next:  		ret = find_first_extent_bit(&rc->processed_blocks,  					    key.objectid, &start, &end, -					    EXTENT_DIRTY); +					    EXTENT_DIRTY, NULL);  		if (ret == 0 && start <= key.objectid) {  			btrfs_release_path(path); @@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)  	struct btrfs_trans_handle *trans;  	int ret; -	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); +	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, +					      BTRFS_BLOCK_RSV_TEMP);  	if (!rc->block_rsv)  		return -ENOMEM; @@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)  	       (unsigned long long)rc->block_group->flags);  	btrfs_start_delalloc_inodes(fs_info->tree_root, 0); -	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); +	btrfs_wait_ordered_extents(fs_info->tree_root, 0);  	while (1) {  		mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 10d8e4d8807..eb923d087da 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root  		return -ENOMEM;  	ret = btrfs_search_slot(trans, root, key, path, 0, 1); -	if (ret < 0) -		goto out_abort; +	if (ret < 0) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	if (ret != 0) {  		btrfs_print_leaf(root, path->nodes[0]); @@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root  		btrfs_release_path(path);  		ret = btrfs_search_slot(trans, root, key, path,  				-1, 1); -		if (ret < 0) -			goto out_abort; +		if (ret < 0) { +			btrfs_abort_transaction(trans, root, ret); +			goto out; +		} +  		ret = btrfs_del_item(trans, root, path); -		if (ret < 0) -			goto out_abort; +		if (ret < 0) { +			btrfs_abort_transaction(trans, root, ret); +			goto out; +		}  		btrfs_release_path(path);  		ret = btrfs_insert_empty_item(trans, root, path,  				key, sizeof(*item)); -		if (ret < 0) -			goto out_abort; +		if (ret < 0) { +			btrfs_abort_transaction(trans, root, ret); +			goto out; +		}  		l = path->nodes[0];  		slot = path->slots[0];  		ptr = btrfs_item_ptr_offset(l, slot); @@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root  out:  	btrfs_free_path(path);  	return ret; - -out_abort: -	btrfs_abort_transaction(trans, root, ret); -	goto out;  }  int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b223620cd5a..27892f67e69 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	struct extent_buffer *eb;  	struct btrfs_extent_item *ei;  	struct scrub_warning swarn; -	u32 item_size; -	int ret; +	unsigned long ptr = 0; +	u64 extent_item_pos; +	u64 flags = 0;  	u64 ref_root; +	u32 item_size;  	u8 ref_level; -	unsigned long ptr = 0;  	const int bufsize = 4096; -	u64 extent_item_pos; +	int ret;  	path = btrfs_alloc_path(); @@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	if (!path || !swarn.scratch_buf || !swarn.msg_buf)  		goto out; -	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); +	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, +				  &flags);  	if (ret < 0)  		goto out; @@ -387,7 +389,7 @@ static void scrub_print_warning(const char 
*errstr, struct scrub_block *sblock)  	item_size = btrfs_item_size_nr(eb, path->slots[0]);  	btrfs_release_path(path); -	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		do {  			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,  							&ref_root, &ref_level); @@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  				spin_lock(&sdev->stat_lock);  				sdev->stat.malloc_errors++;  				spin_unlock(&sdev->stat_lock); +				kfree(bbio);  				return -ENOMEM;  			}  			sblock->page_count++; @@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)  		scrub_block_put(sblock);  	} -	if (sbio->err) { -		/* what is this good for??? */ -		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); -		sbio->bio->bi_flags |= 1 << BIO_UPTODATE; -		sbio->bio->bi_phys_segments = 0; -		sbio->bio->bi_idx = 0; - -		for (i = 0; i < sbio->page_count; i++) { -			struct bio_vec *bi; -			bi = &sbio->bio->bi_io_vec[i]; -			bi->bv_offset = 0; -			bi->bv_len = PAGE_SIZE; -		} -	} -  	bio_put(sbio->bio);  	sbio->bio = NULL;  	spin_lock(&sdev->list_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fb5ffe95f86..c7beb543a4a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -107,7 +107,6 @@ struct send_ctx {  	int cur_inode_new;  	int cur_inode_new_gen;  	int cur_inode_deleted; -	int cur_inode_first_ref_orphan;  	u64 cur_inode_size;  	u64 cur_inode_mode; @@ -126,7 +125,15 @@ struct send_ctx {  struct name_cache_entry {  	struct list_head list; -	struct list_head use_list; +	/* +	 * radix_tree has only 32bit entries but we need to handle 64bit inums. +	 * We use the lower 32bit of the 64bit inum to store it in the tree. If +	 * more than one inum would fall into the same entry, we use radix_list +	 * to store the additional entries. radix_list is also used to store +	 * entries that share an inum but have different +	 * generations. 
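+	 *
+	 * Assumed lookup pattern, as a sketch:
+	 *
+	 *	nce_head = radix_tree_lookup(&sctx->name_cache,
+	 *				     (unsigned long)ino);
+	 *	list_for_each_entry(nce, nce_head, radix_list)
+	 *		if (nce->ino == ino && nce->gen == gen)
+	 *			return nce;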
+	 */ +	struct list_head radix_list;  	u64 ino;  	u64 gen;  	u64 parent_ino; @@ -328,6 +335,7 @@ out:  	return ret;  } +#if 0  static void fs_path_remove(struct fs_path *p)  {  	BUG_ON(p->reversed); @@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)  		p->end--;  	*p->end = 0;  } +#endif  static int fs_path_copy(struct fs_path *p, struct fs_path *from)  { @@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)  	return path;  } -static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)  {  	int ret;  	mm_segment_t old_fs; @@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)  	set_fs(KERNEL_DS);  	while (pos < len) { -		ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, -				&sctx->send_off); +		ret = vfs_write(filp, (char *)buf + pos, len - pos, off);  		/* TODO handle that correctly */  		/*if (ret == -ERESTARTSYS) {  			continue; @@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)  	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);  	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); -	return write_buf(sctx, &hdr, sizeof(hdr)); +	return write_buf(sctx->send_filp, &hdr, sizeof(hdr), +					&sctx->send_off);  }  /* @@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)  	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);  	hdr->crc = cpu_to_le32(crc); -	ret = write_buf(sctx, sctx->send_buf, sctx->send_size); +	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, +					&sctx->send_off);  	sctx->total_send_size += sctx->send_size;  	sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; @@ -687,7 +697,8 @@ out:   */  static int get_inode_info(struct btrfs_root *root,  			  u64 ino, u64 *size, u64 *gen, -			  u64 *mode, u64 *uid, u64 *gid) +			  u64 *mode, u64 *uid, u64 *gid, +			  u64 *rdev)  {  	int ret;  	struct btrfs_inode_item *ii; @@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,  		*uid = btrfs_inode_uid(path->nodes[0], ii);  	if (gid)  		*gid = btrfs_inode_gid(path->nodes[0], ii); +	if (rdev) +		*rdev = btrfs_inode_rdev(path->nodes[0], ii);  out:  	btrfs_free_path(path); @@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,  	struct extent_buffer *eb;  	struct btrfs_item *item;  	struct btrfs_dir_item *di; -	struct btrfs_path *tmp_path = NULL;  	struct btrfs_key di_key;  	char *buf = NULL;  	char *buf2 = NULL; @@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,  		goto out;  	} -	tmp_path = alloc_path_for_send(); -	if (!tmp_path) { -		ret = -ENOMEM; -		goto out; -	} -  	eb = path->nodes[0];  	slot = path->slots[0];  	item = btrfs_item_nr(eb, slot); @@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,  	}  out: -	btrfs_free_path(tmp_path);  	if (buf_virtual)  		vfree(buf);  	else @@ -1026,12 +1031,12 @@ struct backref_ctx {  	u64 extent_len;  	/* Just to check for bugs in backref resolving */ -	int found_in_send_root; +	int found_itself;  };  static int __clone_root_cmp_bsearch(const void *key, const void *elt)  { -	u64 root = (u64)key; +	u64 root = (u64)(uintptr_t)key;  	struct clone_root *cr = (struct clone_root *)elt;  	if (root < cr->root->objectid) @@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)  /*   * Called for every backref that is found for the current extent. 
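Stepping back to the write_buf() rework above: the loop matters because vfs_write() may return a short count. Below is a minimal sketch of the retry logic, under the assumption that a zero-byte write should be treated as an error; the set_fs(KERNEL_DS) bracketing that send.c puts around the loop is omitted, and the helper name is invented:

static int write_all(struct file *filp, const void *buf, u32 len, loff_t *off)
{
	u32 pos = 0;
	int ret;

	while (pos < len) {
		/* vfs_write() advances *off and may write less than asked */
		ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
		if (ret < 0)
			return ret;	/* the patch's TODO: retry -ERESTARTSYS */
		if (ret == 0)
			return -EIO;	/* assumption: no progress is an error */
		pos += ret;
	}
	return 0;
}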
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs   */  static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  { @@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  	u64 i_size;  	/* First check if the root is in the list of accepted clone sources */ -	found = bsearch((void *)root, bctx->sctx->clone_roots, +	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,  			bctx->sctx->clone_roots_cnt,  			sizeof(struct clone_root),  			__clone_root_cmp_bsearch); @@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  	if (found->root == bctx->sctx->send_root &&  	    ino == bctx->cur_objectid &&  	    offset == bctx->cur_offset) { -		bctx->found_in_send_root = 1; +		bctx->found_itself = 1;  	}  	/* -	 * There are inodes that have extents that lie behind it's i_size. Don't +	 * There are inodes that have extents that lie behind its i_size. Don't  	 * accept clones from these extents.  	 */ -	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); +	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, +			NULL);  	if (ret < 0)  		return ret; @@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  		 */  		if (ino >= bctx->cur_objectid)  			return 0; -		/*if (ino > ctx->cur_objectid) +#if 0 +		if (ino > bctx->cur_objectid)  			return 0; -		if (offset + ctx->extent_len > ctx->cur_offset) -			return 0;*/ - -		bctx->found++; -		found->found_refs++; -		found->ino = ino; -		found->offset = offset; -		return 0; +		if (offset + bctx->extent_len > bctx->cur_offset) +			return 0; +#endif  	}  	bctx->found++; @@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  }  /* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + *   * path must point to the extent item when called.   */  static int find_extent_clone(struct send_ctx *sctx, @@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,  	int ret;  	int extent_type;  	u64 logical; +	u64 disk_byte;  	u64 num_bytes;  	u64 extent_item_pos; +	u64 flags = 0;  	struct btrfs_file_extent_item *fi;  	struct extent_buffer *eb = path->nodes[0]; -	struct backref_ctx backref_ctx; +	struct backref_ctx *backref_ctx = NULL;  	struct clone_root *cur_clone_root;  	struct btrfs_key found_key;  	struct btrfs_path *tmp_path; +	int compressed;  	u32 i;  	tmp_path = alloc_path_for_send();  	if (!tmp_path)  		return -ENOMEM; +	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); +	if (!backref_ctx) { +		ret = -ENOMEM; +		goto out; +	} +  	if (data_offset >= ino_size) {  		/*  		 * There may be extents that lie behind the file's size. 
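The (uintptr_t) casts in __iterate_backrefs above deserve a note: bsearch() passes the key as a void *, and here the u64 root objectid is carried inside the pointer value itself rather than by address. On 32bit builds a direct cast from u64 to void * draws a size-mismatch warning, so the value goes through uintptr_t; the round trip is only lossless because root objectids fit in 32 bits. A standalone illustration with hypothetical stub types (plain C, not from the patch):

#include <stdint.h>
#include <stdlib.h>

struct clone_root_stub {
	uint64_t objectid;	/* the array is sorted by this field */
};

/* comparator in the style of __clone_root_cmp_bsearch */
static int clone_root_cmp(const void *key, const void *elt)
{
	uint64_t root = (uint64_t)(uintptr_t)key;
	const struct clone_root_stub *cr = elt;

	if (root < cr->objectid)
		return -1;
	if (root > cr->objectid)
		return 1;
	return 0;
}

/* lookup site, mirroring the patched call:
 *   found = bsearch((void *)(uintptr_t)root, arr, n, sizeof(*arr),
 *                   clone_root_cmp);
 */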
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,  		ret = -ENOENT;  		goto out;  	} +	compressed = btrfs_file_extent_compression(eb, fi);  	num_bytes = btrfs_file_extent_num_bytes(eb, fi); -	logical = btrfs_file_extent_disk_bytenr(eb, fi); -	if (logical == 0) { +	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); +	if (disk_byte == 0) {  		ret = -ENOENT;  		goto out;  	} -	logical += btrfs_file_extent_offset(eb, fi); +	logical = disk_byte + btrfs_file_extent_offset(eb, fi); -	ret = extent_from_logical(sctx->send_root->fs_info, -			logical, tmp_path, &found_key); +	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, +				  &found_key, &flags);  	btrfs_release_path(tmp_path);  	if (ret < 0)  		goto out; -	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		ret = -EIO;  		goto out;  	} @@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,  		cur_clone_root->found_refs = 0;  	} -	backref_ctx.sctx = sctx; -	backref_ctx.found = 0; -	backref_ctx.cur_objectid = ino; -	backref_ctx.cur_offset = data_offset; -	backref_ctx.found_in_send_root = 0; -	backref_ctx.extent_len = num_bytes; +	backref_ctx->sctx = sctx; +	backref_ctx->found = 0; +	backref_ctx->cur_objectid = ino; +	backref_ctx->cur_offset = data_offset; +	backref_ctx->found_itself = 0; +	backref_ctx->extent_len = num_bytes;  	/*  	 * The last extent of a file may be too large due to page alignment. @@ -1215,25 +1234,30 @@ static int find_extent_clone(struct send_ctx *sctx,  	 * __iterate_backrefs work.  	 */  	if (data_offset + num_bytes >= ino_size) -		backref_ctx.extent_len = ino_size - data_offset; +		backref_ctx->extent_len = ino_size - data_offset;  	/*  	 * Now collect all backrefs.  	 */ +	if (compressed == BTRFS_COMPRESS_NONE) +		extent_item_pos = logical - found_key.objectid; +	else +		extent_item_pos = 0; + -	extent_item_pos = logical - found_key.objectid;  	ret = iterate_extent_inodes(sctx->send_root->fs_info,  					found_key.objectid, extent_item_pos, 1, -					__iterate_backrefs, &backref_ctx); +					__iterate_backrefs, backref_ctx); +  	if (ret < 0)  		goto out; -	if (!backref_ctx.found_in_send_root) { +	if (!backref_ctx->found_itself) {  		/* found a bug in backref code? */  		ret = -EIO;  		printk(KERN_ERR "btrfs: ERROR did not find backref in "  				"send_root. 
inode=%llu, offset=%llu, " -				"logical=%llu\n", -				ino, data_offset, logical); +				"disk_byte=%llu found extent=%llu\n", +				ino, data_offset, disk_byte, found_key.objectid);  		goto out;  	} @@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "  		"num_bytes=%llu, logical=%llu\n",  		data_offset, ino, num_bytes, logical); -	if (!backref_ctx.found) +	if (!backref_ctx->found)  		verbose_printk("btrfs:    no clones found\n");  	cur_clone_root = NULL; @@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "  			else if (sctx->clone_roots[i].root == sctx->send_root)  				/* prefer clones from send_root over others */  				cur_clone_root = sctx->clone_roots + i; -			break;  		}  	} @@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "  out:  	btrfs_free_path(tmp_path); +	kfree(backref_ctx);  	return ret;  } @@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,  	len = btrfs_file_extent_inline_len(path->nodes[0], ei);  	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); -	if (ret < 0) -		goto out;  out:  	btrfs_free_path(path); @@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)  	u64 right_gen;  	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, -			NULL); +			NULL, NULL);  	if (ret < 0 && ret != -ENOENT)  		goto out;  	left_ret = ret; @@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)  		right_ret = -ENOENT;  	} else {  		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, -				NULL, NULL, NULL); +				NULL, NULL, NULL, NULL);  		if (ret < 0 && ret != -ENOENT)  			goto out;  		right_ret = ret;  	}  	if (!left_ret && !right_ret) { -		if (left_gen == gen && right_gen == gen) +		if (left_gen == gen && right_gen == gen) {  			ret = inode_state_no_change; -		else if (left_gen == gen) { +		} else if (left_gen == gen) {  			if (ino < sctx->send_progress)  				ret = inode_state_did_create;  			else @@ -1516,6 +1539,10 @@ out:  	return ret;  } +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */  static int get_first_ref(struct send_ctx *sctx,  			 struct btrfs_root *root, u64 ino,  			 u64 *dir, u64 *dir_gen, struct fs_path *name) @@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,  	btrfs_release_path(path);  	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, -			NULL); +			NULL, NULL);  	if (ret < 0)  		goto out; @@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,  	if (ret < 0)  		goto out; -	if (name_len != fs_path_len(tmp_name)) { +	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {  		ret = 0;  		goto out;  	} -	ret = memcmp(tmp_name->start, name, name_len); -	if (ret) -		ret = 0; -	else -		ret = 1; +	ret = !memcmp(tmp_name->start, name, name_len);  out:  	fs_path_free(sctx, tmp_name);  	return ret;  } +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. 
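Returning briefly to find_extent_clone above: the compression check around extent_item_pos is worth a worked example, with invented numbers. For a compressed extent the file extent item references the on-disk extent as a whole, so the backref walk has to start at position 0 rather than at a byte offset inside the extent:

/*
 * Invented numbers: the extent item sits at bytenr 1048576
 * (found_key.objectid); the file extent has disk_bytenr 1048576 and
 * offset 16384, so logical = 1048576 + 16384 = 1064960.
 *
 *   uncompressed: extent_item_pos = 1064960 - 1048576 = 16384
 *   compressed:   extent_item_pos = 0 (extent referenced as a whole)
 */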
+ * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */  static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,  			      const char *name, int name_len,  			      u64 *who_ino, u64 *who_gen) @@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,  		goto out;  	} +	/* +	 * Check if the overwritten ref was already processed. If yes, the ref +	 * was already unlinked/moved, so we can safely assume that we will not +	 * overwrite anything at this point in time. +	 */  	if (other_inode > sctx->send_progress) {  		ret = get_inode_info(sctx->parent_root, other_inode, NULL, -				who_gen, NULL, NULL, NULL); +				who_gen, NULL, NULL, NULL, NULL);  		if (ret < 0)  			goto out; @@ -1642,6 +1680,13 @@ out:  	return ret;  } +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */  static int did_overwrite_ref(struct send_ctx *sctx,  			    u64 dir, u64 dir_gen,  			    u64 ino, u64 ino_gen, @@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,  	}  	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, -			NULL); +			NULL, NULL);  	if (ret < 0)  		goto out; @@ -1690,6 +1735,11 @@ out:  	return ret;  } +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */  static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)  {  	int ret = 0; @@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)  	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,  			name->start, fs_path_len(name)); -	if (ret < 0) -		goto out;  out:  	fs_path_free(sctx, name);  	return ret;  } +/* + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * so we need to do some special handling in case we have clashes. This function + * takes care of this with the help of name_cache_entry::radix_list. + * In case of error, nce is kfreed. 
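The clash described above is plain 32bit truncation of the 64bit inum. A standalone userspace illustration (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t a = 257;			/* a small inum */
	uint64_t b = ((uint64_t)1 << 32) + 257;	/* same low 32 bits */

	/*
	 * On a 32bit kernel the radix tree index is an unsigned long, so
	 * both inums select the same slot; the entries are then told
	 * apart on the per-slot radix_list by their full ino/gen.
	 */
	printf("%u %u\n", (unsigned)a, (unsigned)b);	/* prints: 257 257 */
	return 0;
}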
+ */  static int name_cache_insert(struct send_ctx *sctx,  			     struct name_cache_entry *nce)  {  	int ret = 0; -	struct name_cache_entry **ncea; +	struct list_head *nce_head; -	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); -	if (ncea) { -		if (!ncea[0]) -			ncea[0] = nce; -		else if (!ncea[1]) -			ncea[1] = nce; -		else -			BUG(); -	} else { -		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS); -		if (!ncea) +	nce_head = radix_tree_lookup(&sctx->name_cache, +			(unsigned long)nce->ino); +	if (!nce_head) { +		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); +		if (!nce_head)  			return -ENOMEM; +		INIT_LIST_HEAD(nce_head); -		ncea[0] = nce; -		ncea[1] = NULL; -		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); -		if (ret < 0) +		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); +		if (ret < 0) { +			kfree(nce_head); +			kfree(nce);  			return ret; +		}  	} +	list_add_tail(&nce->radix_list, nce_head);  	list_add_tail(&nce->list, &sctx->name_cache_list);  	sctx->name_cache_size++; @@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,  static void name_cache_delete(struct send_ctx *sctx,  			      struct name_cache_entry *nce)  { -	struct name_cache_entry **ncea; - -	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); -	BUG_ON(!ncea); - -	if (ncea[0] == nce) -		ncea[0] = NULL; -	else if (ncea[1] == nce) -		ncea[1] = NULL; -	else -		BUG(); +	struct list_head *nce_head; -	if (!ncea[0] && !ncea[1]) { -		radix_tree_delete(&sctx->name_cache, nce->ino); -		kfree(ncea); -	} +	nce_head = radix_tree_lookup(&sctx->name_cache, +			(unsigned long)nce->ino); +	BUG_ON(!nce_head); +	list_del(&nce->radix_list);  	list_del(&nce->list); -  	sctx->name_cache_size--; + +	if (list_empty(nce_head)) { +		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); +		kfree(nce_head); +	}  }  static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,  						    u64 ino, u64 gen)  { -	struct name_cache_entry **ncea; +	struct list_head *nce_head; +	struct name_cache_entry *cur; -	ncea = radix_tree_lookup(&sctx->name_cache, ino); -	if (!ncea) +	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); +	if (!nce_head)  		return NULL; -	if (ncea[0] && ncea[0]->gen == gen) -		return ncea[0]; -	else if (ncea[1] && ncea[1]->gen == gen) -		return ncea[1]; +	list_for_each_entry(cur, nce_head, radix_list) { +		if (cur->ino == ino && cur->gen == gen) +			return cur; +	}  	return NULL;  } +/* + * Removes the entry from the list and adds it back to the end. This marks the + * entry as recently used so that name_cache_clean_unused does not remove it. + */  static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)  {  	list_del(&nce->list);  	list_add_tail(&nce->list, &sctx->name_cache_list);  } +/* + * Remove some entries from the beginning of name_cache_list. + */  static void name_cache_clean_unused(struct send_ctx *sctx)  {  	struct name_cache_entry *nce; @@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)  static void name_cache_free(struct send_ctx *sctx)  {  	struct name_cache_entry *nce; -	struct name_cache_entry *tmp; -	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { +	while (!list_empty(&sctx->name_cache_list)) { +		nce = list_entry(sctx->name_cache_list.next, +				struct name_cache_entry, list);  		name_cache_delete(sctx, nce); +		kfree(nce);  	}  } +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. 
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */  static int __get_cur_name_and_parent(struct send_ctx *sctx,  				     u64 ino, u64 gen,  				     u64 *parent_ino, @@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  	struct btrfs_path *path = NULL;  	struct name_cache_entry *nce = NULL; +	/* +	 * First check if we already did a call to this function with the same +	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes +	 * return the cached result. +	 */  	nce = name_cache_search(sctx, ino, gen);  	if (nce) {  		if (ino < sctx->send_progress && nce->need_later_update) { @@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  	if (!path)  		return -ENOMEM; +	/* +	 * If the inode is not existent yet, add the orphan name and return 1. +	 * This should only happen for the parent dir that we determine in +	 * __record_new_ref +	 */  	ret = is_inode_existent(sctx, ino, gen);  	if (ret < 0)  		goto out; @@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  		goto out_cache;  	} +	/* +	 * Depending on whether the inode was already processed or not, use +	 * send_root or parent_root for ref lookup. +	 */  	if (ino < sctx->send_progress)  		ret = get_first_ref(sctx, sctx->send_root, ino,  				parent_ino, parent_gen, dest); @@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  	if (ret < 0)  		goto out; +	/* +	 * Check if the ref was overwritten by an inode's ref that was processed +	 * earlier. If yes, treat as orphan and return 1. +	 */  	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,  			dest->start, dest->end - dest->start);  	if (ret < 0) @@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  	}  out_cache: +	/* +	 * Store the result of the lookup in the name cache. +	 */  	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);  	if (!nce) {  		ret = -ENOMEM; @@ -1901,7 +1985,6 @@ out_cache:  	nce->name_len = fs_path_len(dest);  	nce->ret = ret;  	strcpy(nce->name, dest->start); -	memset(&nce->use_list, 0, sizeof(nce->use_list));  	if (ino < sctx->send_progress)  		nce->need_later_update = 0; @@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)  	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);  	btrfs_release_path(path); -	if (ret < 0) -		goto out; -  	if (parent_root) {  		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);  		if (ret < 0) @@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);  			btrfs_inode_mtime(ii));  	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,  			btrfs_inode_ctime(ii)); -	/* TODO otime? */ +	/* TODO Add otime support when the otime patches get into upstream */  	ret = send_cmd(sctx); @@ -2292,39 +2372,39 @@ out:   * a valid path yet because we did not process the refs yet. So, the inode   * is created as orphan.   
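The Returns-1 contract documented for __get_cur_name_and_parent above is what lets path reconstruction terminate. A condensed sketch of the walk get_cur_path performs with that helper, simplified from the real function (fs_path_add_path() prepends because the fs_path is built back to front):

	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
		fs_path_reset(name);
		ret = __get_cur_name_and_parent(sctx, ino, gen,
						&parent_ino, &parent_gen,
						name);
		if (ret < 0)
			goto out;
		if (ret)
			stop = 1;	/* orphan name: nothing valid above */

		ret = fs_path_add_path(dest, name);	/* prepend component */
		if (ret < 0)
			goto out;

		ino = parent_ino;
		gen = parent_gen;
	}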
*/ -static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, -			     struct btrfs_key *key) +static int send_create_inode(struct send_ctx *sctx, u64 ino)  {  	int ret = 0; -	struct extent_buffer *eb = path->nodes[0]; -	struct btrfs_inode_item *ii;  	struct fs_path *p; -	int slot = path->slots[0];  	int cmd; +	u64 gen;  	u64 mode; +	u64 rdev; -verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); +verbose_printk("btrfs: send_create_inode %llu\n", ino);  	p = fs_path_alloc(sctx);  	if (!p)  		return -ENOMEM; -	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); -	mode = btrfs_inode_mode(eb, ii); +	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, +			NULL, &rdev); +	if (ret < 0) +		goto out; -	if (S_ISREG(mode)) +	if (S_ISREG(mode)) {  		cmd = BTRFS_SEND_C_MKFILE; -	else if (S_ISDIR(mode)) +	} else if (S_ISDIR(mode)) {  		cmd = BTRFS_SEND_C_MKDIR; -	else if (S_ISLNK(mode)) +	} else if (S_ISLNK(mode)) {  		cmd = BTRFS_SEND_C_SYMLINK; -	else if (S_ISCHR(mode) || S_ISBLK(mode)) +	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {  		cmd = BTRFS_SEND_C_MKNOD; -	else if (S_ISFIFO(mode)) +	} else if (S_ISFIFO(mode)) {  		cmd = BTRFS_SEND_C_MKFIFO; -	else if (S_ISSOCK(mode)) +	} else if (S_ISSOCK(mode)) {  		cmd = BTRFS_SEND_C_MKSOCK; -	else { +	} else {  		printk(KERN_WARNING "btrfs: unexpected inode type %o",  				(int)(mode & S_IFMT));  		ret = -ENOTSUPP; @@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);  	if (ret < 0)  		goto out; -	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); +	ret = gen_unique_name(sctx, ino, gen, p);  	if (ret < 0)  		goto out;  	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); -	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); +	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);  	if (S_ISLNK(mode)) {  		fs_path_reset(p); -		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); +		ret = read_symlink(sctx, sctx->send_root, ino, p);  		if (ret < 0)  			goto out;  		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);  	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||  		   S_ISFIFO(mode) || S_ISSOCK(mode)) { -		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); +		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);  	}  	ret = send_cmd(sctx); @@ -2364,6 +2444,92 @@ out:  	return ret;  } +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. 
+ */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ +	int ret = 0; +	struct btrfs_path *path = NULL; +	struct btrfs_key key; +	struct btrfs_key found_key; +	struct btrfs_key di_key; +	struct extent_buffer *eb; +	struct btrfs_dir_item *di; +	int slot; + +	path = alloc_path_for_send(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	key.objectid = dir; +	key.type = BTRFS_DIR_INDEX_KEY; +	key.offset = 0; +	while (1) { +		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, +				1, 0); +		if (ret < 0) +			goto out; +		if (!ret) { +			eb = path->nodes[0]; +			slot = path->slots[0]; +			btrfs_item_key_to_cpu(eb, &found_key, slot); +		} +		if (ret || found_key.objectid != key.objectid || +		    found_key.type != key.type) { +			ret = 0; +			goto out; +		} + +		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); +		btrfs_dir_item_key_to_cpu(eb, di, &di_key); + +		if (di_key.objectid < sctx->send_progress) { +			ret = 1; +			goto out; +		} + +		key.offset = found_key.offset + 1; +		btrfs_release_path(path); +	} + +out: +	btrfs_free_path(path); +	return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + *    directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ +	int ret; + +	if (S_ISDIR(sctx->cur_inode_mode)) { +		ret = did_create_dir(sctx, sctx->cur_ino); +		if (ret < 0) +			goto out; +		if (ret) { +			ret = 0; +			goto out; +		} +	} + +	ret = send_create_inode(sctx, sctx->cur_ino); +	if (ret < 0) +		goto out; + +out: +	return ret; +} +  struct recorded_ref {  	struct list_head list;  	char *dir_path; @@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,  static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)  {  	struct recorded_ref *cur; -	struct recorded_ref *tmp; -	list_for_each_entry_safe(cur, tmp, head, list) { +	while (!list_empty(head)) { +		cur = list_entry(head->next, struct recorded_ref, list);  		fs_path_free(sctx, cur->full_path); +		list_del(&cur->list);  		kfree(cur);  	} -	INIT_LIST_HEAD(head);  }  static void free_recorded_refs(struct send_ctx *sctx) @@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)  }  /* - * Renames/moves a file/dir to it's orphan name. Used when the first + * Renames/moves a file/dir to its orphan name. Used when the first   * ref of an unprocessed inode gets overwritten and for all non empty   * directories.   */ @@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)  	struct btrfs_key loc;  	struct btrfs_dir_item *di; +	/* +	 * Don't try to rmdir the top/root subvolume dir. 
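To make the out-of-order directory handling concrete, an invented walk-through (all inums and names made up):

/*
 * Say dir inum 260 contains a file with inum 257. Inodes are processed
 * in ascending inum order, so 257 is reached while 260 does not exist
 * on the receiving side yet:
 *
 *   - while processing 257, its ref points into dir 260, which is in
 *     state inode_state_will_create, so send_create_inode(sctx, 260)
 *     runs early (see the new_refs loop in process_recorded_refs below);
 *   - when 260 itself is processed, send_create_inode_if_needed asks
 *     did_create_dir, which spots a dir item whose target inode 257 is
 *     below send_progress, and the second mkdir is suppressed.
 */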
+	 */ +	if (dir == BTRFS_FIRST_FREE_OBJECTID) +		return 0; +  	path = alloc_path_for_send();  	if (!path)  		return -ENOMEM; @@ -2513,160 +2685,6 @@ out:  	return ret;  } -struct finish_unordered_dir_ctx { -	struct send_ctx *sctx; -	struct fs_path *cur_path; -	struct fs_path *dir_path; -	u64 dir_ino; -	int need_delete; -	int delete_pass; -}; - -int __finish_unordered_dir(int num, struct btrfs_key *di_key, -			   const char *name, int name_len, -			   const char *data, int data_len, -			   u8 type, void *ctx) -{ -	int ret = 0; -	struct finish_unordered_dir_ctx *fctx = ctx; -	struct send_ctx *sctx = fctx->sctx; -	u64 di_gen; -	u64 di_mode; -	int is_orphan = 0; - -	if (di_key->objectid >= fctx->dir_ino) -		goto out; - -	fs_path_reset(fctx->cur_path); - -	ret = get_inode_info(sctx->send_root, di_key->objectid, -			NULL, &di_gen, &di_mode, NULL, NULL); -	if (ret < 0) -		goto out; - -	ret = is_first_ref(sctx, sctx->send_root, di_key->objectid, -			fctx->dir_ino, name, name_len); -	if (ret < 0) -		goto out; -	if (ret) { -		is_orphan = 1; -		ret = gen_unique_name(sctx, di_key->objectid, di_gen, -				fctx->cur_path); -	} else { -		ret = get_cur_path(sctx, di_key->objectid, di_gen, -				fctx->cur_path); -	} -	if (ret < 0) -		goto out; - -	ret = fs_path_add(fctx->dir_path, name, name_len); -	if (ret < 0) -		goto out; - -	if (!fctx->delete_pass) { -		if (S_ISDIR(di_mode)) { -			ret = send_rename(sctx, fctx->cur_path, -					fctx->dir_path); -		} else { -			ret = send_link(sctx, fctx->dir_path, -					fctx->cur_path); -			if (is_orphan) -				fctx->need_delete = 1; -		} -	} else if (!S_ISDIR(di_mode)) { -		ret = send_unlink(sctx, fctx->cur_path); -	} else { -		ret = 0; -	} - -	fs_path_remove(fctx->dir_path); - -out: -	return ret; -} - -/* - * Go through all dir items and see if we find refs which could not be created - * in the past because the dir did not exist at that time. - */ -static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen) -{ -	int ret = 0; -	struct btrfs_path *path = NULL; -	struct btrfs_key key; -	struct btrfs_key found_key; -	struct extent_buffer *eb; -	struct finish_unordered_dir_ctx fctx; -	int slot; - -	path = alloc_path_for_send(); -	if (!path) { -		ret = -ENOMEM; -		goto out; -	} - -	memset(&fctx, 0, sizeof(fctx)); -	fctx.sctx = sctx; -	fctx.cur_path = fs_path_alloc(sctx); -	fctx.dir_path = fs_path_alloc(sctx); -	if (!fctx.cur_path || !fctx.dir_path) { -		ret = -ENOMEM; -		goto out; -	} -	fctx.dir_ino = dir; - -	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path); -	if (ret < 0) -		goto out; - -	/* -	 * We do two passes. The first links in the new refs and the second -	 * deletes orphans if required. Deletion of orphans is not required for -	 * directory inodes, as we always have only one ref and use rename -	 * instead of link for those. 
-	 */ - -again: -	key.objectid = dir; -	key.type = BTRFS_DIR_ITEM_KEY; -	key.offset = 0; -	while (1) { -		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, -				1, 0); -		if (ret < 0) -			goto out; -		eb = path->nodes[0]; -		slot = path->slots[0]; -		btrfs_item_key_to_cpu(eb, &found_key, slot); - -		if (found_key.objectid != key.objectid || -		    found_key.type != key.type) { -			btrfs_release_path(path); -			break; -		} - -		ret = iterate_dir_item(sctx, sctx->send_root, path, -				&found_key, __finish_unordered_dir, -				&fctx); -		if (ret < 0) -			goto out; - -		key.offset = found_key.offset + 1; -		btrfs_release_path(path); -	} - -	if (!fctx.delete_pass && fctx.need_delete) { -		fctx.delete_pass = 1; -		goto again; -	} - -out: -	btrfs_free_path(path); -	fs_path_free(sctx, fctx.cur_path); -	fs_path_free(sctx, fctx.dir_path); -	return ret; -} -  /*   * This does all the move/link/unlink/rmdir magic.   */ @@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)  {  	int ret = 0;  	struct recorded_ref *cur; +	struct recorded_ref *cur2;  	struct ulist *check_dirs = NULL;  	struct ulist_iterator uit;  	struct ulist_node *un; @@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)  verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); +	/* +	 * This should never happen as the root dir always has the same ref +	 * which is always '..' +	 */ +	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); +  	valid_path = fs_path_alloc(sctx);  	if (!valid_path) {  		ret = -ENOMEM;  		goto out;  	} @@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  	list_for_each_entry(cur, &sctx->new_refs, list) {  		/* +		 * We may have refs where the parent directory does not exist +		 * yet. This happens if the parent directory's inum is higher +		 * than the current inum. To handle this case, we create the +		 * parent directory out of order. But we need to check if this +		 * did already happen before due to other refs in the same dir. +		 */ +		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); +		if (ret < 0) +			goto out; +		if (ret == inode_state_will_create) { +			ret = 0; +			/* +			 * First check if any of the current inodes refs did +			 * already create the dir. +			 */ +			list_for_each_entry(cur2, &sctx->new_refs, list) { +				if (cur == cur2) +					break; +				if (cur2->dir == cur->dir) { +					ret = 1; +					break; +				} +			} + +			/* +			 * If that did not happen, check if a previous inode +			 * did already create the dir. +			 */ +			if (!ret) +				ret = did_create_dir(sctx, cur->dir); +			if (ret < 0) +				goto out; +			if (!ret) { +				ret = send_create_inode(sctx, cur->dir); +				if (ret < 0) +					goto out; +			} +		} + +		/*  		 * Check if this new ref would overwrite the first ref of  		 * another unprocessed inode. If yes, orphanize the  		 * overwritten inode. If we find an overwritten ref that is @@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  		 * inode, move it and update valid_path. If not, link or move  		 * it depending on the inode mode.  		
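Condensed, the overwrite handling the comments above describe has the following shape. This is simplified: the real loop additionally distinguishes first refs, which get orphanized as shown, from later refs, which are simply unlinked, and the recorded_ref field names are assumed from context since the struct is partly cut off in this diff:

	ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
				 cur->name, cur->name_len,
				 &ow_inode, &ow_gen);
	if (ret < 0)
		goto out;
	if (ret) {
		/*
		 * The target name is held by a not-yet-processed inode:
		 * move that inode to its orphan name first so its data
		 * stays reachable until it is processed.
		 */
		ret = orphanize_inode(sctx, ow_inode, ow_gen,
				      cur->full_path);
		if (ret < 0)
			goto out;
	}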
*/ -		if (is_orphan && !sctx->cur_inode_first_ref_orphan) { +		if (is_orphan) {  			ret = send_rename(sctx, valid_path, cur->full_path);  			if (ret < 0)  				goto out; @@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  			if (ret < 0)  				goto out;  		} +	} else if (S_ISDIR(sctx->cur_inode_mode) && +		   !list_empty(&sctx->deleted_refs)) { +		/* +		 * We have a moved dir. Add the old parent to check_dirs +		 */ +		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, +				list); +		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, +				GFP_NOFS); +		if (ret < 0) +			goto out;  	} else if (!S_ISDIR(sctx->cur_inode_mode)) {  		/*  		 * We have a non dir inode. Go through all deleted refs and @@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  			if (ret < 0)  				goto out;  			if (!ret) { -				/* -				 * In case the inode was moved to a directory -				 * that was not created yet (see -				 * __record_new_ref), we can not unlink the ref -				 * as it will be needed later when the parent -				 * directory is created, so that we can move in -				 * the inode to the new dir. -				 */ -				if (!is_orphan && -				    sctx->cur_inode_first_ref_orphan) { -					ret = orphanize_inode(sctx, -							sctx->cur_ino, -							sctx->cur_inode_gen, -							cur->full_path); -					if (ret < 0) -						goto out; -					ret = gen_unique_name(sctx, -							sctx->cur_ino, -							sctx->cur_inode_gen, -							valid_path); -					if (ret < 0) -						goto out; -					is_orphan = 1; - -				} else { -					ret = send_unlink(sctx, cur->full_path); -					if (ret < 0) -						goto out; -				} +				ret = send_unlink(sctx, cur->full_path); +				if (ret < 0) +					goto out;  			}  			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,  					GFP_NOFS); @@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  		 * If the inode is still orphan, unlink the orphan. This may  		 * happen when a previous inode did overwrite the first ref  		 * of this inode and no new refs were added for the current -		 * inode. -		 * We can however not delete the orphan in case the inode relies -		 * in a directory that was not created yet (see -		 * __record_new_ref) +		 * inode. Unlinking does not mean that the inode is deleted in +		 * all cases. There may still be links to this inode in other +		 * places.  		 */ -		if (is_orphan && !sctx->cur_inode_first_ref_orphan) { +		if (is_orphan) {  			ret = send_unlink(sctx, valid_path);  			if (ret < 0)  				goto out; @@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  	 */  	ULIST_ITER_INIT(&uit);  	while ((un = ulist_next(check_dirs, &uit))) { +		/* +		 * In case we had refs into dirs that were not processed yet, +		 * we don't need to do the utime and rmdir logic for these dirs. +		 * The dir will be processed later. +		 */  		if (un->val > sctx->cur_ino)  			continue; @@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  		}  	} -	/* -	 * Current inode is now at it's new position, so we must increase -	 * send_progress -	 */ -	sctx->send_progress = sctx->cur_ino + 1; - -	/* -	 * We may have a directory here that has pending refs which could not -	 * be created before (because the dir did not exist before, see -	 * __record_new_ref). finish_outoforder_dir will link/move the pending -	 * refs. 
-	 */ -	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) { -		ret = finish_outoforder_dir(sctx, sctx->cur_ino, -				sctx->cur_inode_gen); -		if (ret < 0) -			goto out; -	} -  	ret = 0;  out: @@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,  		return -ENOMEM;  	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, -			NULL); -	if (ret < 0) -		goto out; - -	/* -	 * The parent may be non-existent at this point in time. This happens -	 * if the ino of the parent dir is higher then the current ino. In this -	 * case, we can not process this ref until the parent dir is finally -	 * created. If we reach the parent dir later, process_recorded_refs -	 * will go through all dir items and process the refs that could not be -	 * processed before. In case this is the first ref, we set -	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to -	 * keep an orphan of the inode so that it later can be used for -	 * link/move -	 */ -	ret = is_inode_existent(sctx, dir, gen); +			NULL, NULL);  	if (ret < 0)  		goto out; -	if (!ret) { -		ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir, -				name->start, fs_path_len(name)); -		if (ret < 0) -			goto out; -		if (ret) -			sctx->cur_inode_first_ref_orphan = 1; -		ret = 0; -		goto out; -	}  	ret = get_cur_path(sctx, dir, gen, p);  	if (ret < 0) @@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,  		return -ENOMEM;  	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, -			NULL); +			NULL, NULL);  	if (ret < 0)  		goto out; @@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,  	key.offset = 0;  	while (1) {  		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); -		if (ret < 0) { -			btrfs_release_path(path); +		if (ret < 0)  			goto out; -		} -		if (ret) { -			btrfs_release_path(path); +		if (ret)  			break; -		}  		eb = path->nodes[0];  		slot = path->slots[0];  		btrfs_item_key_to_cpu(eb, &found_key, slot);  		if (found_key.objectid != key.objectid || -		    found_key.type != key.type) { -			btrfs_release_path(path); +		    found_key.type != key.type)  			break; -		} -		ret = iterate_inode_ref(sctx, sctx->parent_root, path, -				&found_key, 0, cb, sctx); +		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, +				sctx);  		btrfs_release_path(path);  		if (ret < 0)  			goto out;  		key.offset = found_key.offset + 1;  	} +	btrfs_release_path(path);  	ret = process_recorded_refs(sctx); @@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)  	int ret = 0;  	struct fs_path *p;  	loff_t pos = offset; -	int readed = 0; +	int num_read = 0;  	mm_segment_t old_fs;  	p = fs_path_alloc(sctx); @@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);  	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);  	if (ret < 0)  		goto out; -	readed = ret; -	if (!readed) +	num_read = ret; +	if (!num_read)  		goto out;  	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); @@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);  	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); -	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); +	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);  	ret = send_cmd(sctx); @@ -3604,7 +3609,7 @@ out:  	set_fs(old_fs);  	if (ret < 0)  		return ret; -	return readed; +	return num_read;  }  /* @@ -3615,7 +3620,6 @@ static int 
send_clone(struct send_ctx *sctx,  		      struct clone_root *clone_root)  {  	int ret = 0; -	struct btrfs_root *clone_root2 = clone_root->root;  	struct fs_path *p;  	u64 gen; @@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);  	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); -	if (clone_root2 == sctx->send_root) { +	if (clone_root->root == sctx->send_root) {  		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, -				&gen, NULL, NULL, NULL); +				&gen, NULL, NULL, NULL, NULL);  		if (ret < 0)  			goto out;  		ret = get_cur_path(sctx, clone_root->ino, gen, p);  	} else { -		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); +		ret = get_inode_path(sctx, clone_root->root, +				clone_root->ino, p);  	}  	if (ret < 0)  		goto out;  	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, -			clone_root2->root_item.uuid); +			clone_root->root->root_item.uuid);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, -			clone_root2->root_item.ctransid); +			clone_root->root->root_item.ctransid);  	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,  			clone_root->offset); @@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,  	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],  			struct btrfs_file_extent_item);  	type = btrfs_file_extent_type(path->nodes[0], ei); -	if (type == BTRFS_FILE_EXTENT_INLINE) +	if (type == BTRFS_FILE_EXTENT_INLINE) {  		len = btrfs_file_extent_inline_len(path->nodes[0], ei); -	else +		/* +		 * it is possible the inline item won't cover the whole page, +		 * but there may be items after this page.  Make +		 * sure to send the whole thing +		 */ +		len = PAGE_CACHE_ALIGN(len); +	} else {  		len = btrfs_file_extent_num_bytes(path->nodes[0], ei); +	}  	if (offset + len > sctx->cur_inode_size)  		len = sctx->cur_inode_size - offset; @@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,  	u64 left_offset_fixed;  	u64 left_len;  	u64 right_len; +	u64 left_gen; +	u64 right_gen;  	u8 left_type;  	u8 right_type; @@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,  	eb = left_path->nodes[0];  	slot = left_path->slots[0]; -  	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);  	left_type = btrfs_file_extent_type(eb, ei); -	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); -	left_len = btrfs_file_extent_num_bytes(eb, ei); -	left_offset = btrfs_file_extent_offset(eb, ei);  	if (left_type != BTRFS_FILE_EXTENT_REG) {  		ret = 0;  		goto out;  	} +	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); +	left_len = btrfs_file_extent_num_bytes(eb, ei); +	left_offset = btrfs_file_extent_offset(eb, ei); +	left_gen = btrfs_file_extent_generation(eb, ei);  	/*  	 * Following comments will refer to these graphics. L is the left @@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,  		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);  		right_len = btrfs_file_extent_num_bytes(eb, ei);  		right_offset = btrfs_file_extent_offset(eb, ei); +		right_gen = btrfs_file_extent_generation(eb, ei);  		if (right_type != BTRFS_FILE_EXTENT_REG) {  			ret = 0; @@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,  		 * Are we at extent 8? If yes, we know the extent is changed.  		 * This may only happen on the first iteration.  		 
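The comparison rework just below closes a false-positive hole: summing disknr and offset can make two different extents look identical. An invented counterexample, plus the reason the generation is now checked as well:

/*
 * left:  disknr = 8192, offset_fixed = 4096  ->  8192 + 4096 = 12288
 * right: disknr = 4096, offset       = 8192  ->  4096 + 8192 = 12288
 *
 * The old sum test treats these as the same extent although the data
 * lives at two different disk locations. Comparing disknr and offset
 * individually rules that out, and comparing the generation too
 * catches an extent rewritten at the same bytenr, e.g. under
 * nodatacow.
 */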
*/ -		if (found_key.offset + right_len < ekey->offset) { +		if (found_key.offset + right_len <= ekey->offset) {  			ret = 0;  			goto out;  		} @@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,  		/*  		 * Check if we have the same extent.  		 */ -		if (left_disknr + left_offset_fixed != -				right_disknr + right_offset) { +		if (left_disknr != right_disknr || +		    left_offset_fixed != right_offset || +		    left_gen != right_gen) {  			ret = 0;  			goto out;  		} @@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)  		goto out;  	ret = process_recorded_refs(sctx); +	if (ret < 0) +		goto out; + +	/* +	 * We have processed the refs and thus need to advance send_progress. +	 * Now, calls to get_cur_xxx will take the updated refs of the current +	 * inode into account. +	 */ +	sctx->send_progress = sctx->cur_ino + 1;  out:  	return ret; @@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)  		goto out;  	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, -			&left_mode, &left_uid, &left_gid); +			&left_mode, &left_uid, &left_gid, NULL);  	if (ret < 0)  		goto out; @@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)  		} else {  			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,  					NULL, NULL, &right_mode, &right_uid, -					&right_gid); +					&right_gid, NULL);  			if (ret < 0)  				goto out; @@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,  	sctx->cur_ino = key->objectid;  	sctx->cur_inode_new_gen = 0; -	sctx->cur_inode_first_ref_orphan = 0; + +	/* +	 * Set send_progress to current inode. This will tell all get_cur_xxx +	 * functions that the current inode's refs are not updated yet. Later, +	 * when process_recorded_refs is finished, it is set to cur_ino + 1. +	 */  	sctx->send_progress = sctx->cur_ino;  	if (result == BTRFS_COMPARE_TREE_NEW || @@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,  		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],  				right_ii); -		if (left_gen != right_gen) + +		/* +		 * The cur_ino = root dir case is special here. We can't treat +		 * the inode as deleted+reused because it would generate a +		 * stream that tries to delete/mkdir the root dir. +		 */ +		if (left_gen != right_gen && +		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)  			sctx->cur_inode_new_gen = 1;  	} @@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,  		sctx->cur_inode_mode = btrfs_inode_mode(  				sctx->left_path->nodes[0], left_ii);  		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) -			ret = send_create_inode(sctx, sctx->left_path, -					sctx->cmp_key); +			ret = send_create_inode_if_needed(sctx);  	} else if (result == BTRFS_COMPARE_TREE_DELETED) {  		sctx->cur_inode_gen = right_gen;  		sctx->cur_inode_new = 0; @@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,  		sctx->cur_inode_mode = btrfs_inode_mode(  				sctx->right_path->nodes[0], right_ii);  	} else if (result == BTRFS_COMPARE_TREE_CHANGED) { +		/* +		 * We need to do some special handling in case the inode was +		 * reported as changed with a changed generation number. This +		 * means that the original inode was deleted and new inode +		 * reused the same inum. So we have to treat the old inode as +		 * deleted and the new one as new. +		 */  		if (sctx->cur_inode_new_gen) { +			/* +			 * First, process the inode as if it was deleted. 
+			 */  			sctx->cur_inode_gen = right_gen;  			sctx->cur_inode_new = 0;  			sctx->cur_inode_deleted = 1; @@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,  			if (ret < 0)  				goto out; +			/* +			 * Now process the inode as if it was new. +			 */  			sctx->cur_inode_gen = left_gen;  			sctx->cur_inode_new = 1;  			sctx->cur_inode_deleted = 0; @@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,  					sctx->left_path->nodes[0], left_ii);  			sctx->cur_inode_mode = btrfs_inode_mode(  					sctx->left_path->nodes[0], left_ii); -			ret = send_create_inode(sctx, sctx->left_path, -					sctx->cmp_key); +			ret = send_create_inode_if_needed(sctx);  			if (ret < 0)  				goto out;  			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);  			if (ret < 0)  				goto out; +			/* +			 * Advance send_progress now as we did not get into +			 * process_recorded_refs_if_needed in the new_gen case. +			 */ +			sctx->send_progress = sctx->cur_ino + 1; + +			/* +			 * Now process all extents and xattrs of the inode as if +			 * they were all new. +			 */  			ret = process_all_extents(sctx);  			if (ret < 0)  				goto out; @@ -4172,6 +4230,16 @@ out:  	return ret;  } +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */  static int changed_ref(struct send_ctx *sctx,  		       enum btrfs_compare_tree_result result)  { @@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,  	return ret;  } +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */  static int changed_xattr(struct send_ctx *sctx,  			 enum btrfs_compare_tree_result result)  { @@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,  	return ret;  } +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. The reason is the same as in changed_ref + */  static int changed_extent(struct send_ctx *sctx,  			  enum btrfs_compare_tree_result result)  { @@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,  	return ret;  } - +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. 
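The ordering problem described in the changed_ref comment above is solved by recording first and replaying later. The unchanged function bodies are not part of this diff, but going by the comments they presumably reduce to the following shape (the record_* helper names are assumed, only the __record_* callbacks appear in this patch):

	/*
	 * Inside changed_ref: only record; process_recorded_refs later
	 * replays new refs before deleted ones.
	 */
	if (!sctx->cur_inode_new_gen) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = record_new_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = record_deleted_ref(sctx);
		else
			ret = record_changed_ref(sctx);
	}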
+ */  static int changed_cb(struct btrfs_root *left_root,  		      struct btrfs_root *right_root,  		      struct btrfs_path *left_path, @@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,  	if (ret < 0)  		goto out; +	/* Ignore non-FS objects */ +	if (key->objectid == BTRFS_FREE_INO_OBJECTID || +	    key->objectid == BTRFS_FREE_SPACE_OBJECTID) +		goto out; +  	if (key->type == BTRFS_INODE_ITEM_KEY)  		ret = changed_inode(sctx, result);  	else if (key->type == BTRFS_INODE_REF_KEY) @@ -4299,7 +4385,8 @@ join_trans:  	}  	/* -	 * Make sure the tree has not changed +	 * Make sure the tree has not changed after re-joining. We detect this +	 * by comparing start_ctransid and ctransid. They should always match.  	 */  	spin_lock(&send_root->root_times_lock);  	ctransid = btrfs_root_ctransid(&send_root->root_item); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9934e948e57..1bf4f32fd4e 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,4 +130,5 @@ enum {  #ifdef __KERNEL__  long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);  #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 83d6f9f9c22..915ac14c206 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,  			       struct btrfs_root *root, const char *function,  			       unsigned int line, int errno)  { -	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); +	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");  	trans->aborted = errno;  	/* Nothing used. The other threads that have joined this  	 * transaction may be able to continue. */  	if (!trans->blocks_used) { -		btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); +		char nbuf[16]; +		const char *errstr; + +		errstr = btrfs_decode_error(root->fs_info, errno, nbuf); +		btrfs_printk(root->fs_info, +			     "%s:%d: Aborting unused transaction(%s).\n", +			     function, line, errstr);  		return;  	}  	trans->transaction->aborted = errno; @@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			btrfs_set_opt(info->mount_opt, NODATASUM);  			break;  		case Opt_nodatacow: -			printk(KERN_INFO "btrfs: setting nodatacow\n"); +			if (!btrfs_test_opt(root, COMPRESS) || +				!btrfs_test_opt(root, FORCE_COMPRESS)) { +					printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); +			} else { +				printk(KERN_INFO "btrfs: setting nodatacow\n"); +			} +			info->compress_type = BTRFS_COMPRESS_NONE; +			btrfs_clear_opt(info->mount_opt, COMPRESS); +			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);  			btrfs_set_opt(info->mount_opt, NODATACOW);  			btrfs_set_opt(info->mount_opt, NODATASUM);  			break; @@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  				compress_type = "zlib";  				info->compress_type = BTRFS_COMPRESS_ZLIB;  				btrfs_set_opt(info->mount_opt, COMPRESS); +				btrfs_clear_opt(info->mount_opt, NODATACOW); +				btrfs_clear_opt(info->mount_opt, NODATASUM);  			} else if (strcmp(args[0].from, "lzo") == 0) {  				compress_type = "lzo";  				info->compress_type = BTRFS_COMPRESS_LZO;  				btrfs_set_opt(info->mount_opt, COMPRESS); +				btrfs_clear_opt(info->mount_opt, NODATACOW); +				btrfs_clear_opt(info->mount_opt, NODATASUM);  				btrfs_set_fs_incompat(info, COMPRESS_LZO);  			} else if (strncmp(args[0].from, "no", 2) == 0) {  				compress_type = "no"; @@ -543,11 +561,11 @@ int 
btrfs_parse_options(struct btrfs_root *root, char *options)  			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);  			break;  		case Opt_defrag: -			printk(KERN_INFO "btrfs: enabling auto defrag"); +			printk(KERN_INFO "btrfs: enabling auto defrag\n");  			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);  			break;  		case Opt_recovery: -			printk(KERN_INFO "btrfs: enabling auto recovery"); +			printk(KERN_INFO "btrfs: enabling auto recovery\n");  			btrfs_set_opt(info->mount_opt, RECOVERY);  			break;  		case Opt_skip_balance: @@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)  		return 0;  	} -	btrfs_wait_ordered_extents(root, 0, 0); - -	spin_lock(&fs_info->trans_lock); -	if (!fs_info->running_transaction) { -		spin_unlock(&fs_info->trans_lock); -		return 0; -	} -	spin_unlock(&fs_info->trans_lock); +	btrfs_wait_ordered_extents(root, 0); -	trans = btrfs_join_transaction(root); -	if (IS_ERR(trans)) +	trans = btrfs_attach_transaction(root); +	if (IS_ERR(trans)) { +		/* no transaction, don't bother */ +		if (PTR_ERR(trans) == -ENOENT) +			return 0;  		return PTR_ERR(trans); +	}  	return btrfs_commit_transaction(trans, root);  } @@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,  static int btrfs_freeze(struct super_block *sb)  { -	struct btrfs_fs_info *fs_info = btrfs_sb(sb); -	mutex_lock(&fs_info->transaction_kthread_mutex); -	mutex_lock(&fs_info->cleaner_mutex); -	return 0; +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = btrfs_sb(sb)->tree_root; + +	trans = btrfs_attach_transaction(root); +	if (IS_ERR(trans)) { +		/* no transaction, don't bother */ +		if (PTR_ERR(trans) == -ENOENT) +			return 0; +		return PTR_ERR(trans); +	} +	return btrfs_commit_transaction(trans, root);  }  static int btrfs_unfreeze(struct super_block *sb)  { -	struct btrfs_fs_info *fs_info = btrfs_sb(sb); -	mutex_unlock(&fs_info->cleaner_mutex); -	mutex_unlock(&fs_info->transaction_kthread_mutex);  	return 0;  } @@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)  static void btrfs_interface_exit(void)  {  	if (misc_deregister(&btrfs_misc) < 0) -		printk(KERN_INFO "misc_deregister failed for control device"); +		printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");  }  static int __init init_btrfs_fs(void) @@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)  	if (err)  		goto free_extent_io; -	err = btrfs_delayed_inode_init(); +	err = ordered_data_init();  	if (err)  		goto free_extent_map; +	err = btrfs_delayed_inode_init(); +	if (err) +		goto free_ordered_data; +  	err = btrfs_interface_init();  	if (err)  		goto free_delayed_inode; @@ -1641,6 +1664,8 @@ unregister_ioctl:  	btrfs_interface_exit();  free_delayed_inode:  	btrfs_delayed_inode_exit(); +free_ordered_data: +	ordered_data_exit();  free_extent_map:  	extent_map_exit();  free_extent_io: @@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)  {  	btrfs_destroy_cachep();  	btrfs_delayed_inode_exit(); +	ordered_data_exit();  	extent_map_exit();  	extent_io_exit();  	btrfs_interface_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c26004e05..77db875b511 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)  /*   * either allocate a new transaction or hop into the existing one   */ -static noinline int join_transaction(struct btrfs_root *root, int nofail) +static noinline int join_transaction(struct btrfs_root *root, int type)  {  	
struct btrfs_transaction *cur_trans;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -67,7 +67,13 @@ loop:  	}  	if (fs_info->trans_no_join) { -		if (!nofail) { +		/*  +		 * If we are JOIN_NOLOCK we're already committing a current +		 * transaction, we just need a handle to deal with something +		 * when committing the transaction, such as inode cache and +		 * space cache. It is a special case. +		 */ +		if (type != TRANS_JOIN_NOLOCK) {  			spin_unlock(&fs_info->trans_lock);  			return -EBUSY;  		} @@ -87,6 +93,13 @@ loop:  	}  	spin_unlock(&fs_info->trans_lock); +	/* +	 * If we are ATTACH, we just want to catch the current transaction, +	 * and commit it. If there is no transaction, just return ENOENT. +	 */ +	if (type == TRANS_ATTACH) +		return -ENOENT; +  	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);  	if (!cur_trans)  		return -ENOMEM; @@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)  	}  } -enum btrfs_trans_type { -	TRANS_START, -	TRANS_JOIN, -	TRANS_USERSPACE, -	TRANS_JOIN_NOLOCK, -}; -  static int may_wait_transaction(struct btrfs_root *root, int type)  {  	if (root->fs_info->log_root_recovering) @@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)  }  static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, -						    u64 num_items, int type) +						    u64 num_items, int type, +						    int noflush)  {  	struct btrfs_trans_handle *h;  	struct btrfs_transaction *cur_trans; @@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  		}  		num_bytes = btrfs_calc_trans_metadata_size(root, num_items); -		ret = btrfs_block_rsv_add(root, -					  &root->fs_info->trans_block_rsv, -					  num_bytes); +		if (noflush) +			ret = btrfs_block_rsv_add_noflush(root, +						&root->fs_info->trans_block_rsv, +						num_bytes); +		else +			ret = btrfs_block_rsv_add(root, +						&root->fs_info->trans_block_rsv, +						num_bytes);  		if (ret)  			return ERR_PTR(ret);  	} @@ -335,19 +347,34 @@ again:  	if (!h)  		return ERR_PTR(-ENOMEM); -	sb_start_intwrite(root->fs_info->sb); +	/* +	 * If we are JOIN_NOLOCK we're already committing a transaction and +	 * waiting on this guy, so we don't need to do the sb_start_intwrite +	 * because we're already holding a ref.  We need this because we could +	 * have raced in and did an fsync() on a file which can kick a commit +	 * and then we deadlock with somebody doing a freeze. +	 * +	 * If we are ATTACH, it means we just want to catch the current +	 * transaction and commit it, so we needn't do sb_start_intwrite().  +	 */ +	if (type < TRANS_JOIN_NOLOCK) +		sb_start_intwrite(root->fs_info->sb);  	if (may_wait_transaction(root, type))  		wait_current_trans(root);  	do { -		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); +		ret = join_transaction(root, type);  		if (ret == -EBUSY)  			wait_current_trans(root);  	} while (ret == -EBUSY);  	if (ret < 0) { -		sb_end_intwrite(root->fs_info->sb); +		/* We must get the transaction if we are JOIN_NOLOCK. 
*/ +		BUG_ON(type == TRANS_JOIN_NOLOCK); + +		if (type < TRANS_JOIN_NOLOCK) +			sb_end_intwrite(root->fs_info->sb);  		kmem_cache_free(btrfs_trans_handle_cachep, h);  		return ERR_PTR(ret);  	} @@ -367,7 +394,9 @@ again:  	h->aborted = 0;  	h->qgroup_reserved = qgroup_reserved;  	h->delayed_ref_elem.seq = 0; +	h->type = type;  	INIT_LIST_HEAD(&h->qgroup_ref_list); +	INIT_LIST_HEAD(&h->new_bgs);  	smp_mb();  	if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -393,21 +422,33 @@ got_it:  struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,  						   int num_items)  { -	return start_transaction(root, num_items, TRANS_START); +	return start_transaction(root, num_items, TRANS_START, 0); +} + +struct btrfs_trans_handle *btrfs_start_transaction_noflush( +					struct btrfs_root *root, int num_items) +{ +	return start_transaction(root, num_items, TRANS_START, 1);  } +  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)  { -	return start_transaction(root, 0, TRANS_JOIN); +	return start_transaction(root, 0, TRANS_JOIN, 0);  }  struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)  { -	return start_transaction(root, 0, TRANS_JOIN_NOLOCK); +	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);  }  struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)  { -	return start_transaction(root, 0, TRANS_USERSPACE); +	return start_transaction(root, 0, TRANS_USERSPACE, 0); +} + +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) +{ +	return start_transaction(root, 0, TRANS_ATTACH, 0);  }  /* wait for a transaction commit to be fully complete */ @@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,  }  static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, -			  struct btrfs_root *root, int throttle, int lock) +			  struct btrfs_root *root, int throttle)  {  	struct btrfs_transaction *cur_trans = trans->transaction;  	struct btrfs_fs_info *info = root->fs_info;  	int count = 0; +	int lock = (trans->type != TRANS_JOIN_NOLOCK);  	int err = 0;  	if (--trans->use_count) { @@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		trans->qgroup_reserved = 0;  	} +	if (!list_empty(&trans->new_bgs)) +		btrfs_create_pending_block_groups(trans, root); +  	while (count < 2) {  		unsigned long cur = trans->delayed_ref_updates;  		trans->delayed_ref_updates = 0; @@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; -	sb_end_intwrite(root->fs_info->sb); +	if (!list_empty(&trans->new_bgs)) +		btrfs_create_pending_block_groups(trans, root);  	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&  	    should_end_transaction(trans, root)) { @@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		}  	} +	if (trans->type < TRANS_JOIN_NOLOCK) +		sb_end_intwrite(root->fs_info->sb); +  	WARN_ON(cur_trans != info->running_transaction);  	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);  	atomic_dec(&cur_trans->num_writers); @@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,  {  	int ret; -	ret = __btrfs_end_transaction(trans, root, 0, 1); +	ret = __btrfs_end_transaction(trans, root, 0);  	if (ret)  		return ret;  	return 0; @@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,  {  	int ret; -	ret 
= __btrfs_end_transaction(trans, root, 1, 1); -	if (ret) -		return ret; -	return 0; -} - -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, -				 struct btrfs_root *root) -{ -	int ret; - -	ret = __btrfs_end_transaction(trans, root, 0, 0); +	ret = __btrfs_end_transaction(trans, root, 1);  	if (ret)  		return ret;  	return 0; @@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,  int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,  				struct btrfs_root *root)  { -	return __btrfs_end_transaction(trans, root, 1, 1); +	return __btrfs_end_transaction(trans, root, 1);  }  /* @@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,  	int err = 0;  	int werr = 0;  	struct address_space *mapping = root->fs_info->btree_inode->i_mapping; +	struct extent_state *cached_state = NULL;  	u64 start = 0;  	u64 end;  	while (!find_first_extent_bit(dirty_pages, start, &start, &end, -				      mark)) { -		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, -				   GFP_NOFS); +				      mark, &cached_state)) { +		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, +				   mark, &cached_state, GFP_NOFS); +		cached_state = NULL;  		err = filemap_fdatawrite_range(mapping, start, end);  		if (err)  			werr = err; @@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,  	int err = 0;  	int werr = 0;  	struct address_space *mapping = root->fs_info->btree_inode->i_mapping; +	struct extent_state *cached_state = NULL;  	u64 start = 0;  	u64 end;  	while (!find_first_extent_bit(dirty_pages, start, &start, &end, -				      EXTENT_NEED_WAIT)) { -		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); +				      EXTENT_NEED_WAIT, &cached_state)) { +		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, +				 0, 0, &cached_state, GFP_NOFS);  		err = filemap_fdatawait_range(mapping, start, end);  		if (err)  			werr = err; @@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	struct btrfs_root *parent_root;  	struct btrfs_block_rsv *rsv;  	struct inode *parent_inode; +	struct btrfs_path *path; +	struct btrfs_dir_item *dir_item;  	struct dentry *parent;  	struct dentry *dentry;  	struct extent_buffer *tmp; @@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	u64 root_flags;  	uuid_le new_uuid; -	rsv = trans->block_rsv; +	path = btrfs_alloc_path(); +	if (!path) { +		ret = pending->error = -ENOMEM; +		goto path_alloc_fail; +	}  	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);  	if (!new_root_item) {  		ret = pending->error = -ENOMEM; -		goto fail; +		goto root_item_alloc_fail;  	}  	ret = btrfs_find_free_objectid(tree_root, &objectid);  	if (ret) {  		pending->error = ret; -		goto fail; +		goto no_free_objectid;  	}  	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); @@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  						  to_reserve);  		if (ret) {  			pending->error = ret; -			goto fail; +			goto no_free_objectid;  		}  	}  	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,  				   objectid, pending->inherit); -	kfree(pending->inherit);  	if (ret) {  		pending->error = ret; -		goto fail; +		goto no_free_objectid;  	}  	key.objectid = objectid;  	key.offset = (u64)-1;  	key.type = BTRFS_ROOT_ITEM_KEY; +	rsv = trans->block_rsv;  	trans->block_rsv = &pending->block_rsv;  	dentry = 
pending->dentry; @@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	 */  	ret = btrfs_set_inode_index(parent_inode, &index);  	BUG_ON(ret); /* -ENOMEM */ -	ret = btrfs_insert_dir_item(trans, parent_root, -				dentry->d_name.name, dentry->d_name.len, -				parent_inode, &key, -				BTRFS_FT_DIR, index); -	if (ret == -EEXIST) { + +	/* check if there is a file/dir which has the same name. */ +	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, +					 btrfs_ino(parent_inode), +					 dentry->d_name.name, +					 dentry->d_name.len, 0); +	if (dir_item != NULL && !IS_ERR(dir_item)) {  		pending->error = -EEXIST; -		dput(parent);  		goto fail; -	} else if (ret) { -		goto abort_trans_dput; +	} else if (IS_ERR(dir_item)) { +		ret = PTR_ERR(dir_item); +		btrfs_abort_transaction(trans, root, ret); +		goto fail;  	} - -	btrfs_i_size_write(parent_inode, parent_inode->i_size + -					 dentry->d_name.len * 2); -	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; -	ret = btrfs_update_inode(trans, parent_root, parent_inode); -	if (ret) -		goto abort_trans_dput; +	btrfs_release_path(path);  	/*  	 * pull in the delayed directory update @@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	 * snapshot  	 */  	ret = btrfs_run_delayed_items(trans, root); -	if (ret) { /* Transaction aborted */ -		dput(parent); +	if (ret) {	/* Transaction aborted */ +		btrfs_abort_transaction(trans, root, ret);  		goto fail;  	} @@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	if (ret) {  		btrfs_tree_unlock(old);  		free_extent_buffer(old); -		goto abort_trans_dput; +		btrfs_abort_transaction(trans, root, ret); +		goto fail;  	}  	btrfs_set_lock_blocking(old); @@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	/* clean up in any case */  	btrfs_tree_unlock(old);  	free_extent_buffer(old); -	if (ret) -		goto abort_trans_dput; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	}  	/* see comments in should_cow_block() */  	root->force_cow = 1; @@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);  	btrfs_tree_unlock(tmp);  	free_extent_buffer(tmp); -	if (ret) -		goto abort_trans_dput; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	}  	/*  	 * insert root back/forward references @@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  				 parent_root->root_key.objectid,  				 btrfs_ino(parent_inode), index,  				 dentry->d_name.name, dentry->d_name.len); -	dput(parent); -	if (ret) +	if (ret) { +		btrfs_abort_transaction(trans, root, ret);  		goto fail; +	}  	key.offset = (u64)-1;  	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);  	if (IS_ERR(pending->snap)) {  		ret = PTR_ERR(pending->snap); -		goto abort_trans; +		btrfs_abort_transaction(trans, root, ret); +		goto fail;  	}  	ret = btrfs_reloc_post_snapshot(trans, pending); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	} + +	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	} + +	ret = btrfs_insert_dir_item(trans, parent_root, +				    dentry->d_name.name, dentry->d_name.len, +				    parent_inode, &key, +				    BTRFS_FT_DIR, 
index); +	/* We have checked the name at the beginning, so getting -EEXIST here is impossible. */ +	BUG_ON(ret == -EEXIST); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	} + +	btrfs_i_size_write(parent_inode, parent_inode->i_size + +					 dentry->d_name.len * 2); +	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +	ret = btrfs_update_inode(trans, parent_root, parent_inode);  	if (ret) -		goto abort_trans; -	ret = 0; +		btrfs_abort_transaction(trans, root, ret);  fail: -	kfree(new_root_item); +	dput(parent);  	trans->block_rsv = rsv; +no_free_objectid: +	kfree(new_root_item); +root_item_alloc_fail: +	btrfs_free_path(path); +path_alloc_fail: +	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);  	return ret; - -abort_trans_dput: -	dput(parent); -abort_trans: -	btrfs_abort_transaction(trans, root, ret); -	goto fail;  }  /* @@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)  	struct btrfs_async_commit *ac =  		container_of(work, struct btrfs_async_commit, work.work); +	/* +	 * We've got freeze protection passed with the transaction. +	 * Tell lockdep about it. +	 */ +	rwsem_acquire_read( +		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], +		0, 1, _THIS_IP_); + +	current->journal_info = ac->newtrans; +  	btrfs_commit_transaction(ac->newtrans, ac->root);  	kfree(ac);  } @@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,  	atomic_inc(&cur_trans->use_count);  	btrfs_end_transaction(trans, root); + +	/* +	 * Tell lockdep we've released the freeze rwsem, since the +	 * async commit thread will be the one to unlock it. +	 */ +	rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], +		      1, _THIS_IP_); +  	schedule_delayed_work(&ac->work, 0);  	/* wait for transaction to start and unblock */ @@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	 */  	cur_trans->delayed_refs.flushing = 1; +	if (!list_empty(&trans->new_bgs)) +		btrfs_create_pending_block_groups(trans, root); +  	ret = btrfs_run_delayed_refs(trans, root, 0);  	if (ret)  		goto cleanup_transaction; @@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  		if (flush_on_commit || snap_pending) {  			btrfs_start_delalloc_inodes(root, 1); -			btrfs_wait_ordered_extents(root, 0, 1); +			btrfs_wait_ordered_extents(root, 1);  		}  		ret = btrfs_run_delayed_items(trans, root); @@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	 */  	mutex_lock(&root->fs_info->reloc_mutex); -	ret = btrfs_run_delayed_items(trans, root); +	/* +	 * We needn't worry about the delayed items because we will +	 * deal with them in create_pending_snapshot(), which is the +	 * core function of the snapshot creation. +	 */ +	ret = create_pending_snapshots(trans, root->fs_info);  	if (ret) {  		mutex_unlock(&root->fs_info->reloc_mutex);  		goto cleanup_transaction;  	} -	ret = create_pending_snapshots(trans, root->fs_info); +	/* +	 * We insert the dir indexes of the snapshots and update the inode +	 * of the snapshots' parents after the snapshot creation, so there +	 * are some delayed items which are not dealt with. Now deal with +	 * them. +	 * +	 * We needn't worry that this operation will corrupt the snapshots, +	 * because all the trees which are snapshotted will be forced to COW +	 * the nodes and leaves. 
+	 */ +	ret = btrfs_run_delayed_items(trans, root);  	if (ret) {  		mutex_unlock(&root->fs_info->reloc_mutex);  		goto cleanup_transaction; @@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	put_transaction(cur_trans);  	put_transaction(cur_trans); -	sb_end_intwrite(root->fs_info->sb); +	if (trans->type < TRANS_JOIN_NOLOCK) +		sb_end_intwrite(root->fs_info->sb);  	trace_btrfs_transaction_commit(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e8b8416c688..80961947a6b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,14 @@ struct btrfs_transaction {  	int aborted;  }; +enum btrfs_trans_type { +	TRANS_START, +	TRANS_JOIN, +	TRANS_USERSPACE, +	TRANS_JOIN_NOLOCK, +	TRANS_ATTACH, +}; +  struct btrfs_trans_handle {  	u64 transid;  	u64 bytes_reserved; @@ -58,8 +66,9 @@ struct btrfs_trans_handle {  	struct btrfs_transaction *transaction;  	struct btrfs_block_rsv *block_rsv;  	struct btrfs_block_rsv *orig_rsv; -	int aborted; -	int adding_csums; +	short aborted; +	short adding_csums; +	enum btrfs_trans_type type;  	/*  	 * this root is only needed to validate that the root passed to  	 * start_transaction is the same as the one passed to end_transaction. @@ -68,6 +77,7 @@ struct btrfs_trans_handle {  	struct btrfs_root *root;  	struct seq_list delayed_ref_elem;  	struct list_head qgroup_ref_list; +	struct list_head new_bgs;  };  struct btrfs_pending_snapshot { @@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,  {  	BTRFS_I(inode)->last_trans = trans->transaction->transid;  	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; +	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;  }  int btrfs_end_transaction(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root); -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, -				 struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,  						   int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_noflush( +					struct btrfs_root *root, int num_items);  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);  int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);  int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c86670f4f28..e9ebb472b28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,13 +18,16 @@  #include <linux/sched.h>  #include <linux/slab.h> +#include <linux/list_sort.h>  #include "ctree.h"  #include "transaction.h"  #include "disk-io.h"  #include "locking.h"  #include "print-tree.h" +#include "backref.h"  #include "compat.h"  #include "tree-log.h" +#include "hash.h"  /* magic values for the inode_only field in btrfs_log_inode:   * @@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,  			root->log_multiple_pids = true;  		} -		root->log_batch++; +		atomic_inc(&root->log_batch);  		atomic_inc(&root->log_writers);  		mutex_unlock(&root->log_mutex);  		return 0; @@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,  			err = ret;  	}  	
mutex_unlock(&root->fs_info->tree_log_mutex); -	root->log_batch++; +	atomic_inc(&root->log_batch);  	atomic_inc(&root->log_writers);  	mutex_unlock(&root->log_mutex);  	return err; @@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  	int found_type;  	u64 mask = root->sectorsize - 1;  	u64 extent_end; -	u64 alloc_hint;  	u64 start = key->offset;  	u64 saved_nbytes;  	struct btrfs_file_extent_item *item; @@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  	saved_nbytes = inode_get_bytes(inode);  	/* drop any overlapping extents */ -	ret = btrfs_drop_extents(trans, inode, start, extent_end, -				 &alloc_hint, 1); +	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);  	BUG_ON(ret);  	if (found_type == BTRFS_FILE_EXTENT_REG || @@ -744,6 +745,7 @@ out:   */  static noinline int backref_in_log(struct btrfs_root *log,  				   struct btrfs_key *key, +				   u64 ref_objectid,  				   char *name, int namelen)  {  	struct btrfs_path *path; @@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,  	if (ret != 0)  		goto out; -	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);  	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + +	if (key->type == BTRFS_INODE_EXTREF_KEY) { +		if (btrfs_find_name_in_ext_backref(path, ref_objectid, +						   name, namelen, NULL)) +			match = 1; + +		goto out; +	} + +	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);  	ptr_end = ptr + item_size;  	while (ptr < ptr_end) {  		ref = (struct btrfs_inode_ref *)ptr; @@ -786,91 +797,42 @@ out:  	return match;  } - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function.  (it should be released on return). - */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +static inline int __add_inode_ref(struct btrfs_trans_handle *trans,  				  struct btrfs_root *root, -				  struct btrfs_root *log,  				  struct btrfs_path *path, -				  struct extent_buffer *eb, int slot, -				  struct btrfs_key *key) +				  struct btrfs_root *log_root, +				  struct inode *dir, struct inode *inode, +				  struct extent_buffer *eb, +				  u64 inode_objectid, u64 parent_objectid, +				  u64 ref_index, char *name, int namelen, +				  int *search_done)  { -	struct btrfs_inode_ref *ref; -	struct btrfs_dir_item *di; -	struct inode *dir; -	struct inode *inode; -	unsigned long ref_ptr; -	unsigned long ref_end; -	char *name; -	int namelen;  	int ret; -	int search_done = 0; - -	/* -	 * it is possible that we didn't log all the parent directories -	 * for a given inode.  If we don't find the dir, just don't -	 * copy the back ref in.  
The link count fixup code will take -	 * care of the rest -	 */ -	dir = read_one_inode(root, key->offset); -	if (!dir) -		return -ENOENT; - -	inode = read_one_inode(root, key->objectid); -	if (!inode) { -		iput(dir); -		return -EIO; -	} - -	ref_ptr = btrfs_item_ptr_offset(eb, slot); -	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); +	char *victim_name; +	int victim_name_len; +	struct extent_buffer *leaf; +	struct btrfs_dir_item *di; +	struct btrfs_key search_key; +	struct btrfs_inode_extref *extref;  again: -	ref = (struct btrfs_inode_ref *)ref_ptr; - -	namelen = btrfs_inode_ref_name_len(eb, ref); -	name = kmalloc(namelen, GFP_NOFS); -	BUG_ON(!name); - -	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - -	/* if we already have a perfect match, we're done */ -	if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), -			 btrfs_inode_ref_index(eb, ref), -			 name, namelen)) { -		goto out; -	} - -	/* -	 * look for a conflicting back reference in the metadata. -	 * if we find one we have to unlink that name of the file -	 * before we add our new link.  Later on, we overwrite any -	 * existing back reference, and we don't want to create -	 * dangling pointers in the directory. -	 */ - -	if (search_done) -		goto insert; - -	ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +	/* Search old style refs */ +	search_key.objectid = inode_objectid; +	search_key.type = BTRFS_INODE_REF_KEY; +	search_key.offset = parent_objectid; +	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);  	if (ret == 0) { -		char *victim_name; -		int victim_name_len;  		struct btrfs_inode_ref *victim_ref;  		unsigned long ptr;  		unsigned long ptr_end; -		struct extent_buffer *leaf = path->nodes[0]; + +		leaf = path->nodes[0];  		/* are we trying to overwrite a back ref for the root directory  		 * if so, just jump out, we're done  		 */ -		if (key->objectid == key->offset) -			goto out_nowrite; +		if (search_key.objectid == search_key.offset) +			return 1;  		/* check all the names in this back reference to see  		 * if they are in the log.  if so, we allow them to stay @@ -889,7 +851,9 @@ again:  					   (unsigned long)(victim_ref + 1),  					   victim_name_len); -			if (!backref_in_log(log, key, victim_name, +			if (!backref_in_log(log_root, &search_key, +					    parent_objectid, +					    victim_name,  					    victim_name_len)) {  				btrfs_inc_nlink(inode);  				btrfs_release_path(path);   				ret = btrfs_unlink_inode(trans, root, dir,  							 inode, victim_name,  							 victim_name_len); +				BUG_ON(ret);  				btrfs_run_delayed_items(trans, root); +				kfree(victim_name); +				*search_done = 1; +				goto again;  			}  			kfree(victim_name); +  			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;  		}  		BUG_ON(ret); @@ -908,14 +877,78 @@ again:  		 * NOTE: we have searched root tree and checked the  		 * corresponding ref, it does not need to check again.  		 
*/ -		search_done = 1; +		*search_done = 1; +	} +	btrfs_release_path(path); + +	/* Same search but for extended refs */ +	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, +					   inode_objectid, parent_objectid, 0, +					   0); +	if (!IS_ERR_OR_NULL(extref)) { +		u32 item_size; +		u32 cur_offset = 0; +		unsigned long base; +		struct inode *victim_parent; + +		leaf = path->nodes[0]; + +		item_size = btrfs_item_size_nr(leaf, path->slots[0]); +		base = btrfs_item_ptr_offset(leaf, path->slots[0]); + +		while (cur_offset < item_size) { +			extref = (struct btrfs_inode_extref *)base + cur_offset; + +			victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + +			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) +				goto next; + +			victim_name = kmalloc(victim_name_len, GFP_NOFS); +			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, +					   victim_name_len); + +			search_key.objectid = inode_objectid; +			search_key.type = BTRFS_INODE_EXTREF_KEY; +			search_key.offset = btrfs_extref_hash(parent_objectid, +							      victim_name, +							      victim_name_len); +			ret = 0; +			if (!backref_in_log(log_root, &search_key, +					    parent_objectid, victim_name, +					    victim_name_len)) { +				ret = -ENOENT; +				victim_parent = read_one_inode(root, +							       parent_objectid); +				if (victim_parent) { +					btrfs_inc_nlink(inode); +					btrfs_release_path(path); + +					ret = btrfs_unlink_inode(trans, root, +								 victim_parent, +								 inode, +								 victim_name, +								 victim_name_len); +					btrfs_run_delayed_items(trans, root); +				} +				BUG_ON(ret); +				iput(victim_parent); +				kfree(victim_name); +				*search_done = 1; +				goto again; +			} +			kfree(victim_name); +			BUG_ON(ret); +next: +			cur_offset += victim_name_len + sizeof(*extref); +		} +		*search_done = 1;  	}  	btrfs_release_path(path);  	/* look for a conflicting sequence number */  	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), -					 btrfs_inode_ref_index(eb, ref), -					 name, namelen, 0); +					 ref_index, name, namelen, 0);  	if (di && !IS_ERR(di)) {  		ret = drop_one_dir_item(trans, root, path, dir, di);  		BUG_ON(ret); @@ -931,25 +964,173 @@ again:  	}  	btrfs_release_path(path); -insert: -	/* insert our name */ -	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, -			     btrfs_inode_ref_index(eb, ref)); -	BUG_ON(ret); +	return 0; +} -	btrfs_update_inode(trans, root, inode); +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, +			     u32 *namelen, char **name, u64 *index, +			     u64 *parent_objectid) +{ +	struct btrfs_inode_extref *extref; -out: -	ref_ptr = (unsigned long)(ref + 1) + namelen; -	kfree(name); -	if (ref_ptr < ref_end) -		goto again; +	extref = (struct btrfs_inode_extref *)ref_ptr; + +	*namelen = btrfs_inode_extref_name_len(eb, extref); +	*name = kmalloc(*namelen, GFP_NOFS); +	if (*name == NULL) +		return -ENOMEM; + +	read_extent_buffer(eb, *name, (unsigned long)&extref->name, +			   *namelen); + +	*index = btrfs_inode_extref_index(eb, extref); +	if (parent_objectid) +		*parent_objectid = btrfs_inode_extref_parent(eb, extref); + +	return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, +			  u32 *namelen, char **name, u64 *index) +{ +	struct btrfs_inode_ref *ref; + +	ref = (struct btrfs_inode_ref *)ref_ptr; + +	*namelen = btrfs_inode_ref_name_len(eb, ref); +	*name = kmalloc(*namelen, GFP_NOFS); +	if (*name == NULL) +		return -ENOMEM; 
+ +	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + +	*index = btrfs_inode_ref_index(eb, ref); + +	return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function.  (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +				  struct btrfs_root *root, +				  struct btrfs_root *log, +				  struct btrfs_path *path, +				  struct extent_buffer *eb, int slot, +				  struct btrfs_key *key) +{ +	struct inode *dir; +	struct inode *inode; +	unsigned long ref_ptr; +	unsigned long ref_end; +	char *name; +	int namelen; +	int ret; +	int search_done = 0; +	int log_ref_ver = 0; +	u64 parent_objectid; +	u64 inode_objectid; +	u64 ref_index = 0; +	int ref_struct_size; + +	ref_ptr = btrfs_item_ptr_offset(eb, slot); +	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +	if (key->type == BTRFS_INODE_EXTREF_KEY) { +		struct btrfs_inode_extref *r; + +		ref_struct_size = sizeof(struct btrfs_inode_extref); +		log_ref_ver = 1; +		r = (struct btrfs_inode_extref *)ref_ptr; +		parent_objectid = btrfs_inode_extref_parent(eb, r); +	} else { +		ref_struct_size = sizeof(struct btrfs_inode_ref); +		parent_objectid = key->offset; +	} +	inode_objectid = key->objectid; + +	/* +	 * it is possible that we didn't log all the parent directories +	 * for a given inode.  If we don't find the dir, just don't +	 * copy the back ref in.  The link count fixup code will take +	 * care of the rest +	 */ +	dir = read_one_inode(root, parent_objectid); +	if (!dir) +		return -ENOENT; + +	inode = read_one_inode(root, inode_objectid); +	if (!inode) { +		iput(dir); +		return -EIO; +	} + +	while (ref_ptr < ref_end) { +		if (log_ref_ver) { +			ret = extref_get_fields(eb, ref_ptr, &namelen, &name, +						&ref_index, &parent_objectid); +			/* +			 * parent object can change from one array +			 * item to another. +			 */ +			if (!dir) +				dir = read_one_inode(root, parent_objectid); +			if (!dir) +				return -ENOENT; +		} else { +			ret = ref_get_fields(eb, ref_ptr, &namelen, &name, +					     &ref_index); +		} +		if (ret) +			return ret; + +		/* if we already have a perfect match, we're done */ +		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), +				  ref_index, name, namelen)) { +			/* +			 * look for a conflicting back reference in the +			 * metadata. if we find one we have to unlink that name +			 * of the file before we add our new link.  Later on, we +			 * overwrite any existing back reference, and we don't +			 * want to create dangling pointers in the directory. 
+			 */ + +			if (!search_done) { +				ret = __add_inode_ref(trans, root, path, log, +						      dir, inode, eb, +						      inode_objectid, +						      parent_objectid, +						      ref_index, name, namelen, +						      &search_done); +				if (ret == 1) +					goto out; +				BUG_ON(ret); +			} + +			/* insert our name */ +			ret = btrfs_add_link(trans, dir, inode, name, namelen, +					     0, ref_index); +			BUG_ON(ret); + +			btrfs_update_inode(trans, root, inode); +		} + +		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; +		kfree(name); +		if (log_ref_ver) { +			iput(dir); +			dir = NULL; +		} +	}  	/* finally write the back reference in the inode */  	ret = overwrite_item(trans, root, path, eb, slot, key);  	BUG_ON(ret); -out_nowrite: +out:  	btrfs_release_path(path);  	iput(dir);  	iput(inode); @@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,  	return ret;  } +static int count_inode_extrefs(struct btrfs_root *root, +			       struct inode *inode, struct btrfs_path *path) +{ +	int ret = 0; +	int name_len; +	unsigned int nlink = 0; +	u32 item_size; +	u32 cur_offset = 0; +	u64 inode_objectid = btrfs_ino(inode); +	u64 offset = 0; +	unsigned long ptr; +	struct btrfs_inode_extref *extref; +	struct extent_buffer *leaf; -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay.  So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found.  If it goes down to zero, the iput - * will free the inode. - */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, -					   struct btrfs_root *root, -					   struct inode *inode) +	while (1) { +		ret = btrfs_find_one_extref(root, inode_objectid, offset, path, +					    &extref, &offset); +		if (ret) +			break; + +		leaf = path->nodes[0]; +		item_size = btrfs_item_size_nr(leaf, path->slots[0]); +		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + +		while (cur_offset < item_size) { +			extref = (struct btrfs_inode_extref *) (ptr + cur_offset); +			name_len = btrfs_inode_extref_name_len(leaf, extref); + +			nlink++; + +			cur_offset += name_len + sizeof(*extref); +		} + +		offset++; +		btrfs_release_path(path); +	} +	btrfs_release_path(path); + +	if (ret < 0) +		return ret; +	return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, +			       struct inode *inode, struct btrfs_path *path)  { -	struct btrfs_path *path;  	int ret;  	struct btrfs_key key; -	u64 nlink = 0; +	unsigned int nlink = 0;  	unsigned long ptr;  	unsigned long ptr_end;  	int name_len; @@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,  	key.type = BTRFS_INODE_REF_KEY;  	key.offset = (u64)-1; -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; -  	while (1) {  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  		if (ret < 0) @@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,  		btrfs_release_path(path);  	}  	btrfs_release_path(path); + +	return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay.  So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. 
+ * + * The scan will update the link count on the inode to reflect the + * number of back refs found.  If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +					   struct btrfs_root *root, +					   struct inode *inode) +{ +	struct btrfs_path *path; +	int ret; +	u64 nlink = 0; +	u64 ino = btrfs_ino(inode); + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; + +	ret = count_inode_refs(root, inode, path); +	if (ret < 0) +		goto out; + +	nlink = ret; + +	ret = count_inode_extrefs(root, inode, path); +	if (ret == -ENOENT) +		ret = 0; + +	if (ret < 0) +		goto out; + +	nlink += ret; + +	ret = 0; +  	if (nlink != inode->i_nlink) {  		set_nlink(inode, nlink);  		btrfs_update_inode(trans, root, inode); @@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,  		ret = insert_orphan_item(trans, root, ino);  		BUG_ON(ret);  	} -	btrfs_free_path(path); -	return 0; +out: +	btrfs_free_path(path); +	return ret;  }  static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, @@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,  			ret = add_inode_ref(wc->trans, root, log, path,  					    eb, i, &key);  			BUG_ON(ret && ret != -ENOENT); +		} else if (key.type == BTRFS_INODE_EXTREF_KEY) { +			ret = add_inode_ref(wc->trans, root, log, path, +					    eb, i, &key); +			BUG_ON(ret && ret != -ENOENT);  		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {  			ret = replay_one_extent(wc->trans, root, path,  						eb, i, &key); @@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))  		wait_log_commit(trans, root, root->log_transid - 1);  	while (1) { -		unsigned long batch = root->log_batch; +		int batch = atomic_read(&root->log_batch);  		/* when we're on an ssd, just kick the log commit out */  		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {  			mutex_unlock(&root->log_mutex); @@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  			mutex_lock(&root->log_mutex);  		}  		wait_for_writer(trans, root); -		if (batch == root->log_batch) +		if (batch == atomic_read(&root->log_batch))  			break;  	} @@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	btrfs_set_root_node(&log->root_item, log->node); -	root->log_batch = 0;  	root->log_transid++;  	log->log_transid = root->log_transid;  	root->log_start_pid = 0; @@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	mutex_unlock(&root->log_mutex);  	mutex_lock(&log_root_tree->log_mutex); -	log_root_tree->log_batch++; +	atomic_inc(&log_root_tree->log_batch);  	atomic_inc(&log_root_tree->log_writers);  	mutex_unlock(&log_root_tree->log_mutex); @@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,  				btrfs_header_level(log_root_tree->node)); -	log_root_tree->log_batch = 0;  	log_root_tree->log_transid++;  	smp_mb(); @@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	 * in and cause problems either.  	 
*/  	btrfs_scrub_pause_super(root); -	write_ctree_super(trans, root->fs_info->tree_root, 1); +	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);  	btrfs_scrub_continue_super(root); -	ret = 0; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_wake_log_root; +	}  	mutex_lock(&root->log_mutex);  	if (root->last_log_commit < log_transid) @@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,  	while (1) {  		ret = find_first_extent_bit(&log->dirty_log_pages, -				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); +				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, +				NULL);  		if (ret)  			break; @@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,  	int ret;  	struct btrfs_key key;  	struct btrfs_key found_key; +	int start_slot;  	key.objectid = objectid;  	key.type = max_key_type; @@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,  		if (found_key.objectid != objectid)  			break; -		ret = btrfs_del_item(trans, log, path); -		if (ret) +		found_key.offset = 0; +		found_key.type = 0; +		ret = btrfs_bin_search(path->nodes[0], &found_key, 0, +				       &start_slot); + +		ret = btrfs_del_items(trans, log, path, start_slot, +				      path->slots[0] - start_slot + 1); +		/* +		 * If start slot isn't 0 then we don't need to re-search, we've +		 * found the last guy with the objectid in this tree. +		 */ +		if (ret || start_slot != 0)  			break;  		btrfs_release_path(path);  	} @@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,  	return ret;  } +static void fill_inode_item(struct btrfs_trans_handle *trans, +			    struct extent_buffer *leaf, +			    struct btrfs_inode_item *item, +			    struct inode *inode, int log_inode_only) +{ +	btrfs_set_inode_uid(leaf, item, inode->i_uid); +	btrfs_set_inode_gid(leaf, item, inode->i_gid); +	btrfs_set_inode_mode(leaf, item, inode->i_mode); +	btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + +	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), +			       inode->i_atime.tv_sec); +	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), +				inode->i_atime.tv_nsec); + +	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), +			       inode->i_mtime.tv_sec); +	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), +				inode->i_mtime.tv_nsec); + +	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), +			       inode->i_ctime.tv_sec); +	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), +				inode->i_ctime.tv_nsec); + +	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + +	btrfs_set_inode_sequence(leaf, item, inode->i_version); +	btrfs_set_inode_transid(leaf, item, trans->transid); +	btrfs_set_inode_rdev(leaf, item, inode->i_rdev); +	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); +	btrfs_set_inode_block_group(leaf, item, 0); + +	if (log_inode_only) { +		/* set the generation to zero so the recovery code +		 * can tell the difference between a logging +		 * just to say 'this inode exists' and a logging +		 * to say 'update this inode with these values' +		 */ +		btrfs_set_inode_generation(leaf, item, 0); +		btrfs_set_inode_size(leaf, item, 0); +	} else { +		btrfs_set_inode_generation(leaf, item, +					   BTRFS_I(inode)->generation); +		btrfs_set_inode_size(leaf, item, inode->i_size); +	} + +} +  static noinline int copy_items(struct btrfs_trans_handle *trans, -			       struct btrfs_root *log, +			       struct inode *inode,  			       struct btrfs_path *dst_path,  			       
struct extent_buffer *src,  			       int start_slot, int nr, int inode_only)  {  	unsigned long src_offset;  	unsigned long dst_offset; +	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;  	struct btrfs_file_extent_item *extent;  	struct btrfs_inode_item *inode_item;  	int ret; @@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,  	char *ins_data;  	int i;  	struct list_head ordered_sums; +	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;  	INIT_LIST_HEAD(&ordered_sums); @@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,  		src_offset = btrfs_item_ptr_offset(src, start_slot + i); -		copy_extent_buffer(dst_path->nodes[0], src, dst_offset, -				   src_offset, ins_sizes[i]); - -		if (inode_only == LOG_INODE_EXISTS && -		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { +		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {  			inode_item = btrfs_item_ptr(dst_path->nodes[0],  						    dst_path->slots[0],  						    struct btrfs_inode_item); -			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); - -			/* set the generation to zero so the recover code -			 * can tell the difference between an logging -			 * just to say 'this inode exists' and a logging -			 * to say 'update this inode with these values' -			 */ -			btrfs_set_inode_generation(dst_path->nodes[0], -						   inode_item, 0); +			fill_inode_item(trans, dst_path->nodes[0], inode_item, +					inode, inode_only == LOG_INODE_EXISTS); +		} else { +			copy_extent_buffer(dst_path->nodes[0], src, dst_offset, +					   src_offset, ins_sizes[i]);  		} +  		/* take a reference on file data extents so that truncates  		 * or deletes of this inode don't have to relog the inode  		 * again  		 */ -		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { +		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && +		    !skip_csum) {  			int found_type;  			extent = btrfs_item_ptr(src, start_slot + i,  						struct btrfs_file_extent_item); @@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,  				continue;  			found_type = btrfs_file_extent_type(src, extent); -			if (found_type == BTRFS_FILE_EXTENT_REG || -			    found_type == BTRFS_FILE_EXTENT_PREALLOC) { +			if (found_type == BTRFS_FILE_EXTENT_REG) {  				u64 ds, dl, cs, cl;  				ds = btrfs_file_extent_disk_bytenr(src,  								extent); @@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,  	return ret;  } +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct extent_map *em1, *em2; + +	em1 = list_entry(a, struct extent_map, list); +	em2 = list_entry(b, struct extent_map, list); + +	if (em1->start < em2->start) +		return -1; +	else if (em1->start > em2->start) +		return 1; +	return 0; +} + +struct log_args { +	struct extent_buffer *src; +	u64 next_offset; +	int start_slot; +	int nr; +}; + +static int log_one_extent(struct btrfs_trans_handle *trans, +			  struct inode *inode, struct btrfs_root *root, +			  struct extent_map *em, struct btrfs_path *path, +			  struct btrfs_path *dst_path, struct log_args *args) +{ +	struct btrfs_root *log = root->log_root; +	struct btrfs_file_extent_item *fi; +	struct btrfs_key key; +	u64 start = em->mod_start; +	u64 search_start = start; +	u64 len = em->mod_len; +	u64 num_bytes; +	int nritems; +	int ret; + +	if (BTRFS_I(inode)->logged_trans == trans->transid) { +		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, +					   start + len, NULL, 0); +		if 
(ret) +			return ret; +	} + +	while (len) { +		if (args->nr) +			goto next_slot; +again: +		key.objectid = btrfs_ino(inode); +		key.type = BTRFS_EXTENT_DATA_KEY; +		key.offset = search_start; + +		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +		if (ret < 0) +			return ret; + +		if (ret) { +			/* +			 * A rare case where we can have an em for a section of a +			 * larger extent so we need to make sure that this em +			 * falls within the extent we've found.  If not we just +			 * bail and go back to ye-olde way of doing things but +			 * it happens often enough in testing that we need to do +			 * this dance to make sure. +			 */ +			do { +				if (path->slots[0] == 0) { +					btrfs_release_path(path); +					if (search_start == 0) +						return -ENOENT; +					search_start--; +					goto again; +				} + +				path->slots[0]--; +				btrfs_item_key_to_cpu(path->nodes[0], &key, +						      path->slots[0]); +				if (key.objectid != btrfs_ino(inode) || +				    key.type != BTRFS_EXTENT_DATA_KEY) { +					btrfs_release_path(path); +					return -ENOENT; +				} +			} while (key.offset > start); + +			fi = btrfs_item_ptr(path->nodes[0], path->slots[0], +					    struct btrfs_file_extent_item); +			num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], +								fi); +			if (key.offset + num_bytes <= start) { +				btrfs_release_path(path); +				return -ENOENT; +			} +		} +		args->src = path->nodes[0]; +next_slot: +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +		fi = btrfs_item_ptr(args->src, path->slots[0], +				    struct btrfs_file_extent_item); +		if (args->nr && +		    args->start_slot + args->nr == path->slots[0]) { +			args->nr++; +		} else if (args->nr) { +			ret = copy_items(trans, inode, dst_path, args->src, +					 args->start_slot, args->nr, +					 LOG_INODE_ALL); +			if (ret) +				return ret; +			args->nr = 1; +			args->start_slot = path->slots[0]; +		} else if (!args->nr) { +			args->nr = 1; +			args->start_slot = path->slots[0]; +		} +		nritems = btrfs_header_nritems(path->nodes[0]); +		path->slots[0]++; +		num_bytes = btrfs_file_extent_num_bytes(args->src, fi); +		if (len < num_bytes) { +			/* I _think_ this is ok, envision we write to a +			 * preallocated space that is adjacent to a previously +			 * written preallocated space that gets merged when we +			 * mark this preallocated space written.  If we do not +			 * have the adjacent extent in cache then when we copy +			 * this extent it could end up being larger than our EM +			 * thinks it is, which is a-ok, so just set len to 0. 
+			 */ +			len = 0; +		} else { +			len -= num_bytes; +		} +		start = key.offset + num_bytes; +		args->next_offset = start; +		search_start = start; + +		if (path->slots[0] < nritems) { +			if (len) +				goto next_slot; +			break; +		} + +		if (args->nr) { +			ret = copy_items(trans, inode, dst_path, args->src, +					 args->start_slot, args->nr, +					 LOG_INODE_ALL); +			if (ret) +				return ret; +			args->nr = 0; +			btrfs_release_path(path); +		} +	} + +	return 0; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, +				     struct btrfs_root *root, +				     struct inode *inode, +				     struct btrfs_path *path, +				     struct btrfs_path *dst_path) +{ +	struct log_args args; +	struct extent_map *em, *n; +	struct list_head extents; +	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; +	u64 test_gen; +	int ret = 0; + +	INIT_LIST_HEAD(&extents); + +	memset(&args, 0, sizeof(args)); + +	write_lock(&tree->lock); +	test_gen = root->fs_info->last_trans_committed; + +	list_for_each_entry_safe(em, n, &tree->modified_extents, list) { +		list_del_init(&em->list); +		if (em->generation <= test_gen) +			continue; +		/* Need a ref to keep it from getting evicted from cache */ +		atomic_inc(&em->refs); +		set_bit(EXTENT_FLAG_LOGGING, &em->flags); +		list_add_tail(&em->list, &extents); +	} + +	list_sort(NULL, &extents, extent_cmp); + +	while (!list_empty(&extents)) { +		em = list_entry(extents.next, struct extent_map, list); + +		list_del_init(&em->list); +		clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + +		/* +		 * If we had an error we just need to delete everybody from our +		 * private list. +		 */ +		if (ret) { +			free_extent_map(em); +			continue; +		} + +		write_unlock(&tree->lock); + +		/* +		 * If the previous EM and the last extent we left off on aren't +		 * sequential then we need to copy the items we have and redo +		 * our search +		 */ +		if (args.nr && em->mod_start != args.next_offset) { +			ret = copy_items(trans, inode, dst_path, args.src, +					 args.start_slot, args.nr, +					 LOG_INODE_ALL); +			if (ret) { +				free_extent_map(em); +				write_lock(&tree->lock); +				continue; +			} +			btrfs_release_path(path); +			args.nr = 0; +		} + +		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); +		free_extent_map(em); +		write_lock(&tree->lock); +	} +	WARN_ON(!list_empty(&extents)); +	write_unlock(&tree->lock); + +	if (!ret && args.nr) +		ret = copy_items(trans, inode, dst_path, args.src, +				 args.start_slot, args.nr, LOG_INODE_ALL); +	btrfs_release_path(path); +	return ret; +} +  /* log a single inode in the tree log.   * At least one parent directory for this inode must exist in the tree   * or be logged already. 
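The fast-logging path added above hinges on ordering: btrfs_log_changed_extents() pulls every extent map modified in the running transaction off the inode's modified_extents list, sorts that private list by mod_start via list_sort() and the extent_cmp() comparator, and only then hands each entry to log_one_extent(), so the search/copy loop always walks the file forward. Below is a minimal, self-contained userspace sketch of that sort-by-start step; the mod_extent struct and cmp_start() are invented for illustration, and qsort() merely stands in for the kernel's list_sort().

#include <stdio.h>
#include <stdlib.h>

struct mod_extent {
	unsigned long long start;	/* plays the role of extent_map::mod_start */
	unsigned long long len;		/* plays the role of extent_map::mod_len */
};

/* Same ordering rule as extent_cmp(): ascending by start offset. */
static int cmp_start(const void *a, const void *b)
{
	const struct mod_extent *e1 = a;
	const struct mod_extent *e2 = b;

	if (e1->start < e2->start)
		return -1;
	if (e1->start > e2->start)
		return 1;
	return 0;
}

int main(void)
{
	/* Extents land on the modified list in write order, not file order. */
	struct mod_extent ems[] = {
		{ 8192, 4096 }, { 0, 4096 }, { 4096, 4096 },
	};
	size_t i, n = sizeof(ems) / sizeof(ems[0]);

	qsort(ems, n, sizeof(ems[0]), cmp_start);	/* stand-in for list_sort() */

	for (i = 0; i < n; i++)
		printf("log extent: start=%llu len=%llu\n",
		       ems[i].start, ems[i].len);
	return 0;
}

With the list ordered this way, the caller's check of em->mod_start against args.next_offset can detect a non-contiguous extent cheaply and flush the pending copy_items() batch before re-searching.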
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	int nritems;  	int ins_start_slot = 0;  	int ins_nr; +	bool fast_search = false;  	u64 ino = btrfs_ino(inode);  	log = root->log_root; @@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	max_key.objectid = ino; -	/* today the code can only do partial logging of directories */ -	if (!S_ISDIR(inode->i_mode)) -	    inode_only = LOG_INODE_ALL; +	/* today the code can only do partial logging of directories */  	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))  		max_key.type = BTRFS_XATTR_ITEM_KEY;  	else  		max_key.type = (u8)-1;  	max_key.offset = (u64)-1; -	ret = btrfs_commit_inode_delayed_items(trans, inode); -	if (ret) { -		btrfs_free_path(path); -		btrfs_free_path(dst_path); -		return ret; +	/* Only run delayed items if we are a dir or a new file */ +	if (S_ISDIR(inode->i_mode) || +	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { +		ret = btrfs_commit_inode_delayed_items(trans, inode); +		if (ret) { +			btrfs_free_path(path); +			btrfs_free_path(dst_path); +			return ret; +		}  	}  	mutex_lock(&BTRFS_I(inode)->log_mutex); @@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  			max_key_type = BTRFS_XATTR_ITEM_KEY;  		ret = drop_objectid_items(trans, log, path, ino, max_key_type);  	} else { -		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); +		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +				       &BTRFS_I(inode)->runtime_flags)) { +			ret = btrfs_truncate_inode_items(trans, log, +							 inode, 0, 0); +		} else { +			fast_search = true; +			max_key.type = BTRFS_XATTR_ITEM_KEY; +			ret = drop_objectid_items(trans, log, path, ino, +						  BTRFS_XATTR_ITEM_KEY); +		}  	}  	if (ret) {  		err = ret; @@ -2912,7 +3470,7 @@ again:  			goto next_slot;  		} -		ret = copy_items(trans, log, dst_path, src, ins_start_slot, +		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,  				 ins_nr, inode_only);  		if (ret) {  			err = ret; @@ -2930,7 +3488,7 @@ next_slot:  			goto again;  		}  		if (ins_nr) { -			ret = copy_items(trans, log, dst_path, src, +			ret = copy_items(trans, inode, dst_path, src,  					 ins_start_slot,  					 ins_nr, inode_only);  			if (ret) { @@ -2951,8 +3509,7 @@ next_slot:  			break;  	}  	if (ins_nr) { -		ret = copy_items(trans, log, dst_path, src, -				 ins_start_slot, +		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,  				 ins_nr, inode_only);  		if (ret) {  			err = ret; @@ -2960,7 +3517,24 @@ next_slot:  		}  		ins_nr = 0;  	} -	WARN_ON(ins_nr); + +	if (fast_search) { +		btrfs_release_path(path); +		btrfs_release_path(dst_path); +		ret = btrfs_log_changed_extents(trans, root, inode, path, +						dst_path); +		if (ret) { +			err = ret; +			goto out_unlock; +		} +	} else { +		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; +		struct extent_map *em, *n; + +		list_for_each_entry_safe(em, n, &tree->modified_extents, list) +			list_del_init(&em->list); +	} +  	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {  		btrfs_release_path(path);  		btrfs_release_path(dst_path); @@ -2971,6 +3545,7 @@ next_slot:  		}  	}  	BTRFS_I(inode)->logged_trans = trans->transid; +	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;  out_unlock:  	mutex_unlock(&BTRFS_I(inode)->log_mutex); @@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,  end_trans:  	dput(old_parent);  	if (ret < 0) { -		BUG_ON(ret != 
-ENOSPC); +		WARN_ON(ret != -ENOSPC);  		root->fs_info->last_trans_log_full_commit = trans->transid;  		ret = 1;  	} diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ab942f46b3d..99be4c138db 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);   * In case of allocation failure -ENOMEM is returned and the ulist stays   * unaltered.   */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, -	      gfp_t gfp_mask) +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)  {  	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);  } -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, -		    unsigned long *old_aux, gfp_t gfp_mask) +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, +		    u64 *old_aux, gfp_t gfp_mask)  {  	int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 21bdc8ec813..21a1963439c 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -33,7 +33,7 @@ struct ulist_iterator {   */  struct ulist_node {  	u64 val;		/* value to store */ -	unsigned long aux;	/* auxiliary value saved along with the val */ +	u64 aux;		/* auxiliary value saved along with the val */  };  struct ulist { @@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);  void ulist_reinit(struct ulist *ulist);  struct ulist *ulist_alloc(gfp_t gfp_mask);  void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, -	      gfp_t gfp_mask); -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, -		    unsigned long *old_aux, gfp_t gfp_mask); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, +		    u64 *old_aux, gfp_t gfp_mask);  struct ulist_node *ulist_next(struct ulist *ulist,  			      struct ulist_iterator *uiter); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 88b969aeeb7..029b903a4ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  		bdev = blkdev_get_by_path(device->name->str, flags, holder);  		if (IS_ERR(bdev)) { -			printk(KERN_INFO "open %s failed\n", device->name->str); +			printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);  			goto error;  		}  		filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		free_fs_devices(cur_devices);  	} +	root->fs_info->num_tolerated_disk_barrier_failures = +		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); +  	/*  	 * at this point, the device is zero sized.  
We want to  	 * remove it from the devices list and zero out the old super @@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	if (seeding_dev) {  		ret = init_first_rw_device(trans, root, device); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  		ret = btrfs_finish_sprout(trans, root); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  	} else {  		ret = btrfs_add_device(trans, root, device); -		if (ret) +		if (ret) { +			btrfs_abort_transaction(trans, root, ret);  			goto error_trans; +		}  	}  	/* @@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	btrfs_clear_space_info_full(root->fs_info);  	unlock_chunks(root); +	root->fs_info->num_tolerated_disk_barrier_failures = +		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);  	ret = btrfs_commit_transaction(trans, root);  	if (seeding_dev) { @@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  error_trans:  	unlock_chunks(root); -	btrfs_abort_transaction(trans, root, ret);  	btrfs_end_transaction(trans, root);  	rcu_string_free(device->name);  	kfree(device); @@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		}  	} +	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		int num_tolerated_disk_barrier_failures; +		u64 target = bctl->sys.target; + +		num_tolerated_disk_barrier_failures = +			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); +		if (num_tolerated_disk_barrier_failures > 0 && +		    (target & +		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | +		      BTRFS_AVAIL_ALLOC_BIT_SINGLE))) +			num_tolerated_disk_barrier_failures = 0; +		else if (num_tolerated_disk_barrier_failures > 1 && +			 (target & +			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) +			num_tolerated_disk_barrier_failures = 1; + +		fs_info->num_tolerated_disk_barrier_failures = +			num_tolerated_disk_barrier_failures; +	} +  	ret = insert_balance_item(fs_info->tree_root, bctl);  	if (ret && ret != -EEXIST)  		goto out; @@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		__cancel_balance(fs_info);  	} +	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		fs_info->num_tolerated_disk_barrier_failures = +			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); +	} +  	wake_up(&fs_info->balance_wait_q);  	return ret; @@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,  				  &sys_chunk_size, &sys_stripe_size,  				  sys_chunk_offset, alloc_profile); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	ret = btrfs_add_device(trans, fs_info->chunk_root, device); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	/*  	 * Modifying chunk tree needs allocating new blocks from both @@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,  	 */  	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,  				   chunk_size, stripe_size); -	if (ret) -		goto abort; +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	ret = __finish_chunk_alloc(trans, extent_root, sys_map,  				   sys_chunk_offset, sys_chunk_size,  				   sys_stripe_size);  	if (ret) -		goto abort; +		
btrfs_abort_transaction(trans, root, ret); -	return 0; +out: -abort: -	btrfs_abort_transaction(trans, root, ret);  	return ret;  } @@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	read_unlock(&em_tree->lock);  	if (!em) { -		printk(KERN_CRIT "unable to find logical %llu len %llu\n", +		printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",  		       (unsigned long long)logical,  		       (unsigned long long)*length);  		BUG(); @@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	total_devs = bbio->num_stripes;  	if (map_length < length) { -		printk(KERN_CRIT "mapping failed logical %llu bio len %llu " +		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "  		       "len %llu\n", (unsigned long long)logical,  		       (unsigned long long)length,  		       (unsigned long long)map_length); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 92c20654cc5..9acb846c3e7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,  	*total_in = 0;  	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { -		printk(KERN_WARNING "deflateInit failed\n"); +		printk(KERN_WARNING "btrfs: deflateInit failed\n");  		ret = -1;  		goto out;  	} @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,  	while (workspace->def_strm.total_in < len) {  		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);  		if (ret != Z_OK) { -			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", +			printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",  			       ret);  			zlib_deflateEnd(&workspace->def_strm);  			ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,  	}  	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { -		printk(KERN_WARNING "inflateInit failed\n"); +		printk(KERN_WARNING "btrfs: inflateInit failed\n");  		return -1;  	}  	while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,  	}  	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { -		printk(KERN_WARNING "inflateInit failed\n"); +		printk(KERN_WARNING "btrfs: inflateInit failed\n");  		return -1;  	}
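One detail in the volumes.c hunks above is easy to miss: when a balance converts the system chunk profile, btrfs_balance() lowers fs_info->num_tolerated_disk_barrier_failures before the balance item is inserted, because the conversion target caps how many flush/barrier failures the filesystem can still survive (DUP, RAID0 or single tolerate none; RAID1 and RAID10 tolerate at most one). A rough standalone sketch of that clamping rule follows, with illustrative profile bits standing in for the real BTRFS_BLOCK_GROUP_* and BTRFS_AVAIL_ALLOC_BIT_SINGLE flags.

#include <stdio.h>

/* Illustrative stand-ins for the real btrfs profile bits. */
#define BG_DUP		(1ULL << 0)
#define BG_RAID0	(1ULL << 1)
#define BG_SINGLE	(1ULL << 2)
#define BG_RAID1	(1ULL << 3)
#define BG_RAID10	(1ULL << 4)

/*
 * Mirrors the clamp in the patch: start from the tolerance computed for
 * the current disk layout and lower it to what the conversion target of
 * the system chunks can still guarantee.
 */
static int clamp_barrier_tolerance(int tolerated, unsigned long long target)
{
	if (tolerated > 0 && (target & (BG_DUP | BG_RAID0 | BG_SINGLE)))
		return 0;
	if (tolerated > 1 && (target & (BG_RAID1 | BG_RAID10)))
		return 1;
	return tolerated;
}

int main(void)
{
	printf("convert system chunks to RAID0:  tolerate %d barrier failures\n",
	       clamp_barrier_tolerance(2, BG_RAID0));
	printf("convert system chunks to RAID10: tolerate %d barrier failures\n",
	       clamp_barrier_tolerance(2, BG_RAID10));
	return 0;
}

Once the balance finishes or is canceled, the same hunk recomputes the value from the on-disk state via btrfs_calc_num_tolerated_disk_barrier_failures(), so the clamp only needs to hold for the conversion window.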