diff options
| -rw-r--r-- | fs/btrfs/Makefile | 3 | ||||
| -rw-r--r-- | fs/btrfs/btrfs_inode.h | 8 | ||||
| -rw-r--r-- | fs/btrfs/compat.h | 15 | ||||
| -rw-r--r-- | fs/btrfs/ctree.c | 73 | ||||
| -rw-r--r-- | fs/btrfs/ctree.h | 72 | ||||
| -rw-r--r-- | fs/btrfs/dir-item.c | 3 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 138 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.h | 8 | ||||
| -rw-r--r-- | fs/btrfs/extent-tree.c | 93 | ||||
| -rw-r--r-- | fs/btrfs/file.c | 39 | ||||
| -rw-r--r-- | fs/btrfs/inode.c | 261 | ||||
| -rw-r--r-- | fs/btrfs/root-tree.c | 5 | ||||
| -rw-r--r-- | fs/btrfs/transaction.c | 45 | ||||
| -rw-r--r-- | fs/btrfs/transaction.h | 1 | ||||
| -rw-r--r-- | fs/btrfs/tree-defrag.c | 4 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.c | 2804 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.h | 41 | 
17 files changed, 3408 insertions, 205 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3a01065d4ef..b7addbfd8c2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,8 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \  	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ -	   ref-cache.o acl.o export.o - +	   ref-cache.o export.o tree-log.o acl.o  else  # Normal Makefile diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 111f90524ae..fcc8cf27e90 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -33,6 +33,7 @@ struct btrfs_inode {  	struct extent_io_tree io_failure_tree;  	struct mutex csum_mutex;  	struct mutex extent_mutex; +	struct mutex log_mutex;  	struct inode vfs_inode;  	struct btrfs_ordered_inode_tree ordered_tree; @@ -44,10 +45,17 @@ struct btrfs_inode {  	struct list_head delalloc_inodes; +	/* full 64 bit generation number */ +	u64 generation; +  	/*  	 * transid of the trans_handle that last modified this inode  	 */  	u64 last_trans; +	/* +	 * transid that last logged this inode +	 */ +	u64 logged_trans;  	u64 delalloc_bytes;  	u64 disk_i_size;  	u32 flags; diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h index d45fb37887b..b0ed1887d9b 100644 --- a/fs/btrfs/compat.h +++ b/fs/btrfs/compat.h @@ -22,6 +22,21 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)  }  #endif +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) +static inline void btrfs_drop_nlink(struct inode *inode) +{ +	inode->i_nlink--; +} + +static inline void btrfs_inc_nlink(struct inode *inode) +{ +	inode->i_nlink++; +} +#else +# define btrfs_drop_nlink(inode) drop_nlink(inode) +# define btrfs_inc_nlink(inode)	inc_nlink(inode) +#endif +  /*   * Even if AppArmor isn't enabled, it still has different prototypes.   
* Add more distro/version pairs here to declare which has AppArmor applied. diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7114faafa9d..579124043d9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -60,7 +60,7 @@ void btrfs_free_path(struct btrfs_path *p)  	kmem_cache_free(btrfs_path_cachep, p);  } -void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)  {  	int i; @@ -176,7 +176,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	return 0;  } -int __btrfs_cow_block(struct btrfs_trans_handle *trans, +int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root,  			     struct extent_buffer *buf,  			     struct extent_buffer *parent, int parent_slot, @@ -294,7 +294,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	return 0;  } -int btrfs_cow_block(struct btrfs_trans_handle *trans, +int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,  		    struct btrfs_root *root, struct extent_buffer *buf,  		    struct extent_buffer *parent, int parent_slot,  		    struct extent_buffer **cow_ret, u64 prealloc_dest) @@ -677,9 +677,10 @@ static int noinline check_block(struct btrfs_root *root,   *   * slot may point to max if the key is bigger than all of the keys   */ -static int generic_bin_search(struct extent_buffer *eb, unsigned long p, -			      int item_size, struct btrfs_key *key, -			      int max, int *slot) +static noinline int generic_bin_search(struct extent_buffer *eb, +				       unsigned long p, +				       int item_size, struct btrfs_key *key, +				       int max, int *slot)  {  	int low = 0;  	int high = max; @@ -765,7 +766,7 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,  	return -1;  } -static struct extent_buffer *read_node_slot(struct btrfs_root *root, +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,  				   struct 
extent_buffer *parent, int slot)  {  	int level = btrfs_header_level(parent); @@ -781,7 +782,7 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root,  		       btrfs_node_ptr_generation(parent, slot));  } -static int balance_level(struct btrfs_trans_handle *trans, +static noinline int balance_level(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 struct btrfs_path *path, int level)  { @@ -1128,8 +1129,9 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,  /*   * readahead one full node of leaves   */ -static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, -			     int level, int slot, u64 objectid) +static noinline void reada_for_search(struct btrfs_root *root, +				      struct btrfs_path *path, +				      int level, int slot, u64 objectid)  {  	struct extent_buffer *node;  	struct btrfs_disk_key disk_key; @@ -1201,7 +1203,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,  	}  } -static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock) +static noinline void unlock_up(struct btrfs_path *path, int level, +			       int lowest_unlock)  {  	int i;  	int skip_level = level; @@ -1759,8 +1762,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root   *   * returns 0 on success and < 0 on failure   */ -static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root -		      *root, struct btrfs_path *path, int level) +static noinline int split_node(struct btrfs_trans_handle *trans, +			       struct btrfs_root *root, +			       struct btrfs_path *path, int level)  {  	u64 root_gen;  	struct extent_buffer *c; @@ -1874,7 +1878,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)   * the start of the leaf data.  
IOW, how much room   * the leaf has left for both items and data   */ -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf) +int noinline btrfs_leaf_free_space(struct btrfs_root *root, +				   struct extent_buffer *leaf)  {  	int nritems = btrfs_header_nritems(leaf);  	int ret; @@ -2283,9 +2288,11 @@ out:   *   * returns 0 if all went well and < 0 on failure.   */ -static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root -		      *root, struct btrfs_key *ins_key, -		      struct btrfs_path *path, int data_size, int extend) +static noinline int split_leaf(struct btrfs_trans_handle *trans, +			       struct btrfs_root *root, +			       struct btrfs_key *ins_key, +			       struct btrfs_path *path, int data_size, +			       int extend)  {  	u64 root_gen;  	struct extent_buffer *l; @@ -3079,6 +3086,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)   * was nothing in the tree that matched the search criteria.   */  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, +			 struct btrfs_key *max_key,  			 struct btrfs_path *path, int cache_only,  			 u64 min_trans)  { @@ -3093,6 +3101,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,  again:  	cur = btrfs_lock_root_node(root);  	level = btrfs_header_level(cur); +	WARN_ON(path->nodes[level]);  	path->nodes[level] = cur;  	path->locks[level] = 1; @@ -3107,6 +3116,8 @@ again:  		/* at level = 0, we're done, setup the path and exit */  		if (level == 0) { +			if (slot >= nritems) +				goto find_next_key;  			ret = 0;  			path->slots[level] = slot;  			btrfs_item_key_to_cpu(cur, &found_key, slot); @@ -3123,6 +3134,8 @@ again:  			u64 blockptr;  			u64 gen;  			struct extent_buffer *tmp; +			struct btrfs_disk_key disk_key; +  			blockptr = btrfs_node_blockptr(cur, slot);  			gen = btrfs_node_ptr_generation(cur, slot);  			if (gen < min_trans) { @@ -3132,6 +3145,14 @@ again:  			if (!cache_only)  				break; 
+			if (max_key) { +				btrfs_node_key(cur, &disk_key, slot); +				if (comp_keys(&disk_key, max_key) >= 0) { +					ret = 1; +					goto out; +				} +			} +  			tmp = btrfs_find_tree_block(root, blockptr,  					    btrfs_level_size(root, level - 1)); @@ -3143,14 +3164,16 @@ again:  				free_extent_buffer(tmp);  			slot++;  		} +find_next_key:  		/*  		 * we didn't find a candidate key in this node, walk forward  		 * and find another one  		 */  		if (slot >= nritems) { -			ret = btrfs_find_next_key(root, path, min_key, level, +			path->slots[level] = slot; +			sret = btrfs_find_next_key(root, path, min_key, level,  						  cache_only, min_trans); -			if (ret == 0) { +			if (sret == 0) {  				btrfs_release_path(root, path);  				goto again;  			} else { @@ -3351,6 +3374,7 @@ int btrfs_previous_item(struct btrfs_root *root,  {  	struct btrfs_key found_key;  	struct extent_buffer *leaf; +	u32 nritems;  	int ret;  	while(1) { @@ -3362,9 +3386,20 @@ int btrfs_previous_item(struct btrfs_root *root,  			path->slots[0]--;  		}  		leaf = path->nodes[0]; +		nritems = btrfs_header_nritems(leaf); +		if (nritems == 0) +			return 1; +		if (path->slots[0] == nritems) +			path->slots[0]--; +  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);  		if (found_key.type == type)  			return 0; +		if (found_key.objectid < min_objectid) +			break; +		if (found_key.objectid == min_objectid && +		    found_key.type < type) +			break;  	}  	return 1;  } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b305ae7e10b..6532b60683e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -77,6 +77,10 @@ struct btrfs_ordered_sum;  /* orhpan objectid for tracking unlinked/truncated files */  #define BTRFS_ORPHAN_OBJECTID -5ULL +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL +  /*   * All files have objectids higher than this.   
*/ @@ -276,6 +280,7 @@ struct btrfs_super_block {  	__le64 generation;  	__le64 root;  	__le64 chunk_root; +	__le64 log_root;  	__le64 total_bytes;  	__le64 bytes_used;  	__le64 root_dir_objectid; @@ -287,6 +292,7 @@ struct btrfs_super_block {  	__le32 sys_chunk_array_size;  	u8 root_level;  	u8 chunk_root_level; +	u8 log_root_level;  	struct btrfs_dev_item dev_item;  	char label[BTRFS_LABEL_SIZE];  	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; @@ -392,7 +398,10 @@ struct btrfs_timespec {   * make a new item type   */  struct btrfs_inode_item { +	/* nfs style generation number */  	__le64 generation; +	/* transid that last touched this inode */ +	__le64 transid;  	__le64 size;  	__le64 nblocks;  	__le64 block_group; @@ -409,8 +418,13 @@ struct btrfs_inode_item {  	struct btrfs_timespec otime;  } __attribute__ ((__packed__)); +struct btrfs_dir_log_item { +	__le64 end; +} __attribute__ ((__packed__)); +  struct btrfs_dir_item {  	struct btrfs_disk_key location; +	__le64 transid;  	__le16 data_len;  	__le16 name_len;  	u8 type; @@ -505,6 +519,9 @@ struct btrfs_fs_info {  	struct btrfs_root *tree_root;  	struct btrfs_root *chunk_root;  	struct btrfs_root *dev_root; + +	/* the log root tree is a directory of all the other log roots */ +	struct btrfs_root *log_root_tree;  	struct radix_tree_root fs_roots_radix;  	struct extent_io_tree free_space_cache; @@ -518,6 +535,7 @@ struct btrfs_fs_info {  	u64 generation;  	u64 last_trans_committed; +	u64 last_trans_new_blockgroup;  	u64 open_ioctl_trans;  	unsigned long mount_opt;  	u64 max_extent; @@ -527,6 +545,9 @@ struct btrfs_fs_info {  	wait_queue_head_t transaction_throttle;  	wait_queue_head_t transaction_wait;  	wait_queue_head_t async_submit_wait; + +	wait_queue_head_t tree_log_wait; +  	struct btrfs_super_block super_copy;  	struct btrfs_super_block super_for_commit;  	struct block_device *__bdev; @@ -535,6 +556,7 @@ struct btrfs_fs_info {  	struct backing_dev_info bdi;  	spinlock_t hash_lock;  	struct mutex 
trans_mutex; +	struct mutex tree_log_mutex;  	struct mutex transaction_kthread_mutex;  	struct mutex cleaner_mutex;  	struct mutex alloc_mutex; @@ -544,8 +566,13 @@ struct btrfs_fs_info {  	struct list_head trans_list;  	struct list_head hashers;  	struct list_head dead_roots; +  	atomic_t nr_async_submits;  	atomic_t nr_async_bios; +	atomic_t tree_log_writers; +	atomic_t tree_log_commit; +	unsigned long tree_log_batch; +	u64 tree_log_transid;  	/*  	 * this is used by the balancing code to wait for all the pending @@ -583,6 +610,7 @@ struct btrfs_fs_info {  	struct completion kobj_unregister;  	int do_barriers;  	int closing; +	int log_root_recovering;  	atomic_t throttles;  	atomic_t throttle_gen; @@ -596,6 +624,7 @@ struct btrfs_fs_info {  	u64 delalloc_bytes;  	u64 last_alloc;  	u64 last_data_alloc; +	u64 last_log_alloc;  	spinlock_t ref_cache_lock;  	u64 total_ref_cache_size; @@ -632,6 +661,7 @@ struct btrfs_root {  	struct btrfs_leaf_ref_tree *ref_tree;  	struct btrfs_leaf_ref_tree ref_tree_struct;  	struct btrfs_dirty_root *dirty_root; +	struct btrfs_root *log_root;  	struct btrfs_root_item root_item;  	struct btrfs_key root_key; @@ -640,6 +670,7 @@ struct btrfs_root {  	struct kobject root_kobj;  	struct completion kobj_unregister;  	struct mutex objectid_mutex; +	struct mutex log_mutex;  	u64 objectid;  	u64 last_trans; @@ -692,6 +723,8 @@ struct btrfs_root {   * dir items are the name -> inode pointers in a directory.  There is one   * for every name in a directory.   */ +#define BTRFS_DIR_LOG_ITEM_KEY  14 +#define BTRFS_DIR_LOG_INDEX_KEY 15  #define BTRFS_DIR_ITEM_KEY	16  #define BTRFS_DIR_INDEX_KEY	17  /* @@ -703,7 +736,8 @@ struct btrfs_root {   */  #define BTRFS_CSUM_ITEM_KEY	19 -/* reserve 20-31 for other file stuff */ + +/* reserve 21-31 for other file/dir stuff */  /*   * root items point to tree roots.  
There are typically in the root @@ -938,6 +972,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);  /* struct btrfs_inode_item */  BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);  BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);  BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);  BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); @@ -1126,10 +1161,13 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb,  	write_eb_member(eb, item, struct btrfs_item, key, disk_key);  } +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); +  /* struct btrfs_dir_item */  BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);  BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);  BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);  static inline void btrfs_dir_item_key(struct extent_buffer *eb,  				      struct btrfs_dir_item *item, @@ -1301,7 +1339,11 @@ BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,  BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,  			 chunk_root, 64);  BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, -			 chunk_root_level, 64); +			 chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, +			 log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, +			 log_root_level, 8);  BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,  			 total_bytes, 64);  BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, @@ -1405,6 +1447,12 @@ static inline struct dentry *fdentry(struct file *file) {  }  /* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, struct 
btrfs_path *path, +			u64 start, u64 len); +int btrfs_update_pinned_extents(struct btrfs_root *root, +				u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +			struct btrfs_root *root, struct extent_buffer *leaf);  int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct btrfs_key *key, u64 bytenr); @@ -1448,6 +1496,11 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,  				u64 root_objectid, u64 ref_generation,  				u64 owner, u64 owner_offset,  				struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, +				u64 root_objectid, u64 ref_generation, +				u64 owner, u64 owner_offset, +				struct btrfs_key *ins);  int btrfs_reserve_extent(struct btrfs_trans_handle *trans,  				  struct btrfs_root *root,  				  u64 num_bytes, u64 min_alloc_size, @@ -1488,9 +1541,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,  			struct btrfs_key *key, int lowest_level,  			int cache_only, u64 min_trans);  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, +			 struct btrfs_key *max_key,  			 struct btrfs_path *path, int cache_only,  			 u64 min_trans); -  int btrfs_cow_block(struct btrfs_trans_handle *trans,  		    struct btrfs_root *root, struct extent_buffer *buf,  		    struct extent_buffer *parent, int parent_slot, @@ -1656,6 +1709,18 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,  #define PageChecked PageFsMisc  #endif +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, +		       struct btrfs_root *root, +		       struct inode *dir, struct inode *inode, +		       const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, +		   struct inode *parent_inode, struct inode *inode, +		   const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, +			
       struct btrfs_root *root, +			       struct inode *inode, u64 new_size, +			       u32 min_type); +  int btrfs_start_delalloc_inodes(struct btrfs_root *root);  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);  int btrfs_writepages(struct address_space *mapping, @@ -1715,6 +1780,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,  long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);  /* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);  int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);  int btrfs_check_file(struct btrfs_root *root, struct inode *inode);  extern struct file_operations btrfs_file_operations; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 125094617fe..e4f30090d64 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -96,6 +96,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,  	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);  	btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);  	btrfs_set_dir_name_len(leaf, dir_item, name_len); +	btrfs_set_dir_transid(leaf, dir_item, trans->transid);  	btrfs_set_dir_data_len(leaf, dir_item, data_len);  	name_ptr = (unsigned long)(dir_item + 1);  	data_ptr = (unsigned long)((char *)name_ptr + name_len); @@ -142,6 +143,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root  	btrfs_set_dir_type(leaf, dir_item, type);  	btrfs_set_dir_data_len(leaf, dir_item, 0);  	btrfs_set_dir_name_len(leaf, dir_item, name_len); +	btrfs_set_dir_transid(leaf, dir_item, trans->transid);  	name_ptr = (unsigned long)(dir_item + 1);  	write_extent_buffer(leaf, name, name_ptr, name_len); @@ -169,6 +171,7 @@ second_insert:  	btrfs_set_dir_type(leaf, dir_item, type);  	btrfs_set_dir_data_len(leaf, dir_item, 0);  	btrfs_set_dir_name_len(leaf, dir_item, name_len); +	btrfs_set_dir_transid(leaf, dir_item, trans->transid);  	name_ptr = (unsigned long)(dir_item + 1);  
	write_extent_buffer(leaf, name, name_ptr, name_len);  	btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e7a938bfbc..a4373db5967 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@  #include "async-thread.h"  #include "locking.h"  #include "ref-cache.h" +#include "tree-log.h"  #if 0  static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -694,6 +695,18 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,  } +int btrfs_write_tree_block(struct extent_buffer *buf) +{ +	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, +				      buf->start + buf->len - 1, WB_SYNC_NONE); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ +	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, +				  buf->start, buf->start + buf->len -1); +} +  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,  				      u32 blocksize, u64 parent_transid)  { @@ -732,15 +745,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	return 0;  } -int wait_on_tree_block_writeback(struct btrfs_root *root, -				 struct extent_buffer *buf) -{ -	struct inode *btree_inode = root->fs_info->btree_inode; -	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree, -					buf); -	return 0; -} -  static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  			u32 stripesize, struct btrfs_root *root,  			struct btrfs_fs_info *fs_info, @@ -771,6 +775,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	spin_lock_init(&root->node_lock);  	spin_lock_init(&root->list_lock);  	mutex_init(&root->objectid_mutex); +	mutex_init(&root->log_mutex);  	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);  	root->ref_tree = &root->ref_tree_struct; @@ -809,11 +814,74 @@ static int find_and_setup_root(struct btrfs_root *tree_root,  	return 0;  } -struct btrfs_root 
*btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, -					       struct btrfs_key *location) +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, +			     struct btrfs_fs_info *fs_info) +{ +	struct extent_buffer *eb; +	int ret; + +	if (!fs_info->log_root_tree) +		return 0; + +	eb = fs_info->log_root_tree->node; + +	WARN_ON(btrfs_header_level(eb) != 0); +	WARN_ON(btrfs_header_nritems(eb) != 0); + +	ret = btrfs_free_extent(trans, fs_info->tree_root, +				eb->start, eb->len, +				BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1); +	BUG_ON(ret); + +	free_extent_buffer(eb); +	kfree(fs_info->log_root_tree); +	fs_info->log_root_tree = NULL; +	return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, +			     struct btrfs_fs_info *fs_info)  {  	struct btrfs_root *root;  	struct btrfs_root *tree_root = fs_info->tree_root; + +	root = kzalloc(sizeof(*root), GFP_NOFS); +	if (!root) +		return -ENOMEM; + +	__setup_root(tree_root->nodesize, tree_root->leafsize, +		     tree_root->sectorsize, tree_root->stripesize, +		     root, fs_info, BTRFS_TREE_LOG_OBJECTID); + +	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; +	root->root_key.type = BTRFS_ROOT_ITEM_KEY; +	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; +	root->ref_cows = 0; + +	root->node = btrfs_alloc_free_block(trans, root, root->leafsize, +					    BTRFS_TREE_LOG_OBJECTID, +					    0, 0, 0, 0, 0); + +	btrfs_set_header_nritems(root->node, 0); +	btrfs_set_header_level(root->node, 0); +	btrfs_set_header_bytenr(root->node, root->node->start); +	btrfs_set_header_generation(root->node, trans->transid); +	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + +	write_extent_buffer(root->node, root->fs_info->fsid, +			    (unsigned long)btrfs_header_fsid(root->node), +			    BTRFS_FSID_SIZE); +	btrfs_mark_buffer_dirty(root->node); +	btrfs_tree_unlock(root->node); +	fs_info->log_root_tree = root; +	return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, +		
			       struct btrfs_key *location) +{ +	struct btrfs_root *root; +	struct btrfs_fs_info *fs_info = tree_root->fs_info;  	struct btrfs_path *path;  	struct extent_buffer *l;  	u64 highest_inode; @@ -863,11 +931,13 @@ out:  				     blocksize, 0);  	BUG_ON(!root->node);  insert: -	root->ref_cows = 1; -	ret = btrfs_find_highest_inode(root, &highest_inode); -	if (ret == 0) { -		root->highest_inode = highest_inode; -		root->last_inode_alloc = highest_inode; +	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { +		root->ref_cows = 1; +		ret = btrfs_find_highest_inode(root, &highest_inode); +		if (ret == 0) { +			root->highest_inode = highest_inode; +			root->last_inode_alloc = highest_inode; +		}  	}  	return root;  } @@ -907,7 +977,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,  	if (root)  		return root; -	root = btrfs_read_fs_root_no_radix(fs_info, location); +	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);  	if (IS_ERR(root))  		return root;  	ret = radix_tree_insert(&fs_info->fs_roots_radix, @@ -1250,16 +1320,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	u32 blocksize;  	u32 stripesize;  	struct buffer_head *bh; -	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), +	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),  						 GFP_NOFS); -	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), +	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),  					       GFP_NOFS);  	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),  						GFP_NOFS); -	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), +	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),  						GFP_NOFS); -	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root), +	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),  					      GFP_NOFS); +	struct btrfs_root *log_tree_root; +  	int ret;  	int err = -EINVAL; @@ -1343,6 
+1415,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);  	mutex_init(&fs_info->trans_mutex); +	mutex_init(&fs_info->tree_log_mutex);  	mutex_init(&fs_info->drop_mutex);  	mutex_init(&fs_info->alloc_mutex);  	mutex_init(&fs_info->chunk_mutex); @@ -1352,6 +1425,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	init_waitqueue_head(&fs_info->transaction_throttle);  	init_waitqueue_head(&fs_info->transaction_wait);  	init_waitqueue_head(&fs_info->async_submit_wait); +	init_waitqueue_head(&fs_info->tree_log_wait); +	atomic_set(&fs_info->tree_log_commit, 0); +	atomic_set(&fs_info->tree_log_writers, 0); +	fs_info->tree_log_transid = 0;  #if 0  	ret = add_hasher(fs_info, "crc32c"); @@ -1532,7 +1609,26 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	if (!fs_info->transaction_kthread)  		goto fail_cleaner; +	if (btrfs_super_log_root(disk_super) != 0) { +		u32 blocksize; +		u64 bytenr = btrfs_super_log_root(disk_super); + +		blocksize = +		     btrfs_level_size(tree_root, +				      btrfs_super_log_root_level(disk_super)); +		log_tree_root = kzalloc(sizeof(struct btrfs_root), +						      GFP_NOFS); + +		__setup_root(nodesize, leafsize, sectorsize, stripesize, +			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + +		log_tree_root->node = read_tree_block(tree_root, bytenr, +						      blocksize, 0); +		ret = btrfs_recover_log_trees(log_tree_root); +		BUG_ON(ret); +	} +	fs_info->last_trans_committed = btrfs_super_generation(disk_super);  	return tree_root;  fail_cleaner: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2562a273ae1..6b6fdc697f3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,7 +45,7 @@ struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,  struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,  				      struct btrfs_key *location,  				      const char *name, int namelen); -struct btrfs_root 
*btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,  					       struct btrfs_key *location);  struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,  					      struct btrfs_key *location); @@ -74,4 +74,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,  			extent_submit_bio_hook_t *submit_bio_hook);  int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);  unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, +			     struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, +			     struct btrfs_fs_info *fs_info);  #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e63b3b4bed7..646b9148ca2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -496,6 +496,23 @@ static int match_extent_ref(struct extent_buffer *leaf,  	return ret == 0;  } +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path, +			u64 start, u64 len) +{ +	int ret; +	struct btrfs_key key; + +	maybe_lock_mutex(root); +	key.objectid = start; +	key.offset = len; +	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); +	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, +				0, 0); +	maybe_unlock_mutex(root); +	return ret; +} +  static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,  					  struct btrfs_root *root,  					  struct btrfs_path *path, u64 bytenr, @@ -1409,7 +1426,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)  } -static int update_pinned_extents(struct btrfs_root *root, +int btrfs_update_pinned_extents(struct btrfs_root 
*root,  				u64 bytenr, u64 num, int pin)  {  	u64 len; @@ -1492,7 +1509,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  					    EXTENT_DIRTY);  		if (ret)  			break; -		update_pinned_extents(root, start, end + 1 - start, 0); +		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);  		clear_extent_dirty(unpin, start, end, GFP_NOFS);  		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);  		if (need_resched()) { @@ -1538,14 +1555,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,  		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,  				  GFP_NOFS); -		eb = btrfs_find_tree_block(extent_root, ins.objectid, +		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,  					   ins.offset); -		if (!btrfs_buffer_uptodate(eb, trans->transid)) { -			mutex_unlock(&extent_root->fs_info->alloc_mutex); +		if (!btrfs_buffer_uptodate(eb, trans->transid))  			btrfs_read_buffer(eb, trans->transid); -			mutex_lock(&extent_root->fs_info->alloc_mutex); -		}  		btrfs_tree_lock(eb);  		level = btrfs_header_level(eb); @@ -1585,13 +1599,20 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,  		struct extent_buffer *buf;  		buf = btrfs_find_tree_block(root, bytenr, num_bytes);  		if (buf) { +			/* we can reuse a block if it hasn't been written +			 * and it is from this transaction.  We can't +			 * reuse anything from the tree log root because +			 * it has tiny sub-transactions. 
+			 */  			if (btrfs_buffer_uptodate(buf, 0) &&  			    btrfs_try_tree_lock(buf)) {  				u64 transid =  				    root->fs_info->running_transaction->transid;  				u64 header_transid =  					btrfs_header_generation(buf); -				if (header_transid == transid && +				if (btrfs_header_owner(buf) != +				    BTRFS_TREE_LOG_OBJECTID && +				    header_transid == transid &&  				    !btrfs_header_flag(buf,  					       BTRFS_HEADER_FLAG_WRITTEN)) {  					clean_tree_block(NULL, root, buf); @@ -1603,7 +1624,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,  			}  			free_extent_buffer(buf);  		} -		update_pinned_extents(root, bytenr, num_bytes, 1); +		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);  	} else {  		set_extent_bits(&root->fs_info->pending_del,  				bytenr, bytenr + num_bytes - 1, @@ -1801,7 +1822,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct  				  GFP_NOFS);  		if (!test_range_bit(&extent_root->fs_info->extent_ins,  				    start, end, EXTENT_LOCKED, 0)) { -			update_pinned_extents(extent_root, start, +			btrfs_update_pinned_extents(extent_root, start,  					      end + 1 - start, 1);  			ret = __free_extent(trans, extent_root,  					     start, end + 1 - start, @@ -1919,6 +1940,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,  	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {  		last_ptr = &root->fs_info->last_data_alloc;  	} +	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { +		last_ptr = &root->fs_info->last_log_alloc; +		if (!last_ptr == 0 && root->fs_info->last_alloc) { +			*last_ptr = root->fs_info->last_alloc + empty_cluster; +		} +	}  	if (last_ptr) {  		if (*last_ptr) @@ -2268,6 +2295,35 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,  	maybe_unlock_mutex(root);  	return ret;  } + +/* + * this is used by the tree logging recovery code.  
It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, +				u64 root_objectid, u64 ref_generation, +				u64 owner, u64 owner_offset, +				struct btrfs_key *ins) +{ +	int ret; +	struct btrfs_block_group_cache *block_group; + +	maybe_lock_mutex(root); +	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); +	cache_block_group(root, block_group); + +	clear_extent_dirty(&root->fs_info->free_space_cache, +			   ins->objectid, ins->objectid + ins->offset - 1, +			   GFP_NOFS); +	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid, +					    ref_generation, owner, +					    owner_offset, ins); +	maybe_unlock_mutex(root); +	return ret; +} +  /*   * finds a free extent and does all the dirty work required for allocation   * returns the key for the extent through ins, and a tree buffer for @@ -2350,9 +2406,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	return buf;  } -static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans, -					   struct btrfs_root *root, -					   struct extent_buffer *leaf) +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +			struct btrfs_root *root, struct extent_buffer *leaf)  {  	u64 leaf_owner;  	u64 leaf_generation; @@ -2402,9 +2457,9 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,  	return 0;  } -static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, -					 struct btrfs_root *root, -					 struct btrfs_leaf_ref *ref) +static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans, +					struct btrfs_root *root, +					struct btrfs_leaf_ref *ref)  {  	int i;  	int ret; @@ -2512,7 +2567,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,  		    btrfs_header_nritems(cur))  			break;  		if (*level == 0) { -			ret = 
drop_leaf_ref_no_cache(trans, root, cur); +			ret = btrfs_drop_leaf_ref(trans, root, cur);  			BUG_ON(ret);  			break;  		} @@ -2552,7 +2607,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,  			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);  			ref = btrfs_lookup_leaf_ref(root, bytenr);  			if (ref) { -				ret = drop_leaf_ref(trans, root, ref); +				ret = cache_drop_leaf_ref(trans, root, ref);  				BUG_ON(ret);  				btrfs_remove_leaf_ref(root, ref);  				btrfs_free_leaf_ref(root, ref); @@ -3628,6 +3683,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	extent_root = root->fs_info->extent_root;  	block_group_cache = &root->fs_info->block_group_cache; +	root->fs_info->last_trans_new_blockgroup = trans->transid; +  	cache = kzalloc(sizeof(*cache), GFP_NOFS);  	BUG_ON(!cache);  	cache->key.objectid = chunk_offset; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e9e86fbaa24..84ecf3ab851 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,8 @@  #include "btrfs_inode.h"  #include "ioctl.h"  #include "print-tree.h" +#include "tree-log.h" +#include "locking.h"  #include "compat.h" @@ -988,10 +990,27 @@ out_nolock:  	*ppos = pos;  	if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { -		err = sync_page_range(inode, inode->i_mapping, -				      start_pos, num_written); +		struct btrfs_trans_handle *trans; + +		err = btrfs_fdatawrite_range(inode->i_mapping, start_pos, +					     start_pos + num_written -1, +					     WB_SYNC_NONE); +		if (err < 0) +			num_written = err; + +		err = btrfs_wait_on_page_writeback_range(inode->i_mapping, +				 start_pos, start_pos + num_written - 1);  		if (err < 0)  			num_written = err; + +		trans = btrfs_start_transaction(root, 1); +		ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); +		if (ret == 0) { +			btrfs_sync_log(trans, root); +			btrfs_end_transaction(trans, root); +		} else { +			btrfs_commit_transaction(trans, root); +		}  	} else if 
(num_written > 0 && (file->f_flags & O_DIRECT)) {  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)  		do_sync_file_range(file, start_pos, @@ -1019,8 +1038,7 @@ int btrfs_release_file(struct inode * inode, struct file * filp)  	return 0;  } -static int btrfs_sync_file(struct file *file, -			   struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)  {  	struct inode *inode = dentry->d_inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1043,6 +1061,8 @@ static int btrfs_sync_file(struct file *file,  	}  	mutex_unlock(&root->fs_info->trans_mutex); +	filemap_fdatawait(inode->i_mapping); +  	/*  	 * ok we haven't committed the transaction yet, lets do a commit  	 */ @@ -1054,7 +1074,16 @@ static int btrfs_sync_file(struct file *file,  		ret = -ENOMEM;  		goto out;  	} -	ret = btrfs_commit_transaction(trans, root); + +	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); +	if (ret < 0) +		goto out; +	if (ret > 0) { +		ret = btrfs_commit_transaction(trans, root); +	} else { +		btrfs_sync_log(trans, root); +		ret = btrfs_end_transaction(trans, root); +	}  out:  	return ret > 0 ? 
EIO : ret;  } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43d3f2649ca..65df9d83023 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -46,6 +46,8 @@  #include "volumes.h"  #include "ordered-data.h"  #include "xattr.h" +#include "compat.h" +#include "tree-log.h"  struct btrfs_iget_args {  	u64 ino; @@ -586,6 +588,7 @@ nocow:  			  &ordered_extent->list);  	btrfs_ordered_update_i_size(inode, ordered_extent); +	btrfs_update_inode(trans, root, inode);  	btrfs_remove_ordered_extent(inode, ordered_extent);  	/* once for us */ @@ -593,7 +596,6 @@ nocow:  	/* once for the tree */  	btrfs_put_ordered_extent(ordered_extent); -	btrfs_update_inode(trans, root, inode);  	btrfs_end_transaction(trans, root);  	return 0;  } @@ -1007,7 +1009,8 @@ void btrfs_read_locked_inode(struct inode *inode)  	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);  	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item); -	inode->i_generation = btrfs_inode_generation(leaf, inode_item); +	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); +	inode->i_generation = BTRFS_I(inode)->generation;  	inode->i_rdev = 0;  	rdev = btrfs_inode_rdev(leaf, inode_item); @@ -1056,7 +1059,8 @@ make_bad:  	make_bad_inode(inode);  } -static void fill_inode_item(struct extent_buffer *leaf, +static void fill_inode_item(struct btrfs_trans_handle *trans, +			    struct extent_buffer *leaf,  			    struct btrfs_inode_item *item,  			    struct inode *inode)  { @@ -1082,7 +1086,8 @@ static void fill_inode_item(struct extent_buffer *leaf,  				inode->i_ctime.tv_nsec);  	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks); -	btrfs_set_inode_generation(leaf, item, inode->i_generation); +	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); +	btrfs_set_inode_transid(leaf, item, trans->transid);  	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);  	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);  	btrfs_set_inode_block_group(leaf, item, @@ -1112,7 +1117,7 @@ 
int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,  	inode_item = btrfs_item_ptr(leaf, path->slots[0],  				  struct btrfs_inode_item); -	fill_inode_item(leaf, inode_item, inode); +	fill_inode_item(trans, leaf, inode_item, inode);  	btrfs_mark_buffer_dirty(leaf);  	btrfs_set_inode_last_trans(trans, inode);  	ret = 0; @@ -1122,14 +1127,12 @@ failed:  } -static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root, -			      struct inode *dir, -			      struct dentry *dentry) +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, +		       struct btrfs_root *root, +		       struct inode *dir, struct inode *inode, +		       const char *name, int name_len)  {  	struct btrfs_path *path; -	const char *name = dentry->d_name.name; -	int name_len = dentry->d_name.len;  	int ret = 0;  	struct extent_buffer *leaf;  	struct btrfs_dir_item *di; @@ -1160,13 +1163,12 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,  	btrfs_release_path(root, path);  	ret = btrfs_del_inode_ref(trans, root, name, name_len, -				  dentry->d_inode->i_ino, -				  dentry->d_parent->d_inode->i_ino, &index); +				  inode->i_ino, +				  dir->i_ino, &index);  	if (ret) {  		printk("failed to delete reference to %.*s, "  		       "inode %lu parent %lu\n", name_len, name, -		       dentry->d_inode->i_ino, -		       dentry->d_parent->d_inode->i_ino); +		       inode->i_ino, dir->i_ino);  		goto err;  	} @@ -1183,21 +1185,25 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,  	ret = btrfs_delete_one_dir_name(trans, root, path, di);  	btrfs_release_path(root, path); -	dentry->d_inode->i_ctime = dir->i_ctime; +	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, +					 inode, dir->i_ino); +	BUG_ON(ret); + +	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, +					   dir, index); +	BUG_ON(ret);  err:  	btrfs_free_path(path); -	if (!ret) { -		btrfs_i_size_write(dir, dir->i_size - name_len * 2); -		
dir->i_mtime = dir->i_ctime = CURRENT_TIME; -		btrfs_update_inode(trans, root, dir); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -		dentry->d_inode->i_nlink--; -#else -		drop_nlink(dentry->d_inode); -#endif -		ret = btrfs_update_inode(trans, root, dentry->d_inode); -		dir->i_sb->s_dirt = 1; -	} +	if (ret) +		goto out; + +	btrfs_i_size_write(dir, dir->i_size - name_len * 2); +	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; +	btrfs_update_inode(trans, root, dir); +	btrfs_drop_nlink(inode); +	ret = btrfs_update_inode(trans, root, inode); +	dir->i_sb->s_dirt = 1; +out:  	return ret;  } @@ -1218,7 +1224,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)  	trans = btrfs_start_transaction(root, 1);  	btrfs_set_trans_block_group(trans, dir); -	ret = btrfs_unlink_trans(trans, root, dir, dentry); +	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, +				 dentry->d_name.name, dentry->d_name.len);  	if (inode->i_nlink == 0)  		ret = btrfs_orphan_add(trans, inode); @@ -1256,7 +1263,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  		goto fail_trans;  	/* now the directory is empty */ -	err = btrfs_unlink_trans(trans, root, dir, dentry); +	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, +				 dentry->d_name.name, dentry->d_name.len);  	if (!err) {  		btrfs_i_size_write(inode, 0);  	} @@ -1283,10 +1291,10 @@ fail:   * min_type is the minimum key type to truncate down to.  If set to 0, this   * will kill all the items on this inode, including the INODE_ITEM_KEY.   
*/ -static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, -				   struct btrfs_root *root, -				   struct inode *inode, -				   u32 min_type) +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, +					struct btrfs_root *root, +					struct inode *inode, +					u64 new_size, u32 min_type)  {  	int ret;  	struct btrfs_path *path; @@ -1307,7 +1315,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,  	int extent_type = -1;  	u64 mask = root->sectorsize - 1; -	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1); +	if (root->ref_cows) +		btrfs_drop_extent_cache(inode, +					new_size & (~mask), (u64)-1);  	path = btrfs_alloc_path();  	path->reada = -1;  	BUG_ON(!path); @@ -1324,7 +1334,13 @@ search_again:  		goto error;  	}  	if (ret > 0) { -		BUG_ON(path->slots[0] == 0); +		/* there are no items in the tree for us to truncate, we're +		 * done +		 */ +		if (path->slots[0] == 0) { +			ret = 0; +			goto error; +		}  		path->slots[0]--;  	} @@ -1358,10 +1374,10 @@ search_again:  		}  		if (found_type == BTRFS_CSUM_ITEM_KEY) {  			ret = btrfs_csum_truncate(trans, root, path, -						  inode->i_size); +						  new_size);  			BUG_ON(ret);  		} -		if (item_end < inode->i_size) { +		if (item_end < new_size) {  			if (found_type == BTRFS_DIR_ITEM_KEY) {  				found_type = BTRFS_INODE_ITEM_KEY;  			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) { @@ -1378,7 +1394,7 @@ search_again:  			btrfs_set_key_type(&key, found_type);  			goto next;  		} -		if (found_key.offset >= inode->i_size) +		if (found_key.offset >= new_size)  			del_item = 1;  		else  			del_item = 0; @@ -1394,7 +1410,7 @@ search_again:  			if (!del_item) {  				u64 orig_num_bytes =  					btrfs_file_extent_num_bytes(leaf, fi); -				extent_num_bytes = inode->i_size - +				extent_num_bytes = new_size -  					found_key.offset + root->sectorsize - 1;  				extent_num_bytes = extent_num_bytes &  					~((u64)root->sectorsize - 1); @@ -1402,7 +1418,7 @@ 
search_again:  							 extent_num_bytes);  				num_dec = (orig_num_bytes -  					   extent_num_bytes); -				if (extent_start != 0) +				if (root->ref_cows && extent_start != 0)  					dec_i_blocks(inode, num_dec);  				btrfs_mark_buffer_dirty(leaf);  			} else { @@ -1413,22 +1429,29 @@ search_again:  				num_dec = btrfs_file_extent_num_bytes(leaf, fi);  				if (extent_start != 0) {  					found_extent = 1; -					dec_i_blocks(inode, num_dec); +					if (root->ref_cows) +						dec_i_blocks(inode, num_dec); +				} +				if (root->ref_cows) { +					root_gen = +						btrfs_header_generation(leaf);  				} -				root_gen = btrfs_header_generation(leaf);  				root_owner = btrfs_header_owner(leaf);  			}  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  			if (!del_item) { -				u32 newsize = inode->i_size - found_key.offset; -				dec_i_blocks(inode, item_end + 1 - -					    found_key.offset - newsize); -				newsize = -				    btrfs_file_extent_calc_inline_size(newsize); +				u32 size = new_size - found_key.offset; + +				if (root->ref_cows) { +					dec_i_blocks(inode, item_end + 1 - +						    found_key.offset - size); +				} +				size = +				    btrfs_file_extent_calc_inline_size(size);  				ret = btrfs_truncate_item(trans, root, path, -							  newsize, 1); +							  size, 1);  				BUG_ON(ret); -			} else { +			} else if (root->ref_cows) {  				dec_i_blocks(inode, item_end + 1 -  					     found_key.offset);  			} @@ -1666,7 +1689,7 @@ void btrfs_delete_inode(struct inode *inode)  	trans = btrfs_start_transaction(root, 1);  	btrfs_set_trans_block_group(trans, inode); -	ret = btrfs_truncate_in_trans(trans, root, inode, 0); +	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);  	if (ret) {  		btrfs_orphan_del(NULL, inode);  		goto no_delete_lock; @@ -1753,15 +1776,20 @@ static int fixup_tree_root_location(struct btrfs_root *root,  	return 0;  } -static int btrfs_init_locked_inode(struct inode *inode, void *p) +static noinline void init_btrfs_i(struct 
inode *inode)  { -	struct btrfs_iget_args *args = p; -	inode->i_ino = args->ino; -	BTRFS_I(inode)->root = args->root; -	BTRFS_I(inode)->delalloc_bytes = 0; -	inode->i_mapping->writeback_index = 0; -	BTRFS_I(inode)->disk_i_size = 0; -	BTRFS_I(inode)->index_cnt = (u64)-1; +	struct btrfs_inode *bi = BTRFS_I(inode); + +	bi->i_acl = NULL; +	bi->i_default_acl = NULL; + +	bi->generation = 0; +	bi->last_trans = 0; +	bi->logged_trans = 0; +	bi->delalloc_bytes = 0; +	bi->disk_i_size = 0; +	bi->flags = 0; +	bi->index_cnt = (u64)-1;  	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);  	extent_io_tree_init(&BTRFS_I(inode)->io_tree,  			     inode->i_mapping, GFP_NOFS); @@ -1771,6 +1799,15 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)  	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);  	mutex_init(&BTRFS_I(inode)->csum_mutex);  	mutex_init(&BTRFS_I(inode)->extent_mutex); +	mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ +	struct btrfs_iget_args *args = p; +	inode->i_ino = args->ino; +	init_btrfs_i(inode); +	BTRFS_I(inode)->root = args->root;  	return 0;  } @@ -2263,21 +2300,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	 * btrfs_get_inode_index_count has an explanation for the magic  	 * number  	 */ +	init_btrfs_i(inode);  	BTRFS_I(inode)->index_cnt = 2; - -	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); -	extent_io_tree_init(&BTRFS_I(inode)->io_tree, -			     inode->i_mapping, GFP_NOFS); -	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, -			     inode->i_mapping, GFP_NOFS); -	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); -	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); -	mutex_init(&BTRFS_I(inode)->csum_mutex); -	mutex_init(&BTRFS_I(inode)->extent_mutex); -	BTRFS_I(inode)->delalloc_bytes = 0; -	inode->i_mapping->writeback_index = 0; -	BTRFS_I(inode)->disk_i_size = 0;  	BTRFS_I(inode)->root = 
root; +	BTRFS_I(inode)->generation = trans->transid;  	if (mode & S_IFDIR)  		owner = 0; @@ -2290,7 +2316,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  		new_inode_group = group;  	}  	BTRFS_I(inode)->block_group = new_inode_group; -	BTRFS_I(inode)->flags = 0;  	key[0].objectid = objectid;  	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -2318,7 +2343,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],  				  struct btrfs_inode_item); -	fill_inode_item(path->nodes[0], inode_item, inode); +	fill_inode_item(trans, path->nodes[0], inode_item, inode);  	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,  			     struct btrfs_inode_ref); @@ -2349,38 +2374,34 @@ static inline u8 btrfs_inode_type(struct inode *inode)  	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];  } -static int btrfs_add_link(struct btrfs_trans_handle *trans, -			    struct dentry *dentry, struct inode *inode, -			    int add_backref, u64 index) +int btrfs_add_link(struct btrfs_trans_handle *trans, +		   struct inode *parent_inode, struct inode *inode, +		   const char *name, int name_len, int add_backref, u64 index)  {  	int ret;  	struct btrfs_key key; -	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root; -	struct inode *parent_inode = dentry->d_parent->d_inode; +	struct btrfs_root *root = BTRFS_I(parent_inode)->root;  	key.objectid = inode->i_ino;  	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);  	key.offset = 0; -	ret = btrfs_insert_dir_item(trans, root, -				    dentry->d_name.name, dentry->d_name.len, -				    dentry->d_parent->d_inode->i_ino, +	ret = btrfs_insert_dir_item(trans, root, name, name_len, +				    parent_inode->i_ino,  				    &key, btrfs_inode_type(inode),  				    index);  	if (ret == 0) {  		if (add_backref) {  			ret = btrfs_insert_inode_ref(trans, root, 
-					     dentry->d_name.name, -					     dentry->d_name.len, -					     inode->i_ino, -					     parent_inode->i_ino, -					     index); +						     name, name_len, +						     inode->i_ino, +						     parent_inode->i_ino, +						     index);  		}  		btrfs_i_size_write(parent_inode, parent_inode->i_size + -				   dentry->d_name.len * 2); +				   name_len * 2);  		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; -		ret = btrfs_update_inode(trans, root, -					 dentry->d_parent->d_inode); +		ret = btrfs_update_inode(trans, root, parent_inode);  	}  	return ret;  } @@ -2389,7 +2410,9 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,  			    struct dentry *dentry, struct inode *inode,  			    int backref, u64 index)  { -	int err = btrfs_add_link(trans, dentry, inode, backref, index); +	int err = btrfs_add_link(trans, dentry->d_parent->d_inode, +				 inode, dentry->d_name.name, +				 dentry->d_name.len, backref, index);  	if (!err) {  		d_instantiate(dentry, inode);  		return 0; @@ -2513,19 +2536,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,  		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;  		inode->i_fop = &btrfs_file_operations;  		inode->i_op = &btrfs_file_inode_operations; -		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); -		extent_io_tree_init(&BTRFS_I(inode)->io_tree, -				     inode->i_mapping, GFP_NOFS); -		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, -				     inode->i_mapping, GFP_NOFS); -		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); -		mutex_init(&BTRFS_I(inode)->csum_mutex); -		mutex_init(&BTRFS_I(inode)->extent_mutex); -		BTRFS_I(inode)->delalloc_bytes = 0; -		BTRFS_I(inode)->disk_i_size = 0; -		inode->i_mapping->writeback_index = 0;  		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);  	}  	dir->i_sb->s_dirt = 1;  	btrfs_update_inode_block_group(trans, inode); @@ -2556,11 +2567,7 @@ static int 
btrfs_link(struct dentry *old_dentry, struct inode *dir,  	if (inode->i_nlink == 0)  		return -ENOENT; -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -	inode->i_nlink++; -#else -	inc_nlink(inode); -#endif +	btrfs_inc_nlink(inode);  	err = btrfs_check_free_space(root, 1, 0);  	if (err)  		goto fail; @@ -2650,7 +2657,9 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)  	if (err)  		goto out_fail; -	err = btrfs_add_link(trans, dentry, inode, 0, index); +	err = btrfs_add_link(trans, dentry->d_parent->d_inode, +				 inode, dentry->d_name.name, +				 dentry->d_name.len, 0, index);  	if (err)  		goto out_fail; @@ -3221,7 +3230,7 @@ static void btrfs_truncate(struct inode *inode)  	if (ret)  		goto out;  	/* FIXME, add redo link to tree so we don't leak on crash */ -	ret = btrfs_truncate_in_trans(trans, root, inode, +	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,  				      BTRFS_EXTENT_DATA_KEY);  	btrfs_update_inode(trans, root, inode); @@ -3304,6 +3313,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	if (!ei)  		return NULL;  	ei->last_trans = 0; +	ei->logged_trans = 0;  	btrfs_ordered_inode_tree_init(&ei->ordered_tree);  	ei->i_acl = BTRFS_ACL_NOT_CACHED;  	ei->i_default_acl = BTRFS_ACL_NOT_CACHED; @@ -3463,31 +3473,39 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,  	btrfs_set_trans_block_group(trans, new_dir); -	old_dentry->d_inode->i_nlink++; +	btrfs_inc_nlink(old_dentry->d_inode);  	old_dir->i_ctime = old_dir->i_mtime = ctime;  	new_dir->i_ctime = new_dir->i_mtime = ctime;  	old_inode->i_ctime = ctime; -	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry); +	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, +				 old_dentry->d_name.name, +				 old_dentry->d_name.len);  	if (ret)  		goto out_fail;  	if (new_inode) {  		new_inode->i_ctime = CURRENT_TIME; -		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry); +		ret = 
btrfs_unlink_inode(trans, root, new_dir, +					 new_dentry->d_inode, +					 new_dentry->d_name.name, +					 new_dentry->d_name.len);  		if (ret)  			goto out_fail;  		if (new_inode->i_nlink == 0) { -			ret = btrfs_orphan_add(trans, new_inode); +			ret = btrfs_orphan_add(trans, new_dentry->d_inode);  			if (ret)  				goto out_fail;  		} +  	}  	ret = btrfs_set_inode_index(new_dir, old_inode, &index);  	if (ret)  		goto out_fail; -	ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index); +	ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, +			     old_inode, new_dentry->d_name.name, +			     new_dentry->d_name.len, 1, index);  	if (ret)  		goto out_fail; @@ -3577,19 +3595,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;  		inode->i_fop = &btrfs_file_operations;  		inode->i_op = &btrfs_file_inode_operations; -		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); -		extent_io_tree_init(&BTRFS_I(inode)->io_tree, -				     inode->i_mapping, GFP_NOFS); -		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, -				     inode->i_mapping, GFP_NOFS); -		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); -		mutex_init(&BTRFS_I(inode)->csum_mutex); -		mutex_init(&BTRFS_I(inode)->extent_mutex); -		BTRFS_I(inode)->delalloc_bytes = 0; -		BTRFS_I(inode)->disk_i_size = 0; -		inode->i_mapping->writeback_index = 0;  		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);  	}  	dir->i_sb->s_dirt = 1;  	btrfs_update_inode_block_group(trans, inode); @@ -3691,6 +3697,7 @@ static struct file_operations btrfs_dir_file_operations = {  	.compat_ioctl	= btrfs_ioctl,  #endif  	.release        = btrfs_release_file, +	.fsync		= btrfs_sync_file,  };  static struct extent_io_ops btrfs_extent_io_ops = { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 36726696e58..e3984f902e7 100644 --- a/fs/btrfs/root-tree.c +++ 
b/fs/btrfs/root-tree.c @@ -202,8 +202,9 @@ again:  		memcpy(&found_key, &key, sizeof(key));  		key.offset++;  		btrfs_release_path(root, path); -		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, -							&found_key); +		dead_root = +			btrfs_read_fs_root_no_radix(root->fs_info->tree_root, +						    &found_key);  		if (IS_ERR(dead_root)) {  			ret = PTR_ERR(dead_root);  			goto err; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index eff3ad72991..49c4f5b40ed 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -25,6 +25,7 @@  #include "transaction.h"  #include "locking.h"  #include "ref-cache.h" +#include "tree-log.h"  static int total_trans = 0;  extern struct kmem_cache *btrfs_trans_handle_cachep; @@ -57,6 +58,7 @@ static noinline int join_transaction(struct btrfs_root *root)  		root->fs_info->generation++;  		root->fs_info->last_alloc = 0;  		root->fs_info->last_data_alloc = 0; +		root->fs_info->last_log_alloc = 0;  		cur_trans->num_writers = 1;  		cur_trans->num_joined = 0;  		cur_trans->transid = root->fs_info->generation; @@ -83,7 +85,7 @@ static noinline int join_transaction(struct btrfs_root *root)  	return 0;  } -static noinline int record_root_in_trans(struct btrfs_root *root) +noinline int btrfs_record_root_in_trans(struct btrfs_root *root)  {  	struct btrfs_dirty_root *dirty;  	u64 running_trans_id = root->fs_info->running_transaction->transid; @@ -151,7 +153,7 @@ static void wait_current_trans(struct btrfs_root *root)  	}  } -struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  					     int num_blocks, int wait)  {  	struct btrfs_trans_handle *h = @@ -164,7 +166,7 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  	ret = join_transaction(root);  	BUG_ON(ret); -	record_root_in_trans(root); +	btrfs_record_root_in_trans(root);  	h->transid = root->fs_info->running_transaction->transid;  	h->transaction 
= root->fs_info->running_transaction;  	h->blocks_reserved = num_blocks; @@ -456,6 +458,8 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,  			BUG_ON(!root->ref_tree);  			dirty = root->dirty_root; +			btrfs_free_log(trans, root); +  			if (root->commit_root == root->node) {  				WARN_ON(root->node->start !=  					btrfs_root_bytenr(&root->root_item)); @@ -600,7 +604,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,  		num_bytes -= btrfs_root_used(&dirty->root->root_item);  		bytes_used = btrfs_root_used(&root->root_item);  		if (num_bytes) { -			record_root_in_trans(root); +			btrfs_record_root_in_trans(root);  			btrfs_set_root_used(&root->root_item,  					    bytes_used - num_bytes);  		} @@ -745,7 +749,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	int ret;  	INIT_LIST_HEAD(&dirty_fs_roots); -  	mutex_lock(&root->fs_info->trans_mutex);  	if (trans->transaction->in_commit) {  		cur_trans = trans->transaction; @@ -821,10 +824,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	WARN_ON(cur_trans != trans->transaction); +	/* btrfs_commit_tree_roots is responsible for getting the +	 * various roots consistent with each other.  Every pointer +	 * in the tree of tree roots has to point to the most up to date +	 * root for every subvolume and other tree.  So, we have to keep +	 * the tree logging code from jumping in and changing any +	 * of the trees. +	 * +	 * At this point in the commit, there can't be any tree-log +	 * writers, but a little lower down we drop the trans mutex +	 * and let new people in.  By holding the tree_log_mutex +	 * from now until after the super is written, we avoid races +	 * with the tree-log code. 
+	 */ +	mutex_lock(&root->fs_info->tree_log_mutex); +  	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,  			      &dirty_fs_roots);  	BUG_ON(ret); +	/* add_dirty_roots gets rid of all the tree log roots, it is now +	 * safe to free the root of tree log roots +	 */ +	btrfs_free_log_root_tree(trans, root->fs_info); +  	ret = btrfs_commit_tree_roots(trans, root);  	BUG_ON(ret); @@ -843,6 +866,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  				   chunk_root->node->start);  	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,  					 btrfs_header_level(chunk_root->node)); + +	if (!root->fs_info->log_root_recovering) { +		btrfs_set_super_log_root(&root->fs_info->super_copy, 0); +		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); +	} +  	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,  	       sizeof(root->fs_info->super_copy)); @@ -857,6 +886,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	BUG_ON(ret);  	write_ctree_super(trans, root); +	/* +	 * the super is written, we can safely allow the tree-loggers +	 * to go about their business +	 */ +	mutex_unlock(&root->fs_info->tree_log_mutex); +  	btrfs_finish_extent_commit(trans, root, pinned_copy);  	mutex_lock(&root->fs_info->trans_mutex); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 598baa31241..cc63650d60d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -98,4 +98,5 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root);  void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root);  #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index cc2650b0695..b3bb5bbad76 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -81,12 +81,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  		
memcpy(&key, &root->defrag_progress, sizeof(key));  	} -	path->lowest_level = 1;  	path->keep_locks = 1;  	if (cache_only)  		min_trans = root->defrag_trans_start; -	ret = btrfs_search_forward(root, &key, path, cache_only, min_trans); +	ret = btrfs_search_forward(root, &key, NULL, path, +				   cache_only, min_trans);  	if (ret < 0)  		goto out;  	if (ret > 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000000..d1ce8314b94 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2804 @@ +/* + * Copyright (C) 2008 Oracle.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking.  The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. 
+ * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +			     struct btrfs_root *root, struct inode *inode, +			     int inode_only); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree and a 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction.  Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree.  Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times: once to pin down all the extents it is + * using in ram, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots.  This must be called with a tree log transaction + * running (see start_log_trans). 
+ */ +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, +		      struct btrfs_root *root) +{ +	struct btrfs_key key; +	struct btrfs_root_item root_item; +	struct btrfs_inode_item *inode_item; +	struct extent_buffer *leaf; +	struct btrfs_root *new_root = root; +	int ret; +	u64 objectid = root->root_key.objectid; + +	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, +				      BTRFS_TREE_LOG_OBJECTID, +				      0, 0, 0, 0, 0); +	if (IS_ERR(leaf)) { +		ret = PTR_ERR(leaf); +		return ret; +	} + +	btrfs_set_header_nritems(leaf, 0); +	btrfs_set_header_level(leaf, 0); +	btrfs_set_header_bytenr(leaf, leaf->start); +	btrfs_set_header_generation(leaf, trans->transid); +	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + +	write_extent_buffer(leaf, root->fs_info->fsid, +			    (unsigned long)btrfs_header_fsid(leaf), +			    BTRFS_FSID_SIZE); +	btrfs_mark_buffer_dirty(leaf); + +	inode_item = &root_item.inode; +	memset(inode_item, 0, sizeof(*inode_item)); +	inode_item->generation = cpu_to_le64(1); +	inode_item->size = cpu_to_le64(3); +	inode_item->nlink = cpu_to_le32(1); +	inode_item->nblocks = cpu_to_le64(1); +	inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + +	btrfs_set_root_bytenr(&root_item, leaf->start); +	btrfs_set_root_level(&root_item, 0); +	btrfs_set_root_refs(&root_item, 0); +	btrfs_set_root_used(&root_item, 0); + +	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); +	root_item.drop_level = 0; + +	btrfs_tree_unlock(leaf); +	free_extent_buffer(leaf); +	leaf = NULL; + +	btrfs_set_root_dirid(&root_item, 0); + +	key.objectid = BTRFS_TREE_LOG_OBJECTID; +	key.offset = objectid; +	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); +	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, +				&root_item); +	if (ret) +		goto fail; + +	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, +					       &key); +	BUG_ON(!new_root); + +	WARN_ON(root->log_root); +	root->log_root = new_root; + +	/* +	 * log trees do 
not get reference counted because they go away +	 * before a real commit is actually done.  They do store pointers +	 * to file data extents, and those reference counts still get +	 * updated (along with back refs to the log tree). +	 */ +	new_root->ref_cows = 0; +	new_root->last_trans = trans->transid; +fail: +	return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, +			   struct btrfs_root *root) +{ +	int ret; +	mutex_lock(&root->fs_info->tree_log_mutex); +	if (!root->fs_info->log_root_tree) { +		ret = btrfs_init_log_root_tree(trans, root->fs_info); +		BUG_ON(ret); +	} +	if (!root->log_root) { +		ret = btrfs_add_log_tree(trans, root); +		BUG_ON(ret); +	} +	atomic_inc(&root->fs_info->tree_log_writers); +	root->fs_info->tree_log_batch++; +	mutex_unlock(&root->fs_info->tree_log_mutex); +	return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ +	int ret = -ENOENT; + +	smp_mb(); +	if (!root->log_root) +		return -ENOENT; + +	mutex_lock(&root->fs_info->tree_log_mutex); +	if (root->log_root) { +		ret = 0; +		atomic_inc(&root->fs_info->tree_log_writers); +		root->fs_info->tree_log_batch++; +	} +	mutex_unlock(&root->fs_info->tree_log_mutex); +	return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int end_log_trans(struct btrfs_root *root) +{ +	atomic_dec(&root->fs_info->tree_log_writers); +	smp_mb(); +	if (waitqueue_active(&root->fs_info->tree_log_wait)) +		wake_up(&root->fs_info->tree_log_wait); +	return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree.  
The stage field tells us which part + * of the log tree processing we are currently doing.  The others + * are state fields used for that specific part + */ +struct walk_control { +	/* should we free the extent on disk when done?  This is used +	 * at transaction commit time while freeing a log tree +	 */ +	int free; + +	/* should we write out the extent buffer?  This is used +	 * while flushing the log tree to disk during a sync +	 */ +	int write; + +	/* should we wait for the extent buffer io to finish?  Also used +	 * while flushing the log tree to disk for a sync +	 */ +	int wait; + +	/* pin only walk, we record which extents on disk belong to the +	 * log trees +	 */ +	int pin; + +	/* what stage of the replay code we're currently in */ +	int stage; + +	/* the root we are currently replaying */ +	struct btrfs_root *replay_dest; + +	/* the trans handle for the current replay */ +	struct btrfs_trans_handle *trans; + +	/* the function that gets used to process blocks we find in the +	 * tree.  Note the extent_buffer might not be up to date when it is +	 * passed in, and it must be checked or read if you need the data +	 * inside it +	 */ +	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, +			    struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, +			      struct extent_buffer *eb, +			      struct walk_control *wc, u64 gen) +{ +	if (wc->pin) { +		mutex_lock(&log->fs_info->alloc_mutex); +		btrfs_update_pinned_extents(log->fs_info->extent_root, +					    eb->start, eb->len, 1); +		mutex_unlock(&log->fs_info->alloc_mutex); +	} + +	if (btrfs_buffer_uptodate(eb, gen)) { +		if (wc->write) +			btrfs_write_tree_block(eb); +		if (wc->wait) +			btrfs_wait_tree_block_writeback(eb); +	} +	return 0; +} + +/* + * Item overwrite used by replay and tree logging.  eb, slot and key all refer + * to the src data we are copying out. 
+ * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten.  If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, +				   struct btrfs_root *root, +				   struct btrfs_path *path, +				   struct extent_buffer *eb, int slot, +				   struct btrfs_key *key) +{ +	int ret; +	u32 item_size; +	u64 saved_i_size = 0; +	int save_old_i_size = 0; +	unsigned long src_ptr; +	unsigned long dst_ptr; +	int overwrite_root = 0; + +	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) +		overwrite_root = 1; + +	item_size = btrfs_item_size_nr(eb, slot); +	src_ptr = btrfs_item_ptr_offset(eb, slot); + +	/* look for the key in the destination tree */ +	ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +	if (ret == 0) { +		char *src_copy; +		char *dst_copy; +		u32 dst_size = btrfs_item_size_nr(path->nodes[0], +						  path->slots[0]); +		if (dst_size != item_size) +			goto insert; + +		if (item_size == 0) { +			btrfs_release_path(root, path); +			return 0; +		} +		dst_copy = kmalloc(item_size, GFP_NOFS); +		src_copy = kmalloc(item_size, GFP_NOFS); + +		read_extent_buffer(eb, src_copy, src_ptr, item_size); + +		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); +		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, +				   item_size); +		ret = memcmp(dst_copy, src_copy, item_size); + +		kfree(dst_copy); +		kfree(src_copy); +		/* +		 * they have the same contents, just return, this saves +		 * us from cowing blocks in the destination tree and doing +		 * extra writes that may not have been done by a previous +		 * sync +		 */ +		if (ret == 0) { +			btrfs_release_path(root, path); +			
return 0; +		} + +	} +insert: +	btrfs_release_path(root, path); +	/* try to insert the key into the destination tree */ +	ret = btrfs_insert_empty_item(trans, root, path, +				      key, item_size); + +	/* make sure any existing item is the correct size */ +	if (ret == -EEXIST) { +		u32 found_size; +		found_size = btrfs_item_size_nr(path->nodes[0], +						path->slots[0]); +		if (found_size > item_size) { +			btrfs_truncate_item(trans, root, path, item_size, 1); +		} else if (found_size < item_size) { +			ret = btrfs_del_item(trans, root, +					     path); +			BUG_ON(ret); + +			btrfs_release_path(root, path); +			ret = btrfs_insert_empty_item(trans, +				  root, path, key, item_size); +			BUG_ON(ret); +		} +	} else if (ret) { +		BUG(); +	} +	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], +					path->slots[0]); + +	/* don't overwrite an existing inode if the generation number +	 * was logged as zero.  This is done when the tree logging code +	 * is just logging an inode to make sure it exists after recovery. +	 * +	 * Also, don't overwrite i_size on directories during replay. 
+	 * log replay inserts and removes directory items based on the +	 * state of the tree found in the subvolume, and i_size is modified +	 * as it goes +	 */ +	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { +		struct btrfs_inode_item *src_item; +		struct btrfs_inode_item *dst_item; + +		src_item = (struct btrfs_inode_item *)src_ptr; +		dst_item = (struct btrfs_inode_item *)dst_ptr; + +		if (btrfs_inode_generation(eb, src_item) == 0) +			goto no_copy; + +		if (overwrite_root && +		    S_ISDIR(btrfs_inode_mode(eb, src_item)) && +		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { +			save_old_i_size = 1; +			saved_i_size = btrfs_inode_size(path->nodes[0], +							dst_item); +		} +	} + +	copy_extent_buffer(path->nodes[0], eb, dst_ptr, +			   src_ptr, item_size); + +	if (save_old_i_size) { +		struct btrfs_inode_item *dst_item; +		dst_item = (struct btrfs_inode_item *)dst_ptr; +		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); +	} + +	/* make sure the generation is filled in */ +	if (key->type == BTRFS_INODE_ITEM_KEY) { +		struct btrfs_inode_item *dst_item; +		dst_item = (struct btrfs_inode_item *)dst_ptr; +		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { +			btrfs_set_inode_generation(path->nodes[0], dst_item, +						   trans->transid); +		} +	} +no_copy: +	btrfs_mark_buffer_dirty(path->nodes[0]); +	btrfs_release_path(root, path); +	return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, +					     u64 objectid) +{ +	struct inode *inode; +	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); +	if (inode->i_state & I_NEW) { +		BTRFS_I(inode)->root = root; +		BTRFS_I(inode)->location.objectid = objectid; +		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; +		BTRFS_I(inode)->location.offset = 0; +		btrfs_read_locked_inode(inode); +		
unlock_new_inode(inode); + +	} +	if (is_bad_inode(inode)) { +		iput(inode); +		inode = NULL; +	} +	return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'.  path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet.  So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. + */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, +				      struct btrfs_root *root, +				      struct btrfs_path *path, +				      struct extent_buffer *eb, int slot, +				      struct btrfs_key *key) +{ +	int found_type; +	u64 mask = root->sectorsize - 1; +	u64 extent_end; +	u64 alloc_hint; +	u64 start = key->offset; +	struct btrfs_file_extent_item *item; +	struct inode *inode = NULL; +	unsigned long size; +	int ret = 0; + +	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); +	found_type = btrfs_file_extent_type(eb, item); + +	if (found_type == BTRFS_FILE_EXTENT_REG) +		extent_end = start + btrfs_file_extent_num_bytes(eb, item); +	else if (found_type == BTRFS_FILE_EXTENT_INLINE) { +		size = btrfs_file_extent_inline_len(eb, +						    btrfs_item_nr(eb, slot)); +		extent_end = (start + size + mask) & ~mask; +	} else { +		ret = 0; +		goto out; +	} + +	inode = read_one_inode(root, key->objectid); +	if (!inode) { +		ret = -EIO; +		goto out; +	} + +	/* +	 * first check to see if we already have this extent in the +	 * file.  This must be done before the btrfs_drop_extents run +	 * so we don't try to drop this extent. 
+	 */ +	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, +				       start, 0); + +	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) { +		struct btrfs_file_extent_item cmp1; +		struct btrfs_file_extent_item cmp2; +		struct btrfs_file_extent_item *existing; +		struct extent_buffer *leaf; + +		leaf = path->nodes[0]; +		existing = btrfs_item_ptr(leaf, path->slots[0], +					  struct btrfs_file_extent_item); + +		read_extent_buffer(eb, &cmp1, (unsigned long)item, +				   sizeof(cmp1)); +		read_extent_buffer(leaf, &cmp2, (unsigned long)existing, +				   sizeof(cmp2)); + +		/* +		 * we already have a pointer to this exact extent, +		 * we don't have to do anything +		 */ +		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { +			btrfs_release_path(root, path); +			goto out; +		} +	} +	btrfs_release_path(root, path); + +	/* drop any overlapping extents */ +	ret = btrfs_drop_extents(trans, root, inode, +			 start, extent_end, start, &alloc_hint); +	BUG_ON(ret); + +	BUG_ON(ret); +	if (found_type == BTRFS_FILE_EXTENT_REG) { +		struct btrfs_key ins; + +		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); +		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); +		ins.type = BTRFS_EXTENT_ITEM_KEY; + +		/* insert the extent pointer in the file */ +		ret = overwrite_item(trans, root, path, eb, slot, key); +		BUG_ON(ret); + +		/* +		 * is this extent already allocated in the extent +		 * allocation tree?  
If so, just add a reference +		 */ +		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset); +		btrfs_release_path(root, path); +		if (ret == 0) { +			ret = btrfs_inc_extent_ref(trans, root, +				   ins.objectid, ins.offset, +				   root->root_key.objectid, +				   trans->transid, key->objectid, start); +		} else { +			/* +			 * insert the extent pointer in the extent +			 * allocation tree +			 */ +			ret = btrfs_alloc_logged_extent(trans, root, +						root->root_key.objectid, +						trans->transid, key->objectid, +						start, &ins); +			BUG_ON(ret); +		} +	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) { +		/* inline extents are easy, we just overwrite them */ +		ret = overwrite_item(trans, root, path, eb, slot, key); +		BUG_ON(ret); +	} +	/* btrfs_drop_extents changes i_blocks, update it here */ +	inode->i_blocks += (extent_end - start) >> 9; +	btrfs_update_inode(trans, root, inode); +out: +	if (inode) +		iput(inode); +	return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. 
+ * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, +				      struct btrfs_root *root, +				      struct btrfs_path *path, +				      struct inode *dir, +				      struct btrfs_dir_item *di) +{ +	struct inode *inode; +	char *name; +	int name_len; +	struct extent_buffer *leaf; +	struct btrfs_key location; +	int ret; + +	leaf = path->nodes[0]; + +	btrfs_dir_item_key_to_cpu(leaf, di, &location); +	name_len = btrfs_dir_name_len(leaf, di); +	name = kmalloc(name_len, GFP_NOFS); +	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); +	btrfs_release_path(root, path); + +	inode = read_one_inode(root, location.objectid); +	BUG_ON(!inode); + +	btrfs_inc_nlink(inode); +	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); +	kfree(name); + +	iput(inode); +	return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, +				 struct btrfs_path *path, +				 u64 dirid, u64 objectid, u64 index, +				 const char *name, int name_len) +{ +	struct btrfs_dir_item *di; +	struct btrfs_key location; +	int match = 0; + +	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, +					 index, name, name_len, 0); +	if (di && !IS_ERR(di)) { +		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); +		if (location.objectid != objectid) +			goto out; +	} else +		goto out; +	btrfs_release_path(root, path); + +	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); +	if (di && !IS_ERR(di)) { +		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); +		if (location.objectid != objectid) +			goto out; +	} else +		goto out; +	match = 1; +out: +	btrfs_release_path(root, path); +	return match; +} + +/* + * helper function to check a log tree for a named back 
reference in + * an inode.  This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, +				   struct btrfs_key *key, +				   char *name, int namelen) +{ +	struct btrfs_path *path; +	struct btrfs_inode_ref *ref; +	unsigned long ptr; +	unsigned long ptr_end; +	unsigned long name_ptr; +	int found_name_len; +	int item_size; +	int ret; +	int match = 0; + +	path = btrfs_alloc_path(); +	ret = btrfs_search_slot(NULL, log, key, path, 0, 0); +	if (ret != 0) +		goto out; + +	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); +	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); +	ptr_end = ptr + item_size; +	while (ptr < ptr_end) { +		ref = (struct btrfs_inode_ref *)ptr; +		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); +		if (found_name_len == namelen) { +			name_ptr = (unsigned long)(ref + 1); +			ret = memcmp_extent_buffer(path->nodes[0], name, +						   name_ptr, namelen); +			if (ret == 0) { +				match = 1; +				goto out; +			} +		} +		ptr = (unsigned long)(ref + 1) + found_name_len; +	} +out: +	btrfs_free_path(path); +	return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function.  (it should be released on return). 
+ */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +				  struct btrfs_root *root, +				  struct btrfs_root *log, +				  struct btrfs_path *path, +				  struct extent_buffer *eb, int slot, +				  struct btrfs_key *key) +{ +	struct inode *dir; +	int ret; +	struct btrfs_key location; +	struct btrfs_inode_ref *ref; +	struct btrfs_dir_item *di; +	struct inode *inode; +	char *name; +	int namelen; +	unsigned long ref_ptr; +	unsigned long ref_end; + +	location.objectid = key->objectid; +	location.type = BTRFS_INODE_ITEM_KEY; +	location.offset = 0; + +	/* +	 * it is possible that we didn't log all the parent directories +	 * for a given inode.  If we don't find the dir, just don't +	 * copy the back ref in.  The link count fixup code will take +	 * care of the rest +	 */ +	dir = read_one_inode(root, key->offset); +	if (!dir) +		return -ENOENT; + +	inode = read_one_inode(root, key->objectid); +	BUG_ON(!dir); + +	ref_ptr = btrfs_item_ptr_offset(eb, slot); +	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: +	ref = (struct btrfs_inode_ref *)ref_ptr; + +	namelen = btrfs_inode_ref_name_len(eb, ref); +	name = kmalloc(namelen, GFP_NOFS); +	BUG_ON(!name); + +	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + +	/* if we already have a perfect match, we're done */ +	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, +			 btrfs_inode_ref_index(eb, ref), +			 name, namelen)) { +		goto out; +	} + +	/* +	 * look for a conflicting back reference in the metadata. +	 * if we find one we have to unlink that name of the file +	 * before we add our new link.  Later on, we overwrite any +	 * existing back reference, and we don't want to create +	 * dangling pointers in the directory. 
+	 */ +conflict_again: +	ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +	if (ret == 0) { +		char *victim_name; +		int victim_name_len; +		struct btrfs_inode_ref *victim_ref; +		unsigned long ptr; +		unsigned long ptr_end; +		struct extent_buffer *leaf = path->nodes[0]; + +		/* are we trying to overwrite a back ref for the root directory +		 * if so, just jump out, we're done +		 */ +		if (key->objectid == key->offset) +			goto out_nowrite; + +		/* check all the names in this back reference to see +		 * if they are in the log.  if so, we allow them to stay +		 * otherwise they must be unlinked as a conflict +		 */ +		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); +		while(ptr < ptr_end) { +			victim_ref = (struct btrfs_inode_ref *)ptr; +			victim_name_len = btrfs_inode_ref_name_len(leaf, +								   victim_ref); +			victim_name = kmalloc(victim_name_len, GFP_NOFS); +			BUG_ON(!victim_name); + +			read_extent_buffer(leaf, victim_name, +					   (unsigned long)(victim_ref + 1), +					   victim_name_len); + +			if (!backref_in_log(log, key, victim_name, +					    victim_name_len)) { +				btrfs_inc_nlink(inode); +				btrfs_release_path(root, path); +				ret = btrfs_unlink_inode(trans, root, dir, +							 inode, victim_name, +							 victim_name_len); +				kfree(victim_name); +				btrfs_release_path(root, path); +				goto conflict_again; +			} +			kfree(victim_name); +			ptr = (unsigned long)(victim_ref + 1) + victim_name_len; +		} +		BUG_ON(ret); +	} +	btrfs_release_path(root, path); + +	/* look for a conflicting sequence number */ +	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, +					 btrfs_inode_ref_index(eb, ref), +					 name, namelen, 0); +	if (di && !IS_ERR(di)) { +		ret = drop_one_dir_item(trans, root, path, dir, di); +		BUG_ON(ret); +	} +	btrfs_release_path(root, path); + + +	/* look for a conflicting name */ +	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, +				  
 name, namelen, 0); +	if (di && !IS_ERR(di)) { +		ret = drop_one_dir_item(trans, root, path, dir, di); +		BUG_ON(ret); +	} +	btrfs_release_path(root, path); + +	/* insert our name */ +	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, +			     btrfs_inode_ref_index(eb, ref)); +	BUG_ON(ret); + +	btrfs_update_inode(trans, root, inode); + +out: +	ref_ptr = (unsigned long)(ref + 1) + namelen; +	kfree(name); +	if (ref_ptr < ref_end) +		goto again; + +	/* finally write the back reference in the inode */ +	ret = overwrite_item(trans, root, path, eb, slot, key); +	BUG_ON(ret); + +out_nowrite: +	btrfs_release_path(root, path); +	iput(dir); +	iput(inode); +	return 0; +} + +/* + * replay one csum item from the log tree into the subvolume 'root' + * eb, slot and key all refer to the log tree + * path is for temp use by this function and should be released on return + * + * This copies the checksums out of the log tree and inserts them into + * the subvolume.  Any existing checksums for this range in the file + * are overwritten, and new items are added where required. + * + * We keep this simple by reusing the btrfs_ordered_sum code from + * the data=ordered mode.  This basically means making a copy + * of all the checksums in ram, which we have to do anyway for kmap + * rules. + * + * The copy is then sent down to btrfs_csum_file_blocks, which + * does all the hard work of finding existing items in the file + * or adding new ones. 
+ */ +static noinline int replay_one_csum(struct btrfs_trans_handle *trans, +				      struct btrfs_root *root, +				      struct btrfs_path *path, +				      struct extent_buffer *eb, int slot, +				      struct btrfs_key *key) +{ +	int ret; +	u32 item_size = btrfs_item_size_nr(eb, slot); +	u64 cur_offset; +	unsigned long file_bytes; +	struct btrfs_ordered_sum *sums; +	struct btrfs_sector_sum *sector_sum; +	struct inode *inode; +	unsigned long ptr; + +	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize; +	inode = read_one_inode(root, key->objectid); +	if (!inode) { +		return -EIO; +	} + +	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS); +	if (!sums) { +		iput(inode); +		return -ENOMEM; +	} + +	INIT_LIST_HEAD(&sums->list); +	sums->len = file_bytes; +	sums->file_offset = key->offset; + +	/* +	 * copy all the sums into the ordered sum struct +	 */ +	sector_sum = sums->sums; +	cur_offset = key->offset; +	ptr = btrfs_item_ptr_offset(eb, slot); +	while(item_size > 0) { +		sector_sum->offset = cur_offset; +		read_extent_buffer(eb, §or_sum->sum, ptr, BTRFS_CRC32_SIZE); +		sector_sum++; +		item_size -= BTRFS_CRC32_SIZE; +		ptr += BTRFS_CRC32_SIZE; +		cur_offset += root->sectorsize; +	} + +	/* let btrfs_csum_file_blocks add them into the file */ +	ret = btrfs_csum_file_blocks(trans, root, inode, sums); +	BUG_ON(ret); +	kfree(sums); +	iput(inode); + +	return 0; +} +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay.  So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found.  If it goes down to zero, the iput + * will free the inode. 
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +					   struct btrfs_root *root, +					   struct inode *inode) +{ +	struct btrfs_path *path; +	int ret; +	struct btrfs_key key; +	u64 nlink = 0; +	unsigned long ptr; +	unsigned long ptr_end; +	int name_len; + +	key.objectid = inode->i_ino; +	key.type = BTRFS_INODE_REF_KEY; +	key.offset = (u64)-1; + +	path = btrfs_alloc_path(); + +	while(1) { +		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +		if (ret < 0) +			break; +		if (ret > 0) { +			if (path->slots[0] == 0) +				break; +			path->slots[0]--; +		} +		btrfs_item_key_to_cpu(path->nodes[0], &key, +				      path->slots[0]); +		if (key.objectid != inode->i_ino || +		    key.type != BTRFS_INODE_REF_KEY) +			break; +		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); +		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], +						   path->slots[0]); +		while(ptr < ptr_end) { +			struct btrfs_inode_ref *ref; + +			ref = (struct btrfs_inode_ref *)ptr; +			name_len = btrfs_inode_ref_name_len(path->nodes[0], +							    ref); +			ptr = (unsigned long)(ref + 1) + name_len; +			nlink++; +		} + +		if (key.offset == 0) +			break; +		key.offset--; +		btrfs_release_path(root, path); +	} +	btrfs_free_path(path); +	if (nlink != inode->i_nlink) { +		inode->i_nlink = nlink; +		btrfs_update_inode(trans, root, inode); +	} + +	return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, +					    struct btrfs_root *root, +					    struct btrfs_path *path) +{ +	int ret; +	struct btrfs_key key; +	struct inode *inode; + +	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; +	key.type = BTRFS_ORPHAN_ITEM_KEY; +	key.offset = (u64)-1; +	while(1) { +		ret = btrfs_search_slot(trans, root, &key, path, -1, 1); +		if (ret < 0) +			break; + +		if (ret == 1) { +			if (path->slots[0] == 0) +				break; +			path->slots[0]--; +		} + +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +		if (key.objectid != 
BTRFS_TREE_LOG_FIXUP_OBJECTID || +		    key.type != BTRFS_ORPHAN_ITEM_KEY) +			break; + +		ret = btrfs_del_item(trans, root, path); +		BUG_ON(ret); + +		btrfs_release_path(root, path); +		inode = read_one_inode(root, key.offset); +		BUG_ON(!inode); + +		ret = fixup_inode_link_count(trans, root, inode); +		BUG_ON(ret); + +		iput(inode); + +		if (key.offset == 0) +			break; +		key.offset--; +	} +	btrfs_release_path(root, path); +	return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done.  The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, +				      struct btrfs_root *root, +				      struct btrfs_path *path, +				      u64 objectid) +{ +	struct btrfs_key key; +	int ret = 0; +	struct inode *inode; + +	inode = read_one_inode(root, objectid); +	BUG_ON(!inode); + +	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; +	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); +	key.offset = objectid; + +	ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + +	btrfs_release_path(root, path); +	if (ret == 0) { +		btrfs_inc_nlink(inode); +		btrfs_update_inode(trans, root, inode); +	} else if (ret == -EEXIST) { +		ret = 0; +	} else { +		BUG(); +	} +	iput(inode); + +	return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist.  
This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
/*
 * Returns -ENOENT when the inode named by 'location' cannot be read,
 * -EIO when the directory cannot be read, and otherwise whatever
 * btrfs_add_link returns.  'type' and 'path' are not used here.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *target;
	struct inode *parent;
	int err = -EIO;

	target = read_one_inode(root, location->objectid);
	if (!target)
		return -ENOENT;

	parent = read_one_inode(root, dirid);
	if (parent)
		err = btrfs_add_link(trans, parent, target, name, name_len,
				     1, index);

	/* FIXME, put inode into FIXUP list */

	iput(target);
	if (parent)
		iput(parent);
	return err;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
+ */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, +				    struct btrfs_root *root, +				    struct btrfs_path *path, +				    struct extent_buffer *eb, +				    struct btrfs_dir_item *di, +				    struct btrfs_key *key) +{ +	char *name; +	int name_len; +	struct btrfs_dir_item *dst_di; +	struct btrfs_key found_key; +	struct btrfs_key log_key; +	struct inode *dir; +	struct inode *inode; +	u8 log_type; +	int ret; + +	dir = read_one_inode(root, key->objectid); +	BUG_ON(!dir); + +	name_len = btrfs_dir_name_len(eb, di); +	name = kmalloc(name_len, GFP_NOFS); +	log_type = btrfs_dir_type(eb, di); +	read_extent_buffer(eb, name, (unsigned long)(di + 1), +		   name_len); + +	btrfs_dir_item_key_to_cpu(eb, di, &log_key); +	if (key->type == BTRFS_DIR_ITEM_KEY) { +		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, +				       name, name_len, 1); +	} +	else if (key->type == BTRFS_DIR_INDEX_KEY) { +		dst_di = btrfs_lookup_dir_index_item(trans, root, path, +						     key->objectid, +						     key->offset, name, +						     name_len, 1); +	} else { +		BUG(); +	} +	if (!dst_di || IS_ERR(dst_di)) { +		/* we need a sequence number to insert, so we only +		 * do inserts for the BTRFS_DIR_INDEX_KEY types +		 */ +		if (key->type != BTRFS_DIR_INDEX_KEY) +			goto out; +		goto insert; +	} + +	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); +	/* the existing item matches the logged item */ +	if (found_key.objectid == log_key.objectid && +	    found_key.type == log_key.type && +	    found_key.offset == log_key.offset && +	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) { +		goto out; +	} + +	/* +	 * don't drop the conflicting directory entry if the inode +	 * for the new entry doesn't exist +	 */ +	inode = read_one_inode(root, log_key.objectid); +	if (!inode) +		goto out; + +	iput(inode); +	ret = drop_one_dir_item(trans, root, path, dir, dst_di); +	BUG_ON(ret); + +	if (key->type == BTRFS_DIR_INDEX_KEY) +		goto insert; +out: +	
btrfs_release_path(root, path); +	kfree(name); +	iput(dir); +	return 0; + +insert: +	btrfs_release_path(root, path); +	ret = insert_one_name(trans, root, path, key->objectid, key->offset, +			      name, name_len, log_type, &log_key); + +	if (ret && ret != -ENOENT) +		BUG(); +	goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, +					struct btrfs_root *root, +					struct btrfs_path *path, +					struct extent_buffer *eb, int slot, +					struct btrfs_key *key) +{ +	int ret; +	u32 item_size = btrfs_item_size_nr(eb, slot); +	struct btrfs_dir_item *di; +	int name_len; +	unsigned long ptr; +	unsigned long ptr_end; + +	ptr = btrfs_item_ptr_offset(eb, slot); +	ptr_end = ptr + item_size; +	while(ptr < ptr_end) { +		di = (struct btrfs_dir_item *)ptr; +		name_len = btrfs_dir_name_len(eb, di); +		ret = replay_one_name(trans, root, path, eb, di, key); +		BUG_ON(ret); +		ptr = (unsigned long)(di + 1); +		ptr += name_len; +	} +	return 0; +} + +/* + * directory replay has two parts.  There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for.  During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. 
+ */ +static noinline int find_dir_range(struct btrfs_root *root, +				   struct btrfs_path *path, +				   u64 dirid, int key_type, +				   u64 *start_ret, u64 *end_ret) +{ +	struct btrfs_key key; +	u64 found_end; +	struct btrfs_dir_log_item *item; +	int ret; +	int nritems; + +	if (*start_ret == (u64)-1) +		return 1; + +	key.objectid = dirid; +	key.type = key_type; +	key.offset = *start_ret; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out; +	if (ret > 0) { +		if (path->slots[0] == 0) +			goto out; +		path->slots[0]--; +	} +	if (ret != 0) +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +	if (key.type != key_type || key.objectid != dirid) { +		ret = 1; +		goto next; +	} +	item = btrfs_item_ptr(path->nodes[0], path->slots[0], +			      struct btrfs_dir_log_item); +	found_end = btrfs_dir_log_end(path->nodes[0], item); + +	if (*start_ret >= key.offset && *start_ret <= found_end) { +		ret = 0; +		*start_ret = key.offset; +		*end_ret = found_end; +		goto out; +	} +	ret = 1; +next: +	/* check the next slot in the tree to see if it is a valid item */ +	nritems = btrfs_header_nritems(path->nodes[0]); +	if (path->slots[0] >= nritems) { +		ret = btrfs_next_leaf(root, path); +		if (ret) +			goto out; +	} else { +		path->slots[0]++; +	} + +	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +	if (key.type != key_type || key.objectid != dirid) { +		ret = 1; +		goto out; +	} +	item = btrfs_item_ptr(path->nodes[0], path->slots[0], +			      struct btrfs_dir_log_item); +	found_end = btrfs_dir_log_end(path->nodes[0], item); +	*start_ret = key.offset; +	*end_ret = found_end; +	ret = 0; +out: +	btrfs_release_path(root, path); +	return ret; +} + +/* + * this looks for a given directory item in the log.  
If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, +				      struct btrfs_root *root, +				      struct btrfs_root *log, +				      struct btrfs_path *path, +				      struct btrfs_path *log_path, +				      struct inode *dir, +				      struct btrfs_key *dir_key) +{ +	int ret; +	struct extent_buffer *eb; +	int slot; +	u32 item_size; +	struct btrfs_dir_item *di; +	struct btrfs_dir_item *log_di; +	int name_len; +	unsigned long ptr; +	unsigned long ptr_end; +	char *name; +	struct inode *inode; +	struct btrfs_key location; + +again: +	eb = path->nodes[0]; +	slot = path->slots[0]; +	item_size = btrfs_item_size_nr(eb, slot); +	ptr = btrfs_item_ptr_offset(eb, slot); +	ptr_end = ptr + item_size; +	while(ptr < ptr_end) { +		di = (struct btrfs_dir_item *)ptr; +		name_len = btrfs_dir_name_len(eb, di); +		name = kmalloc(name_len, GFP_NOFS); +		if (!name) { +			ret = -ENOMEM; +			goto out; +		} +		read_extent_buffer(eb, name, (unsigned long)(di + 1), +				  name_len); +		log_di = NULL; +		if (dir_key->type == BTRFS_DIR_ITEM_KEY) { +			log_di = btrfs_lookup_dir_item(trans, log, log_path, +						       dir_key->objectid, +						       name, name_len, 0); +		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { +			log_di = btrfs_lookup_dir_index_item(trans, log, +						     log_path, +						     dir_key->objectid, +						     dir_key->offset, +						     name, name_len, 0); +		} +		if (!log_di || IS_ERR(log_di)) { +			btrfs_dir_item_key_to_cpu(eb, di, &location); +			btrfs_release_path(root, path); +			btrfs_release_path(log, log_path); +			inode = read_one_inode(root, location.objectid); +			BUG_ON(!inode); + +			ret = link_to_fixup_dir(trans, root, +						path, location.objectid); +			BUG_ON(ret); +			btrfs_inc_nlink(inode); +			ret = btrfs_unlink_inode(trans, root, dir, inode, +						 name, name_len); +			BUG_ON(ret); +			kfree(name); +			
iput(inode); + +			/* there might still be more names under this key +			 * check and repeat if required +			 */ +			ret = btrfs_search_slot(NULL, root, dir_key, path, +						0, 0); +			if (ret == 0) +				goto again; +			ret = 0; +			goto out; +		} +		btrfs_release_path(log, log_path); +		kfree(name); + +		ptr = (unsigned long)(di + 1); +		ptr += name_len; +	} +	ret = 0; +out: +	btrfs_release_path(root, path); +	btrfs_release_path(log, log_path); +	return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes.  It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. + */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root, +				       struct btrfs_root *log, +				       struct btrfs_path *path, +				       u64 dirid) +{ +	u64 range_start; +	u64 range_end; +	int key_type = BTRFS_DIR_LOG_ITEM_KEY; +	int ret = 0; +	struct btrfs_key dir_key; +	struct btrfs_key found_key; +	struct btrfs_path *log_path; +	struct inode *dir; + +	dir_key.objectid = dirid; +	dir_key.type = BTRFS_DIR_ITEM_KEY; +	log_path = btrfs_alloc_path(); +	if (!log_path) +		return -ENOMEM; + +	dir = read_one_inode(root, dirid); +	/* it isn't an error if the inode isn't there, that can happen +	 * because we replay the deletes before we copy in the inode item +	 * from the log +	 */ +	if (!dir) { +		btrfs_free_path(log_path); +		return 0; +	} +again: +	range_start = 0; +	range_end = 0; +	while(1) { +		ret = find_dir_range(log, path, dirid, key_type, +				     &range_start, &range_end); +		if (ret != 0) +			break; + +		dir_key.offset = range_start; +		while(1) { +			int nritems; +			ret = btrfs_search_slot(NULL, root, &dir_key, path, +						0, 0); +			if (ret < 
0) +				goto out; + +			nritems = btrfs_header_nritems(path->nodes[0]); +			if (path->slots[0] >= nritems) { +				ret = btrfs_next_leaf(root, path); +				if (ret) +					break; +			} +			btrfs_item_key_to_cpu(path->nodes[0], &found_key, +					      path->slots[0]); +			if (found_key.objectid != dirid || +			    found_key.type != dir_key.type) +				goto next_type; + +			if (found_key.offset > range_end) +				break; + +			ret = check_item_in_log(trans, root, log, path, +						log_path, dir, &found_key); +			BUG_ON(ret); +			if (found_key.offset == (u64)-1) +				break; +			dir_key.offset = found_key.offset + 1; +		} +		btrfs_release_path(root, path); +		if (range_end == (u64)-1) +			break; +		range_start = range_end + 1; +	} + +next_type: +	ret = 0; +	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { +		key_type = BTRFS_DIR_LOG_INDEX_KEY; +		dir_key.type = BTRFS_DIR_INDEX_KEY; +		btrfs_release_path(root, path); +		goto again; +	} +out: +	btrfs_release_path(root, path); +	btrfs_free_path(log_path); +	iput(dir); +	return ret; +} + +/* + * the process_func used to replay items from the log tree.  This + * gets called in two different stages.  The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume.  The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). 
+ */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, +			     struct walk_control *wc, u64 gen) +{ +	int nritems; +	struct btrfs_path *path; +	struct btrfs_root *root = wc->replay_dest; +	struct btrfs_key key; +	u32 item_size; +	int level; +	int i; +	int ret; + +	btrfs_read_buffer(eb, gen); + +	level = btrfs_header_level(eb); + +	if (level != 0) +		return 0; + +	path = btrfs_alloc_path(); +	BUG_ON(!path); + +	nritems = btrfs_header_nritems(eb); +	for (i = 0; i < nritems; i++) { +		btrfs_item_key_to_cpu(eb, &key, i); +		item_size = btrfs_item_size_nr(eb, i); + +		/* inode keys are done during the first stage */ +		if (key.type == BTRFS_INODE_ITEM_KEY && +		    wc->stage == LOG_WALK_REPLAY_INODES) { +			struct inode *inode; +			struct btrfs_inode_item *inode_item; +			u32 mode; + +			inode_item = btrfs_item_ptr(eb, i, +					    struct btrfs_inode_item); +			mode = btrfs_inode_mode(eb, inode_item); +			if (S_ISDIR(mode)) { +				ret = replay_dir_deletes(wc->trans, +					 root, log, path, key.objectid); +				BUG_ON(ret); +			} +			ret = overwrite_item(wc->trans, root, path, +					     eb, i, &key); +			BUG_ON(ret); + +			/* for regular files, truncate away +			 * extents past the new EOF +			 */ +			if (S_ISREG(mode)) { +				inode = read_one_inode(root, +						       key.objectid); +				BUG_ON(!inode); + +				ret = btrfs_truncate_inode_items(wc->trans, +					root, inode, inode->i_size, +					BTRFS_EXTENT_DATA_KEY); +				BUG_ON(ret); +				iput(inode); +			} +			ret = link_to_fixup_dir(wc->trans, root, +						path, key.objectid); +			BUG_ON(ret); +		} +		if (wc->stage < LOG_WALK_REPLAY_ALL) +			continue; + +		/* these keys are simply copied */ +		if (key.type == BTRFS_XATTR_ITEM_KEY) { +			ret = overwrite_item(wc->trans, root, path, +					     eb, i, &key); +			BUG_ON(ret); +		} else if (key.type == BTRFS_INODE_REF_KEY) { +			ret = add_inode_ref(wc->trans, root, log, path, +					    eb, i, &key); +			BUG_ON(ret && ret != -ENOENT); +		} else 
if (key.type == BTRFS_EXTENT_DATA_KEY) { +			ret = replay_one_extent(wc->trans, root, path, +						eb, i, &key); +			BUG_ON(ret); +		} else if (key.type == BTRFS_CSUM_ITEM_KEY) { +			ret = replay_one_csum(wc->trans, root, path, +					      eb, i, &key); +			BUG_ON(ret); +		} else if (key.type == BTRFS_DIR_ITEM_KEY || +			   key.type == BTRFS_DIR_INDEX_KEY) { +			ret = replay_one_dir_item(wc->trans, root, path, +						  eb, i, &key); +			BUG_ON(ret); +		} +	} +	btrfs_free_path(path); +	return 0; +} + +static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans, +				   struct btrfs_root *root, +				   struct btrfs_path *path, int *level, +				   struct walk_control *wc) +{ +	u64 root_owner; +	u64 root_gen; +	u64 bytenr; +	u64 ptr_gen; +	struct extent_buffer *next; +	struct extent_buffer *cur; +	struct extent_buffer *parent; +	u32 blocksize; +	int ret = 0; + +	WARN_ON(*level < 0); +	WARN_ON(*level >= BTRFS_MAX_LEVEL); + +	while(*level > 0) { +		WARN_ON(*level < 0); +		WARN_ON(*level >= BTRFS_MAX_LEVEL); +		cur = path->nodes[*level]; + +		if (btrfs_header_level(cur) != *level) +			WARN_ON(1); + +		if (path->slots[*level] >= +		    btrfs_header_nritems(cur)) +			break; + +		bytenr = btrfs_node_blockptr(cur, path->slots[*level]); +		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); +		blocksize = btrfs_level_size(root, *level - 1); + +		parent = path->nodes[*level]; +		root_owner = btrfs_header_owner(parent); +		root_gen = btrfs_header_generation(parent); + +		next = btrfs_find_create_tree_block(root, bytenr, blocksize); + +		wc->process_func(root, next, wc, ptr_gen); + +		if (*level == 1) { +			path->slots[*level]++; +			if (wc->free) { +				btrfs_read_buffer(next, ptr_gen); + +				btrfs_tree_lock(next); +				clean_tree_block(trans, root, next); +				btrfs_wait_tree_block_writeback(next); +				btrfs_tree_unlock(next); + +				ret = btrfs_drop_leaf_ref(trans, root, next); +				BUG_ON(ret); + +				WARN_ON(root_owner != +					
BTRFS_TREE_LOG_OBJECTID); +				ret = btrfs_free_extent(trans, root, bytenr, +							blocksize, root_owner, +							root_gen, 0, 0, 1); +				BUG_ON(ret); +			} +			free_extent_buffer(next); +			continue; +		} +		btrfs_read_buffer(next, ptr_gen); + +		WARN_ON(*level <= 0); +		if (path->nodes[*level-1]) +			free_extent_buffer(path->nodes[*level-1]); +		path->nodes[*level-1] = next; +		*level = btrfs_header_level(next); +		path->slots[*level] = 0; +		cond_resched(); +	} +	WARN_ON(*level < 0); +	WARN_ON(*level >= BTRFS_MAX_LEVEL); + +	if (path->nodes[*level] == root->node) { +		parent = path->nodes[*level]; +	} else { +		parent = path->nodes[*level + 1]; +	} +	bytenr = path->nodes[*level]->start; + +	blocksize = btrfs_level_size(root, *level); +	root_owner = btrfs_header_owner(parent); +	root_gen = btrfs_header_generation(parent); + +	wc->process_func(root, path->nodes[*level], wc, +			 btrfs_header_generation(path->nodes[*level])); + +	if (wc->free) { +		next = path->nodes[*level]; +		btrfs_tree_lock(next); +		clean_tree_block(trans, root, next); +		btrfs_wait_tree_block_writeback(next); +		btrfs_tree_unlock(next); + +		if (*level == 0) { +			ret = btrfs_drop_leaf_ref(trans, root, next); +			BUG_ON(ret); +		} +		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); +		ret = btrfs_free_extent(trans, root, bytenr, blocksize, +					  root_owner, root_gen, 0, 0, 1); +		BUG_ON(ret); +	} +	free_extent_buffer(path->nodes[*level]); +	path->nodes[*level] = NULL; +	*level += 1; + +	cond_resched(); +	return 0; +} + +static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, +				 struct btrfs_path *path, int *level, +				 struct walk_control *wc) +{ +	u64 root_owner; +	u64 root_gen; +	int i; +	int slot; +	int ret; + +	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { +		slot = path->slots[i]; +		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { +			struct extent_buffer *node; +			node = path->nodes[i]; +			
path->slots[i]++; +			*level = i; +			WARN_ON(*level == 0); +			return 0; +		} else { +			if (path->nodes[*level] == root->node) { +				root_owner = root->root_key.objectid; +				root_gen = +				   btrfs_header_generation(path->nodes[*level]); +			} else { +				struct extent_buffer *node; +				node = path->nodes[*level + 1]; +				root_owner = btrfs_header_owner(node); +				root_gen = btrfs_header_generation(node); +			} +			wc->process_func(root, path->nodes[*level], wc, +				 btrfs_header_generation(path->nodes[*level])); +			if (wc->free) { +				struct extent_buffer *next; + +				next = path->nodes[*level]; + +				btrfs_tree_lock(next); +				clean_tree_block(trans, root, next); +				btrfs_wait_tree_block_writeback(next); +				btrfs_tree_unlock(next); + +				if (*level == 0) { +					ret = btrfs_drop_leaf_ref(trans, root, +								  next); +					BUG_ON(ret); +				} + +				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); +				ret = btrfs_free_extent(trans, root, +						path->nodes[*level]->start, +						path->nodes[*level]->len, +						root_owner, root_gen, 0, 0, 1); +				BUG_ON(ret); +			} +			free_extent_buffer(path->nodes[*level]); +			path->nodes[*level] = NULL; +			*level = i + 1; +		} +	} +	return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'.  This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. 
+ */ +static int walk_log_tree(struct btrfs_trans_handle *trans, +			 struct btrfs_root *log, struct walk_control *wc) +{ +	int ret = 0; +	int wret; +	int level; +	struct btrfs_path *path; +	int i; +	int orig_level; + +	path = btrfs_alloc_path(); +	BUG_ON(!path); + +	level = btrfs_header_level(log->node); +	orig_level = level; +	path->nodes[level] = log->node; +	extent_buffer_get(log->node); +	path->slots[level] = 0; + +	while(1) { +		wret = walk_down_log_tree(trans, log, path, &level, wc); +		if (wret > 0) +			break; +		if (wret < 0) +			ret = wret; + +		wret = walk_up_log_tree(trans, log, path, &level, wc); +		if (wret > 0) +			break; +		if (wret < 0) +			ret = wret; +	} + +	/* was the root node processed? if not, catch it here */ +	if (path->nodes[orig_level]) { +		wc->process_func(log, path->nodes[orig_level], wc, +			 btrfs_header_generation(path->nodes[orig_level])); +		if (wc->free) { +			struct extent_buffer *next; + +			next = path->nodes[orig_level]; + +			btrfs_tree_lock(next); +			clean_tree_block(trans, log, next); +			btrfs_wait_tree_block_writeback(next); +			btrfs_tree_unlock(next); + +			if (orig_level == 0) { +				ret = btrfs_drop_leaf_ref(trans, log, +							  next); +				BUG_ON(ret); +			} +			WARN_ON(log->root_key.objectid != +				BTRFS_TREE_LOG_OBJECTID); +			ret = btrfs_free_extent(trans, log, +						next->start, next->len, +						log->root_key.objectid, +						btrfs_header_generation(next), +						0, 0, 1); +			BUG_ON(ret); +		} +	} + +	for (i = 0; i <= orig_level; i++) { +		if (path->nodes[i]) { +			free_extent_buffer(path->nodes[i]); +			path->nodes[i] = NULL; +		} +	} +	btrfs_free_path(path); +	if (wc->free) +		free_extent_buffer(log->node); +	return ret; +} + +int wait_log_commit(struct btrfs_root *log) +{ +	DEFINE_WAIT(wait); +	u64 transid = log->fs_info->tree_log_transid; + +	do { +		prepare_to_wait(&log->fs_info->tree_log_wait, &wait, +				TASK_UNINTERRUPTIBLE); +		mutex_unlock(&log->fs_info->tree_log_mutex); +		if 
(atomic_read(&log->fs_info->tree_log_commit)) +			schedule(); +		finish_wait(&log->fs_info->tree_log_wait, &wait); +		mutex_lock(&log->fs_info->tree_log_mutex); +	} while(transid == log->fs_info->tree_log_transid && +		atomic_read(&log->fs_info->tree_log_commit)); +	return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it.  When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, +		   struct btrfs_root *root) +{ +	int ret; +	unsigned long batch; +	struct btrfs_root *log = root->log_root; +	struct walk_control wc = { +		.write = 1, +		.process_func = process_one_buffer +	}; + +	mutex_lock(&log->fs_info->tree_log_mutex); +	if (atomic_read(&log->fs_info->tree_log_commit)) { +		wait_log_commit(log); +		goto out; +	} +	atomic_set(&log->fs_info->tree_log_commit, 1); + +	while(1) { +		mutex_unlock(&log->fs_info->tree_log_mutex); +		schedule_timeout_uninterruptible(1); +		mutex_lock(&log->fs_info->tree_log_mutex); +		batch = log->fs_info->tree_log_batch; + +		while(atomic_read(&log->fs_info->tree_log_writers)) { +			DEFINE_WAIT(wait); +			prepare_to_wait(&log->fs_info->tree_log_wait, &wait, +					TASK_UNINTERRUPTIBLE); +			batch = log->fs_info->tree_log_batch; +			mutex_unlock(&log->fs_info->tree_log_mutex); +			if (atomic_read(&log->fs_info->tree_log_writers)) +				schedule(); +			mutex_lock(&log->fs_info->tree_log_mutex); +			finish_wait(&log->fs_info->tree_log_wait, &wait); +		} +		if (batch == log->fs_info->tree_log_batch) +			break; +	} +	ret = walk_log_tree(trans, log, &wc); +	BUG_ON(ret); + +	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc); +	BUG_ON(ret); + +	wc.wait = 1; + +	ret = walk_log_tree(trans, log, &wc); +	BUG_ON(ret); + +	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc); +	BUG_ON(ret); + +	btrfs_set_super_log_root(&root->fs_info->super_for_commit, +				 
log->fs_info->log_root_tree->node->start); +	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, +		       btrfs_header_level(log->fs_info->log_root_tree->node)); + +	write_ctree_super(trans, log->fs_info->tree_root); +	log->fs_info->tree_log_transid++; +	log->fs_info->tree_log_batch = 0; +	atomic_set(&log->fs_info->tree_log_commit, 0); +	smp_mb(); +	if (waitqueue_active(&log->fs_info->tree_log_wait)) +		wake_up(&log->fs_info->tree_log_wait); +out: +	mutex_unlock(&log->fs_info->tree_log_mutex); +	return 0; + +} + +/* + * free all the extents used by the tree log.  This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ +	int ret; +	struct btrfs_root *log; +	struct key; +	struct walk_control wc = { +		.free = 1, +		.process_func = process_one_buffer +	}; + +	if (!root->log_root) +		return 0; + +	log = root->log_root; +	ret = walk_log_tree(trans, log, &wc); +	BUG_ON(ret); + +	log = root->log_root; +	ret = btrfs_del_root(trans, root->fs_info->log_root_tree, +			     &log->root_key); +	BUG_ON(ret); +	root->log_root = NULL; +	kfree(root->log_root); +	return 0; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, +			   struct btrfs_root *log) +{ +	u64 bytenr = btrfs_root_bytenr(&log->root_item); +	int ret; + +	if (log->node->start == bytenr) +		return 0; + +	btrfs_set_root_bytenr(&log->root_item, log->node->start); +	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); +	ret = btrfs_update_root(trans, log->fs_info->log_root_tree, +				&log->root_key, &log->root_item); +	BUG_ON(ret); +	return ret; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X 
but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist.  But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked.  Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, +				 const char *name, int name_len, +				 struct inode *dir, u64 index) +{ +	struct btrfs_root *log; +	struct btrfs_dir_item *di; +	struct btrfs_path *path; +	int ret; +	int bytes_del = 0; + +	ret = join_running_log_trans(root); +	if (ret) +		return 0; + +	mutex_lock(&BTRFS_I(dir)->log_mutex); + +	log = root->log_root; +	path = btrfs_alloc_path(); +	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, +				   name, name_len, -1); +	if (di && !IS_ERR(di)) { +		ret = btrfs_delete_one_dir_name(trans, log, path, di); +		bytes_del += name_len; +		BUG_ON(ret); +	} +	btrfs_release_path(log, path); +	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, +					 index, name, name_len, -1); +	if (di && !IS_ERR(di)) { +		ret = btrfs_delete_one_dir_name(trans, log, path, di); +		bytes_del += name_len; +		BUG_ON(ret); +	} + +	/* update the directory size in the log to reflect the names +	 * we have removed +	 */ +	if (bytes_del) { +		struct btrfs_key key; + +		key.objectid = dir->i_ino; +		key.offset = 0; +		key.type = BTRFS_INODE_ITEM_KEY; +		btrfs_release_path(log, path); + +		ret = btrfs_search_slot(trans, log, &key, path, 0, 1); +		if (ret == 0) { +			struct btrfs_inode_item *item; +			u64 i_size; + +			item = btrfs_item_ptr(path->nodes[0], path->slots[0], +					      struct btrfs_inode_item); +		
	i_size = btrfs_inode_size(path->nodes[0], item); +			if (i_size > bytes_del) +				i_size -= bytes_del; +			else +				i_size = 0; +			btrfs_set_inode_size(path->nodes[0], item, i_size); +			btrfs_mark_buffer_dirty(path->nodes[0]); +		} else +			ret = 0; +		btrfs_release_path(log, path); +	} + +	btrfs_free_path(path); +	mutex_unlock(&BTRFS_I(dir)->log_mutex); +	end_log_trans(root); + +	return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, +			       struct btrfs_root *root, +			       const char *name, int name_len, +			       struct inode *inode, u64 dirid) +{ +	struct btrfs_root *log; +	u64 index; +	int ret; + +	ret = join_running_log_trans(root); +	if (ret) +		return 0; +	log = root->log_root; +	mutex_lock(&BTRFS_I(inode)->log_mutex); + +	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, +				  dirid, &index); +	mutex_unlock(&BTRFS_I(inode)->log_mutex); +	end_log_trans(root); + +	if (ret == 0 || ret == -ENOENT) +		return 0; +	return ret; +} + +/* + * creates a range item in the log for 'dirid'.  first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. 
+ */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, +				       struct btrfs_root *log, +				       struct btrfs_path *path, +				       int key_type, u64 dirid, +				       u64 first_offset, u64 last_offset) +{ +	int ret; +	struct btrfs_key key; +	struct btrfs_dir_log_item *item; + +	key.objectid = dirid; +	key.offset = first_offset; +	if (key_type == BTRFS_DIR_ITEM_KEY) +		key.type = BTRFS_DIR_LOG_ITEM_KEY; +	else +		key.type = BTRFS_DIR_LOG_INDEX_KEY; +	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); +	BUG_ON(ret); + +	item = btrfs_item_ptr(path->nodes[0], path->slots[0], +			      struct btrfs_dir_log_item); +	btrfs_set_dir_log_end(path->nodes[0], item, last_offset); +	btrfs_mark_buffer_dirty(path->nodes[0]); +	btrfs_release_path(log, path); +	return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory.  This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct inode *inode, +			  struct btrfs_path *path, +			  struct btrfs_path *dst_path, int key_type, +			  u64 min_offset, u64 *last_offset_ret) +{ +	struct btrfs_key min_key; +	struct btrfs_key max_key; +	struct btrfs_root *log = root->log_root; +	struct extent_buffer *src; +	int ret; +	int i; +	int nritems; +	u64 first_offset = min_offset; +	u64 last_offset = (u64)-1; + +	log = root->log_root; +	max_key.objectid = inode->i_ino; +	max_key.offset = (u64)-1; +	max_key.type = key_type; + +	min_key.objectid = inode->i_ino; +	min_key.type = key_type; +	min_key.offset = min_offset; + +	path->keep_locks = 1; + +	ret = btrfs_search_forward(root, &min_key, &max_key, +				   path, 0, trans->transid); + +	/* +	 * we didn't find anything from this transaction, see if there +	 * is anything at all +	 */ +	if (ret != 0 || min_key.objectid != inode->i_ino || +	    min_key.type 
!= key_type) { +		min_key.objectid = inode->i_ino; +		min_key.type = key_type; +		min_key.offset = (u64)-1; +		btrfs_release_path(root, path); +		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +		if (ret < 0) { +			btrfs_release_path(root, path); +			return ret; +		} +		ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + +		/* if ret == 0 there are items for this type, +		 * create a range to tell us the last key of this type. +		 * otherwise, there are no items in this directory after +		 * *min_offset, and we create a range to indicate that. +		 */ +		if (ret == 0) { +			struct btrfs_key tmp; +			btrfs_item_key_to_cpu(path->nodes[0], &tmp, +					      path->slots[0]); +			if (key_type == tmp.type) { +				first_offset = max(min_offset, tmp.offset) + 1; +			} +		} +		goto done; +	} + +	/* go backward to find any previous key */ +	ret = btrfs_previous_item(root, path, inode->i_ino, key_type); +	if (ret == 0) { +		struct btrfs_key tmp; +		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); +		if (key_type == tmp.type) { +			first_offset = tmp.offset; +			ret = overwrite_item(trans, log, dst_path, +					     path->nodes[0], path->slots[0], +					     &tmp); +		} +	} +	btrfs_release_path(root, path); + +	/* find the first key from this transaction again */ +	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +	if (ret != 0) { +		WARN_ON(1); +		goto done; +	} + +	/* +	 * we have a block from this transaction, log every item in it +	 * from our directory +	 */ +	while(1) { +		struct btrfs_key tmp; +		src = path->nodes[0]; +		nritems = btrfs_header_nritems(src); +		for (i = path->slots[0]; i < nritems; i++) { +			btrfs_item_key_to_cpu(src, &min_key, i); + +			if (min_key.objectid != inode->i_ino || +			    min_key.type != key_type) +				goto done; +			ret = overwrite_item(trans, log, dst_path, src, i, +					     &min_key); +			BUG_ON(ret); +		} +		path->slots[0] = nritems; + +		/* +		 * look ahead to the next item and see if it is 
also +		 * from this directory and from this transaction +		 */ +		ret = btrfs_next_leaf(root, path); +		if (ret == 1) { +			last_offset = (u64)-1; +			goto done; +		} +		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); +		if (tmp.objectid != inode->i_ino || tmp.type != key_type) { +			last_offset = (u64)-1; +			goto done; +		} +		if (btrfs_header_generation(path->nodes[0]) != trans->transid) { +			ret = overwrite_item(trans, log, dst_path, +					     path->nodes[0], path->slots[0], +					     &tmp); + +			BUG_ON(ret); +			last_offset = tmp.offset; +			goto done; +		} +	} +done: +	*last_offset_ret = last_offset; +	btrfs_release_path(root, path); +	btrfs_release_path(log, dst_path); + +	/* insert the log range keys to indicate where the log is valid */ +	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, +				 first_offset, last_offset); +	BUG_ON(ret); +	return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. 
+ */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct inode *inode, +			  struct btrfs_path *path, +			  struct btrfs_path *dst_path) +{ +	u64 min_key; +	u64 max_key; +	int ret; +	int key_type = BTRFS_DIR_ITEM_KEY; + +again: +	min_key = 0; +	max_key = 0; +	while(1) { +		ret = log_dir_items(trans, root, inode, path, +				    dst_path, key_type, min_key, +				    &max_key); +		BUG_ON(ret); +		if (max_key == (u64)-1) +			break; +		min_key = max_key + 1; +	} + +	if (key_type == BTRFS_DIR_ITEM_KEY) { +		key_type = BTRFS_DIR_INDEX_KEY; +		goto again; +	} +	return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode.  max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, +				  struct btrfs_root *log, +				  struct btrfs_path *path, +				  u64 objectid, int max_key_type) +{ +	int ret; +	struct btrfs_key key; +	struct btrfs_key found_key; + +	key.objectid = objectid; +	key.type = max_key_type; +	key.offset = (u64)-1; + +	while(1) { +		ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + +		if (ret != 1) +			break; + +		if (path->slots[0] == 0) +			break; + +		path->slots[0]--; +		btrfs_item_key_to_cpu(path->nodes[0], &found_key, +				      path->slots[0]); + +		if (found_key.objectid != objectid) +			break; + +		ret = btrfs_del_item(trans, log, path); +		BUG_ON(ret); +		btrfs_release_path(log, path); +	} +	btrfs_release_path(log, path); +	return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree.  
An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +			     struct btrfs_root *root, struct inode *inode, +			     int inode_only) +{ +	struct btrfs_path *path; +	struct btrfs_path *dst_path; +	struct btrfs_key min_key; +	struct btrfs_key max_key; +	struct btrfs_root *log = root->log_root; +	unsigned long src_offset; +	unsigned long dst_offset; +	struct extent_buffer *src; +	struct btrfs_file_extent_item *extent; +	struct btrfs_inode_item *inode_item; +	u32 size; +	int ret; + +	log = root->log_root; + +	path = btrfs_alloc_path(); +	dst_path = btrfs_alloc_path(); + +	min_key.objectid = inode->i_ino; +	min_key.type = BTRFS_INODE_ITEM_KEY; +	min_key.offset = 0; + +	max_key.objectid = inode->i_ino; +	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) +		max_key.type = BTRFS_XATTR_ITEM_KEY; +	else +		max_key.type = (u8)-1; +	max_key.offset = (u64)-1; + +	/* +	 * if this inode has already been logged and we're in inode_only +	 * mode, we don't want to delete the things that have already +	 * been written to the log. +	 * +	 * But, if the inode has been through an inode_only log, +	 * the logged_trans field is not set.  This allows us to catch +	 * any new names for this inode in the backrefs by logging it +	 * again +	 */ +	if (inode_only == LOG_INODE_EXISTS && +	    BTRFS_I(inode)->logged_trans == trans->transid) { +		btrfs_free_path(path); +		btrfs_free_path(dst_path); +		goto out; +	} +	mutex_lock(&BTRFS_I(inode)->log_mutex); + +	/* +	 * a brute force approach to making sure we get the most uptodate +	 * copies of everything. 
+	 */ +	if (S_ISDIR(inode->i_mode)) { +		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + +		if (inode_only == LOG_INODE_EXISTS) +			max_key_type = BTRFS_XATTR_ITEM_KEY; +		ret = drop_objectid_items(trans, log, path, +					  inode->i_ino, max_key_type); +	} else { +		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); +	} +	BUG_ON(ret); +	path->keep_locks = 1; + +	while(1) { +		ret = btrfs_search_forward(root, &min_key, &max_key, +					   path, 0, trans->transid); +		if (ret != 0) +			break; + +		if (min_key.objectid != inode->i_ino) +			break; +		if (min_key.type > max_key.type) +			break; + +		src = path->nodes[0]; +		size = btrfs_item_size_nr(src, path->slots[0]); +		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key, +					      size); +		if (ret) +			BUG(); + +		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], +						   dst_path->slots[0]); + +		src_offset = btrfs_item_ptr_offset(src, path->slots[0]); + +		copy_extent_buffer(dst_path->nodes[0], src, dst_offset, +				   src_offset, size); + +		if (inode_only == LOG_INODE_EXISTS && +		    min_key.type == BTRFS_INODE_ITEM_KEY) { +			inode_item = btrfs_item_ptr(dst_path->nodes[0], +						    dst_path->slots[0], +						    struct btrfs_inode_item); +			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + +			/* set the generation to zero so the recover code +			 * can tell the difference between an logging +			 * just to say 'this inode exists' and a logging +			 * to say 'update this inode with these values' +			 */ +			btrfs_set_inode_generation(dst_path->nodes[0], +						   inode_item, 0); +		} +		/* take a reference on file data extents so that truncates +		 * or deletes of this inode don't have to relog the inode +		 * again +		 */ +		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) { +			int found_type; +			extent = btrfs_item_ptr(src, path->slots[0], +						struct btrfs_file_extent_item); + +			found_type = btrfs_file_extent_type(src, extent); +			if (found_type == 
BTRFS_FILE_EXTENT_REG) { +				u64 ds = btrfs_file_extent_disk_bytenr(src, +								   extent); +				u64 dl = btrfs_file_extent_disk_num_bytes(src, +								      extent); +				/* ds == 0 is a hole */ +				if (ds != 0) { +					ret = btrfs_inc_extent_ref(trans, log, +						   ds, dl, +						   log->root_key.objectid, +						   0, +						   inode->i_ino, +						   min_key.offset); +					BUG_ON(ret); +				} +			} +		} + +		btrfs_mark_buffer_dirty(dst_path->nodes[0]); +		btrfs_release_path(root, path); +		btrfs_release_path(log, dst_path); + +		if (min_key.offset < (u64)-1) +			min_key.offset++; +		else if (min_key.type < (u8)-1) +			min_key.type++; +		else if (min_key.objectid < (u64)-1) +			min_key.objectid++; +		else +			break; +	} +	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { +		btrfs_release_path(root, path); +		btrfs_release_path(log, dst_path); +		ret = log_directory_changes(trans, root, inode, path, dst_path); +		BUG_ON(ret); +	} +	mutex_unlock(&BTRFS_I(inode)->log_mutex); + +	btrfs_free_path(path); +	btrfs_free_path(dst_path); + +	mutex_lock(&root->fs_info->tree_log_mutex); +	ret = update_log_root(trans, log); +	BUG_ON(ret); +	mutex_unlock(&root->fs_info->tree_log_mutex); +out: +	return 0; +} + +int btrfs_log_inode(struct btrfs_trans_handle *trans, +		    struct btrfs_root *root, struct inode *inode, +		    int inode_only) +{ +	int ret; + +	start_log_trans(trans, root); +	ret = __btrfs_log_inode(trans, root, inode, inode_only); +	end_log_trans(root); +	return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log.  
A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */
/*
 * The dentry itself is logged with LOG_INODE_ALL, every parent visited
 * afterwards with LOG_INODE_EXISTS.  The walk stops at the top of the
 * dentry tree, at a parent without an inode, at a parent on another
 * super block, or at a parent whose generation is not newer than the
 * last committed transaction.
 *
 * Always returns 0; a failure to log any inode hits BUG_ON.
 */
int btrfs_log_dentry(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct dentry *dentry)
{
	int inode_only = LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *parent;
	int ret;

	start_log_trans(trans, root);
	sb = dentry->d_inode->i_sb;
	while (1) {
		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
					inode_only);
		BUG_ON(ret);
		inode_only = LOG_INODE_EXISTS;

		parent = dentry->d_parent;
		/*
		 * a root dentry is its own parent; without this check the
		 * loop would relog the same inode forever whenever the
		 * generation test below does not end the walk
		 */
		if (!parent || parent == dentry)
			break;
		if (!parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (BTRFS_I(parent->d_inode)->generation <=
		    root->fs_info->last_trans_committed)
			break;

		dentry = parent;
	}
	end_log_trans(root);
	return 0;
}

/* + * it is not safe to log dentry if the chunk root has added new + * chunks.  This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. 
+ */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct dentry *dentry) +{ +	u64 gen; +	gen = root->fs_info->last_trans_new_blockgroup; +	if (gen > root->fs_info->last_trans_committed) +		return 1; +	else +		return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ +	int ret; +	struct btrfs_path *path; +	struct btrfs_trans_handle *trans; +	struct btrfs_key key; +	struct btrfs_key found_key; +	struct btrfs_key tmp_key; +	struct btrfs_root *log; +	struct btrfs_fs_info *fs_info = log_root_tree->fs_info; +	struct walk_control wc = { +		.process_func = process_one_buffer, +		.stage = 0, +	}; + +	fs_info->log_root_recovering = 1; +	path = btrfs_alloc_path(); +	BUG_ON(!path); + +	trans = btrfs_start_transaction(fs_info->tree_root, 1); + +	wc.trans = trans; +	wc.pin = 1; + +	walk_log_tree(trans, log_root_tree, &wc); + +again: +	key.objectid = BTRFS_TREE_LOG_OBJECTID; +	key.offset = (u64)-1; +	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + +	while(1) { +		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); +		if (ret < 0) +			break; +		if (ret > 0) { +			if (path->slots[0] == 0) +				break; +			path->slots[0]--; +		} +		btrfs_item_key_to_cpu(path->nodes[0], &found_key, +				      path->slots[0]); +		btrfs_release_path(log_root_tree, path); +		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) +			break; + +		log = btrfs_read_fs_root_no_radix(log_root_tree, +						  &found_key); +		BUG_ON(!log); + + +		tmp_key.objectid = found_key.offset; +		tmp_key.type = BTRFS_ROOT_ITEM_KEY; +		tmp_key.offset = (u64)-1; + +		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + +		BUG_ON(!wc.replay_dest); + +		btrfs_record_root_in_trans(wc.replay_dest); +		ret = walk_log_tree(trans, log, &wc); +		BUG_ON(ret); + +		if (wc.stage == LOG_WALK_REPLAY_ALL) { +			ret = 
fixup_inode_link_counts(trans, wc.replay_dest, +						      path); +			BUG_ON(ret); +		} + +		key.offset = found_key.offset - 1; +		free_extent_buffer(log->node); +		kfree(log); + +		if (found_key.offset == 0) +			break; +	} +	btrfs_release_path(log_root_tree, path); + +	/* step one is to pin it all, step two is to replay just inodes */ +	if (wc.pin) { +		wc.pin = 0; +		wc.process_func = replay_one_buffer; +		wc.stage = LOG_WALK_REPLAY_INODES; +		goto again; +	} +	/* step three is to replay everything */ +	if (wc.stage < LOG_WALK_REPLAY_ALL) { +		wc.stage++; +		goto again; +	} + +	btrfs_free_path(path); + +	free_extent_buffer(log_root_tree->node); +	log_root_tree->log_root = NULL; +	fs_info->log_root_recovering = 0; + +	/* step 4: commit the transaction, which also unpins the blocks */ +	btrfs_commit_transaction(trans, fs_info->tree_root); + +	kfree(log_root_tree); +	return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000000..b9409b32ed0 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Oracle.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, +		   struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, +		    struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, +		    struct btrfs_root *root, struct inode *inode, +		    int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, +				 const char *name, int name_len, +				 struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, +			       struct btrfs_root *root, +			       const char *name, int name_len, +			       struct inode *inode, u64 dirid); +#endif  |