Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile        |   2
-rw-r--r--  fs/btrfs/TODO            |  20
-rw-r--r--  fs/btrfs/async-thread.c  |  10
-rw-r--r--  fs/btrfs/async-thread.h  |   7
-rw-r--r--  fs/btrfs/bit-radix.c     | 130
-rw-r--r--  fs/btrfs/bit-radix.h     |  33
-rw-r--r--  fs/btrfs/btrfs_inode.h   |  54
-rw-r--r--  fs/btrfs/crc32c.h        |  18
-rw-r--r--  fs/btrfs/ctree.c         | 127
-rw-r--r--  fs/btrfs/ctree.h         |   1
-rw-r--r--  fs/btrfs/dir-item.c      |  41
-rw-r--r--  fs/btrfs/disk-io.c       |  33
-rw-r--r--  fs/btrfs/extent_io.c     |  34
-rw-r--r--  fs/btrfs/extent_map.c    |  10
-rw-r--r--  fs/btrfs/file.c          |  44
-rw-r--r--  fs/btrfs/inode.c         | 189
-rw-r--r--  fs/btrfs/locking.c       |  13
-rw-r--r--  fs/btrfs/ordered-data.c  |  19
-rw-r--r--  fs/btrfs/ref-cache.c     |  26
-rw-r--r--  fs/btrfs/ref-cache.h     |   3
-rw-r--r--  fs/btrfs/root-tree.c     |  21
-rw-r--r--  fs/btrfs/struct-funcs.c  |  21
-rw-r--r--  fs/btrfs/super.c         |   3
-rw-r--r--  fs/btrfs/transaction.c   |  67
-rw-r--r--  fs/btrfs/tree-defrag.c   |   4
25 files changed, 653 insertions(+), 277 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d5c28557fba..48b7909ca8d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)  obj-m  := btrfs.o  btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   file-item.o inode-item.o inode-map.o disk-io.o \ -	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \ +	   transaction.o inode.o file.o tree-defrag.o \  	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \  	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO deleted file mode 100644 index d9b6d38c603..00000000000 --- a/fs/btrfs/TODO +++ /dev/null @@ -1,20 +0,0 @@ -* cleanup, add more error checking, get rid of BUG_ONs -* Fix ENOSPC handling -* Make allocator smarter -* add a block group to struct inode -* Do actual block accounting -* Check compat and incompat flags on the inode -* Get rid of struct ctree_path, limiting tree levels held at one time -* Add generation number to key pointer in nodes -* Add generation number to inode -* forbid cross subvolume renames and hardlinks -* Release -* Do real tree locking -* Add extent mirroring (backup copies of blocks) -* Add fancy interface to get access to incremental backups -* Add fancy striped extents to make big reads faster -* Use relocation to try and fix write errors -* Make allocator much smarter -* xattrs (directory streams for regular files) -* Scrub & defrag - diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4e780b279de..04fb9702d14 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)  	/*  	 * if we pick a busy task, move the task to the end of the list. -	 * hopefully this will keep things somewhat evenly balanced +	 * hopefully this will keep things somewhat evenly balanced. +	 * Do the move in batches based on the sequence number.  This groups +	 * requests submitted at roughly the same time onto the same worker.  	 */  	next = workers->worker_list.next;  	worker = list_entry(next, struct btrfs_worker_thread, worker_list);  	atomic_inc(&worker->num_pending);  	worker->sequence++; +  	if (worker->sequence % workers->idle_thresh == 0)  		list_move_tail(next, &workers->worker_list);  	return worker;  } +/* + * selects a worker thread to take the next job.  This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */  static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)  {  	struct btrfs_worker_thread *worker; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 43e44d115dd..4ec9a2ee0f9 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -63,14 +63,17 @@ struct btrfs_workers {  	/* once a worker has this many requests or fewer, it is idle */  	int idle_thresh; -	/* list with all the work threads */ +	/* list with all the work threads.  The workers on the idle thread +	 * may be actively servicing jobs, but they haven't yet hit the +	 * idle thresh limit above. 
+	 */  	struct list_head worker_list;  	struct list_head idle_list;  	/* lock for finding the next worker thread to queue on */  	spinlock_t lock; -	/* extra name for this worker */ +	/* extra name for this worker, used for current->name */  	char *name;  }; diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c deleted file mode 100644 index e8bf876db39..00000000000 --- a/fs/btrfs/bit-radix.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (C) 2007 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "bit-radix.h" - -#define BIT_ARRAY_BYTES 256 -#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8) - -extern struct kmem_cache *btrfs_bit_radix_cachep; -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ -	unsigned long *bits; -	unsigned long slot; -	int bit_slot; -	int ret; - -	slot = bit / BIT_RADIX_BITS_PER_ARRAY; -	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - -	bits = radix_tree_lookup(radix, slot); -	if (!bits) { -		bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS); -		if (!bits) -			return -ENOMEM; -		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long)); -		bits[0] = slot; -		ret = radix_tree_insert(radix, slot, bits); -		if (ret) -			return ret; -	} -	ret = test_and_set_bit(bit_slot, bits + 1); -	if (ret < 0) -		ret = 1; -	return ret; -} - -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ -	unsigned long *bits; -	unsigned long slot; -	int bit_slot; - -	slot = bit / BIT_RADIX_BITS_PER_ARRAY; -	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - -	bits = radix_tree_lookup(radix, slot); -	if (!bits) -		return 0; -	return test_bit(bit_slot, bits + 1); -} - -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ -	unsigned long *bits; -	unsigned long slot; -	int bit_slot; -	int i; -	int empty = 1; - -	slot = bit / BIT_RADIX_BITS_PER_ARRAY; -	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - -	bits = radix_tree_lookup(radix, slot); -	if (!bits) -		return 0; -	clear_bit(bit_slot, bits + 1); -	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) { -		if (bits[i]) { -			empty = 0; -			break; -		} -	} -	if (empty) { -		bits = radix_tree_delete(radix, slot); -		BUG_ON(!bits); -		kmem_cache_free(btrfs_bit_radix_cachep, bits); -	} -	return 0; -} - -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, -			 unsigned long start, int nr) -{ -	unsigned long *bits; -	unsigned long *gang[4]; -	int found; -	int ret; -	int i; -	int total_found = 0; -	unsigned long slot; - -	slot = start / BIT_RADIX_BITS_PER_ARRAY; -	ret = radix_tree_gang_lookup(radix, (void **)gang, slot, -				     ARRAY_SIZE(gang)); -	found = start % BIT_RADIX_BITS_PER_ARRAY; -	for (i = 0; i < ret && nr > 0; i++) { -		bits = gang[i]; -		while(nr > 0) { -			found = find_next_bit(bits + 1, -					      BIT_RADIX_BITS_PER_ARRAY, -					      found); -			if (found < 
BIT_RADIX_BITS_PER_ARRAY) { -				*retbits = bits[0] * -					BIT_RADIX_BITS_PER_ARRAY + found; -				retbits++; -				nr--; -				total_found++; -				found++; -			} else -				break; -		} -		found = 0; -	} -	return total_found; -} diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h deleted file mode 100644 index c100f54d5c3..00000000000 --- a/fs/btrfs/bit-radix.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2007 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BIT_RADIX__ -#define __BIT_RADIX__ -#include <linux/radix-tree.h> - -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, -			 unsigned long start, int nr); - -static inline void init_bit_radix(struct radix_tree_root *radix) -{ -	INIT_RADIX_TREE(radix, GFP_NOFS); -} -#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0577fda2168..0b2e623cf42 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -25,27 +25,58 @@  /* in memory btrfs inode */  struct btrfs_inode { +	/* which subvolume this inode belongs to */  	struct btrfs_root *root; + +	/* the block group preferred for allocations.  This pointer is buggy +	 * and needs to be replaced with a bytenr instead +	 */  	struct btrfs_block_group_cache *block_group; + +	/* key used to find this inode on disk.  This is used by the code +	 * to read in roots of subvolumes +	 */  	struct btrfs_key location; + +	/* the extent_tree has caches of all the extent mappings to disk */  	struct extent_map_tree extent_tree; + +	/* the io_tree does range state (DIRTY, LOCKED etc) */  	struct extent_io_tree io_tree; + +	/* special utility tree used to record which mirrors have already been +	 * tried when checksums fail for a given block +	 */  	struct extent_io_tree io_failure_tree; + +	/* held while inserting checksums to avoid races */  	struct mutex csum_mutex; + +	/* held while inserting or deleting extents from files */  	struct mutex extent_mutex; + +	/* held while logging the inode in tree-log.c */  	struct mutex log_mutex; -	struct inode vfs_inode; + +	/* used to order data wrt metadata */  	struct btrfs_ordered_inode_tree ordered_tree; +	/* standard acl pointers */  	struct posix_acl *i_acl;  	struct posix_acl *i_default_acl;  	/* for keeping track of orphaned inodes */  	struct list_head i_orphan; +	/* list of all the delalloc inodes in the FS.  There are times we need +	 * to write all the delalloc pages to disk, and this list is used +	 * to walk them all. +	 */  	struct list_head delalloc_inodes; -	/* full 64 bit generation number */ +	/* full 64 bit generation number, struct vfs_inode doesn't have a big +	 * enough field for this. 
+	 */  	u64 generation;  	/* @@ -57,10 +88,25 @@ struct btrfs_inode {  	 */  	u64 logged_trans; -	/* trans that last made a change that should be fully fsync'd */ +	/* +	 * trans that last made a change that should be fully fsync'd.  This +	 * gets reset to zero each time the inode is logged +	 */  	u64 log_dirty_trans; + +	/* total number of bytes pending delalloc, used by stat to calc the +	 * real block usage of the file +	 */  	u64 delalloc_bytes; + +	/* +	 * the size of the file stored in the metadata on disk.  data=ordered +	 * means the in-memory i_size might be larger than the size on disk +	 * because not all the blocks are written yet. +	 */  	u64 disk_i_size; + +	/* flags field from the on disk inode */  	u32 flags;  	/* @@ -68,6 +114,8 @@ struct btrfs_inode {  	 * number for new files that are created  	 */  	u64 index_cnt; + +	struct inode vfs_inode;  };  static inline struct btrfs_inode *BTRFS_I(struct inode *inode) diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h index 4f0fefed132..1eaf11d334f 100644 --- a/fs/btrfs/crc32c.h +++ b/fs/btrfs/crc32c.h @@ -1,3 +1,21 @@ +/* + * Copyright (C) 2008 Oracle.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +  #ifndef __BTRFS_CRC32C__  #define __BTRFS_CRC32C__  #include <asm/byteorder.h> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50e81f43e6d..ff3261ff2e1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,5 +1,5 @@  /* - * Copyright (C) 2007 Oracle.  All rights reserved. + * Copyright (C) 2007,2008 Oracle.  All rights reserved.   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public @@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void)  	return path;  } +/* this also releases the path */  void btrfs_free_path(struct btrfs_path *p)  {  	btrfs_release_path(NULL, p);  	kmem_cache_free(btrfs_path_cachep, p);  } +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that have no locks or extent buffers held. + */  void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)  {  	int i; @@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)  	}  } +/* + * safely gets a reference on the root node of a tree.  A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree.  See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear.  It may stop being the root of the tree + * at any time because there are no locks held. 
+ */  struct extent_buffer *btrfs_root_node(struct btrfs_root *root)  {  	struct extent_buffer *eb; @@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)  	return eb;  } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root.  A locked buffer + * is returned, with a reference held. + */  struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)  {  	struct extent_buffer *eb; @@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)  	return eb;  } +/* cowonly roots (everything not a reference counted cow subvolume) just get + * put onto a simple dirty list.  transaction.c walks this to make sure they + * get properly updated on disk. + */  static void add_root_to_dirty_list(struct btrfs_root *root)  {  	if (root->track_dirty && list_empty(&root->dirty_list)) { @@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root)  	}  } +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid.  The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */  int btrfs_copy_root(struct btrfs_trans_handle *trans,  		      struct btrfs_root *root,  		      struct extent_buffer *buf, @@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	return 0;  } +/* + * does the dirty work in cow of a single block.  The parent block + * (if supplied) is updated to point to the new cow copy.  The new + * buffer is marked dirty and returned locked.  If you modify the block + * it needs to be marked dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes + * the allocator should try to find free next to the block it returns.  This is + * just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent + * is used to finish the allocation. + */  int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root,  			     struct extent_buffer *buf, @@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,  	return 0;  } +/* + * cows a single block, see __btrfs_cow_block for the real work. 
+ * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */  int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,  		    struct btrfs_root *root, struct extent_buffer *buf,  		    struct extent_buffer *parent, int parent_slot, @@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,  	return ret;  } +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */  static int close_blocks(u64 blocknr, u64 other, u32 blocksize)  {  	if (blocknr < other && other - (blocknr + blocksize) < 32768) @@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)  } +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */  int btrfs_realloc_node(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct extent_buffer *parent,  		       int start_slot, int cache_only, u64 *last_ret, @@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,  	return btrfs_item_offset_nr(leaf, nr - 1);  } +/* + * extra debugging checks to make sure all the items in a node are + * well formed and in the proper order + */  static int check_node(struct btrfs_root *root, struct btrfs_path *path,  		      int level)  { @@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,  	return 0;  } +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */  static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,  		      int level)  { @@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,  	return -1;  } +/* given a node and slot number, this reads the block it points to.  The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */  static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,  				   struct extent_buffer *parent, int slot)  { @@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,  		       btrfs_node_ptr_generation(parent, slot));  } +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion.  We balance from the top down, so we have to make sure + * that a deletion won't leave a node completely empty later on. + */  static noinline int balance_level(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 struct btrfs_path *path, int level) @@ -1024,7 +1101,10 @@ enospc:  	return ret;  } -/* returns zero if the push worked, non-zero otherwise */ +/* Node balancing for insertion.  Here we only split or push nodes around + * when they are completely full.  This is also done top down, so we + * have to be pessimistic. + */  static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,  					  struct btrfs_root *root,  					  struct btrfs_path *path, int level) @@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,  }  /* - * readahead one full node of leaves + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them.   
*/  static noinline void reada_for_search(struct btrfs_root *root,  				      struct btrfs_path *path, @@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root,  	}  } +/* + * when we walk down the tree, it is usually safe to unlock the higher layers in + * the tree.  The exceptions are when our path goes through slot 0, because operations + * on the tree might require changing key pointers higher up in the tree. + * + * callers might also have set path->keep_locks, which tells this code to + * keep the lock if the path points to the last slot in the block.  This is + * part of walking through the tree, and selecting the next slot in the higher + * block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. + * so if lowest_unlock is 1, level 0 won't be unlocked + */  static noinline void unlock_up(struct btrfs_path *path, int level,  			       int lowest_unlock)  { @@ -2705,6 +2799,12 @@ again:  	return ret;  } +/* + * make the item pointed to by the path smaller.  new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */  int btrfs_truncate_item(struct btrfs_trans_handle *trans,  			struct btrfs_root *root,  			struct btrfs_path *path, @@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,  	return ret;  } +/* + * make the item pointed to by the path bigger, data_size is the new size. + */  int btrfs_extend_item(struct btrfs_trans_handle *trans,  		      struct btrfs_root *root, struct btrfs_path *path,  		      u32 data_size) @@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,  }  /* - * Given a key and some data, insert an item into the tree. + * Given a key and some data, insert items into the tree.   * This does all the path init required, making room in the tree if needed.   */  int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root  /*   * delete the pointer from a given node.   * - * If the delete empties a node, the node is removed from the tree, - * continuing all the way the root if required.  The root is converted into - * a leaf if all the nodes are emptied. + * the tree should have been previously balanced so the deletion does not + * empty a node.   */  static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		   struct btrfs_path *path, int level, int slot) @@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,   * search the tree again to find a leaf with lesser keys   * returns 0 if it found something or 1 if there are no lesser leaves.   * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it.   */  int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  { @@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  /*   * A helper function to walk down the tree starting at min_key, and looking   * for nodes or leaves that are either in cache or have a minimum - * transaction id.  This is used by the btree defrag code, but could - * also be used to search for blocks that have changed since a given - * transaction id. + * transaction id.  
This is used by the btree defrag code, and tree logging   *   * This does not cow, but it does stuff the starting key it finds back   * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)   * This honors path->lowest_level to prevent descent past a given level   * of the tree.   * + * min_trans indicates the oldest transaction that you are interested + * in walking through.  Any nodes or leaves older than min_trans are + * skipped over (without reading them). + *   * returns zero if something useful was found, < 0 on error and 1 if there   * was nothing in the tree that matched the search criteria.   */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0079b60b18f..ded1643c027 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,7 +27,6 @@  #include <linux/backing-dev.h>  #include <linux/wait.h>  #include <asm/kmap_types.h> -#include "bit-radix.h"  #include "extent_io.h"  #include "extent_map.h"  #include "async-thread.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e4f30090d64..5040b71f190 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,14 @@  #include "hash.h"  #include "transaction.h" +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision.  data_size indicates how big the item inserted should be.  On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */  static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle  						   *trans,  						   struct btrfs_root *root, @@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle  	return (struct btrfs_dir_item *)ptr;  } +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */  int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,  			    struct btrfs_root *root, const char *name,  			    u16 name_len, const void *data, u16 data_len, @@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,  	return ret;  } +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */  int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root  			  *root, const char *name, int name_len, u64 dir,  			  struct btrfs_key *location, u8 type, u64 index) @@ -184,6 +203,11 @@ out:  	return 0;  } +/* + * lookup a directory item based on name.  'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */  struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,  					     struct btrfs_root *root,  					     struct btrfs_path *path, u64 dir, @@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,  	return btrfs_match_dir_item_name(root, path, name, name_len);  } +/* + * lookup a directory item based on index.  
'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */  struct btrfs_dir_item *  btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,  			    struct btrfs_root *root, @@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,  	return btrfs_match_dir_item_name(root, path, name, name_len);  } +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */  struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,  			      struct btrfs_path *path,  			      const char *name, int name_len) @@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,  	return NULL;  } +/* + * given a pointer into a directory item, delete it.  This + * handles items that have more than one entry in them. + */  int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root,  			      struct btrfs_path *path, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 45b4f728527..5ee10d3136f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)  static struct extent_io_ops btree_extent_io_ops;  static void end_workqueue_fn(struct btrfs_work *work); +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete.  This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */  struct end_io_wq {  	struct bio *bio;  	bio_end_io_t *end_io; @@ -66,6 +71,11 @@ struct end_io_wq {  	struct btrfs_work work;  }; +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads.  They checksum file and metadata bios + * just before they are sent down the IO stack. + */  struct async_submit_bio {  	struct inode *inode;  	struct bio *bio; @@ -76,6 +86,10 @@ struct async_submit_bio {  	struct btrfs_work work;  }; +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */  struct extent_map *btree_get_extent(struct inode *inode, struct page *page,  				    size_t page_offset, u64 start, u64 len,  				    int create) @@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result)  	*(__le32 *)result = ~cpu_to_le32(crc);  } +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */  static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,  			   int verify)  { @@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,  	return 0;  } +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer.  This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. 
+ */  static int verify_parent_transid(struct extent_io_tree *io_tree,  				 struct extent_buffer *eb, u64 parent_transid)  { @@ -228,9 +252,12 @@ out:  	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,  		      GFP_NOFS);  	return ret; -  } +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */  static int btree_read_extent_buffer_pages(struct btrfs_root *root,  					  struct extent_buffer *eb,  					  u64 start, u64 parent_transid) @@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror  	return -EIO;  } +/* + * checksum a dirty tree block before IO.  This has extra checks to make + * sure we only fill in the checksum field in the first page of a multi-page block + */  int csum_dirty_buffer(struct btrfs_root *root, struct page *page)  {  	struct extent_io_tree *tree; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8bd1b402f3f..563b2d12f4f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)  }  EXPORT_SYMBOL(wait_on_extent_writeback); +/* + * either insert or lock the state struct between start and end; use mask to tell + * us if waiting is desired. + */  int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)  {  	int err; @@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)  }  EXPORT_SYMBOL(set_range_writeback); +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,  			  u64 *start_ret, u64 *end_ret, int bits)  { @@ -1017,6 +1028,10 @@ out:  }  EXPORT_SYMBOL(find_first_extent_bit); +/* find the first state struct with 'bits' set after 'start', and + * return it.  tree->lock must be held.  NULL will be returned if + * nothing was found after 'start' + */  struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,  						 u64 start, int bits)  { @@ -1046,8 +1061,14 @@ out:  }  EXPORT_SYMBOL(find_first_extent_bit_state); -u64 find_lock_delalloc_range(struct extent_io_tree *tree, -			     u64 *start, u64 *end, u64 max_bytes) +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'.  start and end are used to return the range. + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, +					     u64 *start, u64 *end, u64 max_bytes)  {  	struct rb_node *node;  	struct extent_state *state; @@ -1130,6 +1151,11 @@ out:  	return found;  } +/* + * count the number of bytes in the tree that have a given bit(s) + * set.  This can be fairly slow, except for EXTENT_DIRTY which is + * cached.  The total number found is returned. + */  u64 count_range_bits(struct extent_io_tree *tree,  		     u64 *start, u64 search_end, u64 max_bytes,  		     unsigned long bits) @@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)  }  EXPORT_SYMBOL(unlock_range); +/* + * set the private field for a given byte offset in the tree.  If there isn't + * an extent_state there already, this does nothing. 
+ */  int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)  {  	struct rb_node *node; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 78ced11d18c..74b2a29880d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,  	return NULL;  } +/* + * search through the tree for an extent_map with a given offset.  If + * it can't be found, try to find some neighboring extents + */  static struct rb_node *__tree_search(struct rb_root *root, u64 offset,  				     struct rb_node **prev_ret,  				     struct rb_node **next_ret) @@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,  	return NULL;  } +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */  static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)  {  	struct rb_node *prev; @@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)  	return ret;  } +/* check to see if two extent_map structs are adjacent and safe to merge */  static int mergable_maps(struct extent_map *prev, struct extent_map *next)  {  	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -250,6 +259,7 @@ out:  }  EXPORT_SYMBOL(add_extent_mapping); +/* simple helper to do math around the end of an extent, handling wrap */  static u64 range_end(u64 start, u64 len)  {  	if (start + len < start) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1b7e51a9db0..3088a118448 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,9 @@  #include "compat.h" +/* simple helper to fault in pages and copy.  This should go away + * and be replaced with calls into generic code. + */  static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,  					 int write_bytes,  					 struct page **prepared_pages, @@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,  	return page_fault ? -EFAULT : 0;  } +/* + * unlocks pages after btrfs_file_write is done with them + */  static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)  {  	size_t i;  	for (i = 0; i < num_pages; i++) {  		if (!pages[i])  			break; +		/* page checked is some magic around finding pages that +		 * have been modified without going through btrfs_set_page_dirty +		 * clear it here +		 */  		ClearPageChecked(pages[i]);  		unlock_page(pages[i]);  		mark_page_accessed(pages[i]); @@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)  	}  } +/* this does all the hard work for inserting an inline extent into + * the btree.  Any existing inline extent is extended as required to make room, + * otherwise things are inserted as required into the btree + */  static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, struct inode *inode,  				u64 offset, size_t size, @@ -228,6 +242,14 @@ fail:  	return err;  } +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. 
+ */  static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct file *file, @@ -362,6 +384,10 @@ out_unlock:  	return err;  } +/* + * this drops all the extents in the cache that intersect the range + * [start, end].  Existing extents are split as required. + */  int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			    int skip_pinned)  { @@ -536,6 +562,9 @@ out:   * If an extent intersects the range but is not entirely inside the range   * it is either truncated or split.  Anything entirely inside the range   * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. + */  int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct inode *inode, @@ -796,7 +825,9 @@ out:  }  /* - * this gets pages into the page cache and locks them down + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified.   */  static int noinline prepare_pages(struct btrfs_root *root, struct file *file,  			 struct page **pages, size_t num_pages, @@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)  	return 0;  } +/* + * fsync call for both files and directories.  This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit.  This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */  int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)  {  	struct inode *inode = dentry->d_inode; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 404704d2682..f3abecc2d14 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {  static void btrfs_truncate(struct inode *inode); +/* + * a very lame attempt at stopping writes when the FS is 85% full.  There + * are countless ways this is incorrect, but it is better than nothing. + */  int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,  			   int for_del)  { @@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,  	return ret;  } +/* + * when extent_io.c finds a delayed allocation range in the file, + * the callbacks end up in this code.  The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + */  static int cow_file_range(struct inode *inode, u64 start, u64 end)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -185,6 +195,13 @@ out:  	return ret;  } +/* + * the nocow writeback callback.  This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. 
+ * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */  static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)  {  	u64 extent_start; @@ -291,6 +308,9 @@ out:  	return err;  } +/* + * extent_io.c callback to do delayed allocation processing + */  static int run_delalloc_range(struct inode *inode, u64 start, u64 end)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)  	return ret;  } +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */  int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,  		       unsigned long old, unsigned long bits)  { @@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,  	return 0;  } +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */  int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,  			 unsigned long old, unsigned long bits)  { @@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,  	return 0;  } +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  			 size_t size, struct bio *bio)  { @@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  	return 0;  } +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time.   All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record + * are inserted into the btree + */  int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  			  int mirror_num)  { @@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	return btrfs_map_bio(root, rw, bio, mirror_num, 1);  } +/* + * extent_io.c submission hook. This does the right thing for csum calculation on write, + * or reading the csums from the tree before a read + */  int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  			  int mirror_num)  { @@ -408,6 +452,10 @@ mapit:  	return btrfs_map_bio(root, rw, bio, mirror_num, 0);  } +/* + * given a list of ordered sums record them in the inode.  This happens + * at IO completion time based on sums calculated at bio submission time. + */  static noinline int add_pending_csums(struct btrfs_trans_handle *trans,  			     struct inode *inode, u64 file_offset,  			     struct list_head *list) @@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)  				   GFP_NOFS);  } +/* see btrfs_writepage_start_hook for details on why this is required */  struct btrfs_writepage_fixup {  	struct page *page;  	struct btrfs_work work;  }; -/* see btrfs_writepage_start_hook for details on why this is required */  void btrfs_writepage_fixup_worker(struct btrfs_work *work)  {  	struct btrfs_writepage_fixup *fixup; @@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)  	return -EAGAIN;  } +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers is + * fully written. 
+ */  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,  	return btrfs_finish_ordered_io(page->mapping->host, start, end);  } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data.  This + * io_failure_record is used to record state as we go through all the + * mirrors.  If another mirror has good data, the page is set up to date + * and things continue.  If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */  struct io_failure_record {  	struct page *page;  	u64 start; @@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,  	return 0;  } +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */  int btrfs_clean_io_failures(struct inode *inode, u64 start)  {  	u64 private; @@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)  	return 0;  } +/* + * when reads are done, we need to check csums to verify the data is correct. + * If there's a match, we allow the bio to finish.  If not, we go through + * the io_failure_record routines to find good copies + */  int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  			       struct extent_state *state)  { @@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)  	btrfs_free_path(path);  } +/* + * read an inode from the btree into the in-memory inode + */  void btrfs_read_locked_inode(struct inode *inode)  {  	struct btrfs_path *path; @@ -1083,6 +1155,9 @@ make_bad:  	make_bad_inode(inode);  } +/* + * given a leaf and an inode, copy the inode fields into the leaf + */  static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct extent_buffer *leaf,  			    struct btrfs_inode_item *item, @@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  				    BTRFS_I(inode)->block_group->key.objectid);  } +/* + * copy everything in the in-memory inode into the btree. + */  int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root,  			      struct inode *inode) @@ -1151,6 +1229,11 @@ failed:  } +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code.  It removes a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */  int btrfs_unlink_inode(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root,  		       struct inode *dir, struct inode *inode, @@ -1309,7 +1392,7 @@ fail:  /*   * this can truncate away extent items, csum items and directory items.   * It starts at a high offset and removes keys until it can't find - * any higher than i_size. + * any higher than new_size   *   * csum items that cross the new i_size are truncated to the new size   * as well. 
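The io_failure_record comments above describe the mirror-retry pattern btrfs uses for data reads, and btree_read_extent_buffer_pages earlier in this patch does the same thing for metadata. A rough, self-contained sketch of that pattern follows (plain userspace C; read_block and block_verifies are hypothetical stand-ins for the btrfs read path and its csum/transid checks, not real kernel functions):

#include <errno.h>
#include <stdio.h>

struct block_ref {
	unsigned long long bytenr;	/* logical address of the block */
	int nr_mirrors;			/* how many copies the RAID profile keeps */
};

/* stub I/O + verification helpers: pretend only mirror 2 holds a good copy */
static int read_block(struct block_ref *blk, int mirror)
{
	(void)blk; (void)mirror;
	return 0;
}

static int block_verifies(struct block_ref *blk, int mirror,
			  unsigned long long parent_transid)
{
	(void)blk; (void)parent_transid;
	return mirror == 2;
}

static int read_with_retries(struct block_ref *blk,
			     unsigned long long parent_transid)
{
	int mirror;

	for (mirror = 1; mirror <= blk->nr_mirrors; mirror++) {
		/* accept the first copy whose csum and transid check out */
		if (read_block(blk, mirror) == 0 &&
		    block_verifies(blk, mirror, parent_transid))
			return 0;
	}
	return -EIO;	/* every mirror was bad; the original end_io reports failure */
}

int main(void)
{
	struct block_ref blk = { .bytenr = 4096, .nr_mirrors = 2 };

	printf("read result: %d\n", read_with_retries(&blk, 100));
	return 0;
}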
@@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)  	btrfs_end_transaction(trans, root);  } +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */  static int btrfs_set_inode_index_count(struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,6 +2263,10 @@ out:  	return ret;  } +/* + * helper to find a free sequence number in a given directory.  This current + * code is very simple; later versions will do smarter things in the btree + */  static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,  				 u64 *index)  { @@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)  	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];  } +/* + * utility function to add 'inode' into 'parent_inode' with + * a given name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */  int btrfs_add_link(struct btrfs_trans_handle *trans,  		   struct inode *parent_inode, struct inode *inode,  		   const char *name, int name_len, int add_backref, u64 index) @@ -2611,6 +2709,10 @@ out_unlock:  	return err;  } +/* helper for btrfs_get_extent.  Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */  static int merge_extent_mapping(struct extent_map_tree *em_tree,  				struct extent_map *existing,  				struct extent_map *em, @@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,  	return add_extent_mapping(em_tree, em);  } +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the + * in-ram representation.  This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. 
+ */  struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  				    size_t pg_offset, u64 start, u64 len,  				    int create) @@ -2869,76 +2979,11 @@ out:  	return em;  } -#if 0 /* waiting for O_DIRECT reads */ -static int btrfs_get_block(struct inode *inode, sector_t iblock, -			struct buffer_head *bh_result, int create) -{ -	struct extent_map *em; -	u64 start = (u64)iblock << inode->i_blkbits; -	struct btrfs_multi_bio *multi = NULL; -	struct btrfs_root *root = BTRFS_I(inode)->root; -	u64 len; -	u64 logical; -	u64 map_length; -	int ret = 0; - -	em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0); - -	if (!em || IS_ERR(em)) -		goto out; - -	if (em->start > start || em->start + em->len <= start) { -	    goto out; -	} - -	if (em->block_start == EXTENT_MAP_INLINE) { -		ret = -EINVAL; -		goto out; -	} - -	len = em->start + em->len - start; -	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size))); - -	if (em->block_start == EXTENT_MAP_HOLE || -	    em->block_start == EXTENT_MAP_DELALLOC) { -		bh_result->b_size = len; -		goto out; -	} - -	logical = start - em->start; -	logical = em->block_start + logical; - -	map_length = len; -	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, -			      logical, &map_length, &multi, 0); -	BUG_ON(ret); -	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits; -	bh_result->b_size = min(map_length, len); - -	bh_result->b_bdev = multi->stripes[0].dev->bdev; -	set_buffer_mapped(bh_result); -	kfree(multi); -out: -	free_extent_map(em); -	return ret; -} -#endif -  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,  			const struct iovec *iov, loff_t offset,  			unsigned long nr_segs)  {  	return -EINVAL; -#if 0 -	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_mapping->host; - -	if (rw == WRITE) -		return -EINVAL; - -	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, -				  offset, nr_segs, btrfs_get_block, NULL); -#endif  }  static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) @@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,  	}  } +/* + * create a new subvolume directory/inode (helper for the ioctl). + */  int btrfs_create_subvol_root(struct btrfs_root *new_root,  		struct btrfs_trans_handle *trans, u64 new_dirid,  		struct btrfs_block_group_cache *block_group) @@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,  	return btrfs_update_inode(trans, new_root, inode);  } +/* helper function for file defrag and space balancing.  This + * forces readahead on a given range of bytes in an inode + */  unsigned long btrfs_force_ra(struct address_space *mapping,  			      struct file_ra_state *ra, struct file *file,  			      pgoff_t offset, pgoff_t last_index) @@ -3424,6 +3475,10 @@ out_unlock:  	return ret;  } +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */  int btrfs_start_delalloc_inodes(struct btrfs_root *root)  {  	struct list_head *head = &root->fs_info->delalloc_inodes; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0cc314c10d6..e30aa6e2958 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,6 +25,15 @@  #include "extent_io.h"  #include "locking.h" +/* + * locks the per buffer mutex in an extent buffer.  This uses adaptive locks + * and the spin is not tuned very extensively.  
The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree searches, + * and we should give up if they are in more expensive code. + */  int btrfs_tree_lock(struct extent_buffer *eb)  {  	int i; @@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb)  	return mutex_is_locked(&eb->mutex);  } +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */  int btrfs_path_lock_waiting(struct btrfs_path *path, int level)  {  	int i; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 951eacff242..dcc1730dd83 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -26,7 +26,6 @@  #include "btrfs_inode.h"  #include "extent_io.h" -  static u64 entry_end(struct btrfs_ordered_extent *entry)  {  	if (entry->file_offset + entry->len < entry->file_offset) @@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)  	return entry->file_offset + entry->len;  } +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */  static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,  				   struct rb_node *node)  { @@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,  	return NULL;  } +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */  static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,  				     struct rb_node **prev_ret)  { @@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,  	return NULL;  } +/* + * helper to check if a given offset is inside a given entry + */  static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)  {  	if (file_offset < entry->file_offset || @@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)  	return 1;  } +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */  static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,  					  u64 file_offset)  { @@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode,  	return 0;  } +/* + * wait for all the ordered extents in a root.  This is done when balancing + * space between drives. + */  int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)  {  	struct list_head splice; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c index 30fcb7aea5b..a50ebb67055 100644 --- a/fs/btrfs/ref-cache.c +++ b/fs/btrfs/ref-cache.c @@ -21,6 +21,16 @@  #include "ref-cache.h"  #include "transaction.h" +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on.  This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. 
+ */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */  struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,  					    int nr_extents)  { @@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,  	return ref;  } +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */  void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)  {  	if (!ref) @@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,  	return 0;  } +/* + * find the leaf ref for a given extent.  This returns the ref struct with + * a usage reference incremented + */  struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,  					     u64 bytenr)  { @@ -160,6 +178,10 @@ again:  	return NULL;  } +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */  int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,  		       int shared)  { @@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,  	return ret;  } +/* + * remove a single leaf ref from the tree.  This drops the ref held by the tree + * only + */  int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)  {  	struct btrfs_leaf_ref_tree *tree; diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index 617564787f5..16f3183d7c5 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -19,8 +19,11 @@  #define __REFCACHE__  struct btrfs_extent_info { +	/* bytenr and num_bytes find the extent in the extent allocation tree */  	u64 bytenr;  	u64 num_bytes; + +	/* objectid and offset find the back reference for the file */  	u64 objectid;  	u64 offset;  }; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0091c01abb0..eb7f7655e9d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,8 +22,10 @@  #include "print-tree.h"  /* - * returns 0 on finding something, 1 if no more roots are there - * and < 0 on error + *  search forward for a root, starting with objectid 'search_start' + *  if a root key is found, the objectid we find is filled into 'found_objectid' + *  and 0 is returned.  < 0 is returned on error, 1 if there is nothing + *  left in the tree.   */  int btrfs_search_root(struct btrfs_root *root, u64 search_start,  		      u64 *found_objectid) @@ -66,6 +68,11 @@ out:  	return ret;  } +/* + * lookup the root with the highest offset for a given objectid.  The key we do + * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0 + * on error. + */  int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,  			struct btrfs_root_item *item, struct btrfs_key *key)  { @@ -104,6 +111,9 @@ out:  	return ret;  } +/* + * copy the data in 'item' into the btree + */  int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root  		      *root, struct btrfs_key *key, struct btrfs_root_item  		      *item) @@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root  	return ret;  } +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed.  This is any root item with an offset + * lower than the latest root.  They need to be queued for deletion to finish + * what was happening when we crashed. 
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index ad03a32d111..cdedbe144d4 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,6 +17,27 @@
  */
 #include <linux/highmem.h>
+
+/* this is some deeply nasty code.  ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind an #ifndef.
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions.
+ *
+ * This file defines the macros and then #includes ctree.h, which results
+ * in cpp creating the functions here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers.  Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type.  This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ */
+
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
 u##bits btrfs_##name(struct extent_buffer *eb,				\
 				   type *s)				\
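If the declare-versus-define trick above is hard to picture, here is a minimal
standalone sketch of the same cpp pattern (all names invented, nothing
btrfs-specific): the "header" half expands the macro into prototypes for every
includer, while one translation unit redefines it so the very same invocation
expands into function bodies.

/* setget-sketch.c: illustration of the struct-funcs.c cpp trick, not btrfs code */
#include <stdio.h>

struct item {
	unsigned int size;
};

/* the "header" half: if nobody has defined SETGET_FUNCS yet, it expands
 * into plain prototypes, so every includer sees declarations
 */
#ifndef SETGET_FUNCS
#define SETGET_FUNCS(name, type, member)			\
unsigned int get_##name(type *s);				\
void set_##name(type *s, unsigned int val);
#endif

SETGET_FUNCS(size, struct item, size)	/* expands to declarations */

/* the "struct-funcs.c" half: redefine the macro so the same invocation
 * now expands into the function bodies
 */
#undef SETGET_FUNCS
#define SETGET_FUNCS(name, type, member)			\
unsigned int get_##name(type *s) { return s->member; }		\
void set_##name(type *s, unsigned int val) { s->member = val; }

SETGET_FUNCS(size, struct item, size)	/* expands to definitions */

int main(void)
{
	struct item it = { 0 };

	set_size(&it, 4096);
	printf("size is %u\n", get_size(&it));
	return 0;
}

The real macro additionally generates the extent_buffer mapping code and the
endian conversions; this sketch only shows how one macro invocation can serve
as both declaration and definition.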
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8399d6d05d6..2e6039825b7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };

+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
 static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 444abe0796a..11266d68a6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	}
 }

+/*
+ * either allocate a new transaction or hop into the existing one
+ */
 static noinline int join_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root)
 	return 0;
 }

+/*
+ * this does all the record keeping required to make sure that a
+ * reference counted root is properly recorded in a given transaction.
+ * It also makes sure the old root from before we joined the transaction
+ * is deleted when the transaction commits.
+ */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
 	struct btrfs_dirty_root *dirty;
@@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 	return 0;
 }

+/* wait for commit against the current transaction to become unblocked.
+ * When this is done, it is safe to start a new transaction, but the
+ * current transaction might not be fully on disk.
+ */
 static void wait_current_trans(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 	return start_transaction(r, num_blocks, 2);
 }

-
+/* wait for a transaction commit to be fully complete */
 static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
@@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }

+/*
+ * rate limit against the drop_snapshot code.  This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
+ */
 static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 	return __btrfs_end_transaction(trans, root, 1);
 }

-
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit.
+ */
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages)
 {
@@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 					   &trans->transaction->dirty_pages);
 }

+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
@@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	return 0;
 }

+/*
+ * update all the cowonly tree roots on disk
+ */
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
@@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }

+/*
+ * dead roots are old snapshots that need to be deleted.  This allocates
+ * a dirty root struct and adds it to the list of dead roots waiting to
+ * be deleted.
+ */
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 {
 	struct btrfs_dirty_root *dirty;
@@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 	return 0;
 }

+/*
+ * at transaction commit time we need to schedule the old roots for
+ * deletion via btrfs_drop_snapshot.  This runs through all the
+ * reference counted roots that were modified in the current
+ * transaction and puts them into the drop list.
+ */
 static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				    struct radix_tree_root *radix,
 				    struct list_head *list)
@@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return err;
 }

+/*
+ * defrag a given btree.  If cacheonly == 1, this won't read from disk;
+ * otherwise every leaf in the btree is read and defragged.
+ */
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }

+/*
+ * given a list of roots that need to be deleted, call btrfs_drop_snapshot
+ * on all of them
+ */
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 				     struct list_head *list)
 {
@@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }

+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit.  This does the actual creation.
+ */
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -734,6 +791,9 @@ fail:
 	return ret;
 }

+/*
+ * create all the snapshots we've scheduled for creation
+ */
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
 {
@@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	return ret;
 }

+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ */
 int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
 	struct list_head dirty_roots;
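As a rough mental model of join_transaction() and the handles around it (not the
real locking or refcounting, just the shape of the idea): every writer joins
whatever transaction is currently running and bumps a writer count, and the
commit path has to wait for that count to drain.  A toy sketch with invented
names:

/* toy join/end model: one global "running" transaction, no locking shown */
#include <stdio.h>
#include <stdlib.h>

struct transaction {
	unsigned long id;
	int num_writers;	/* open handles that joined this transaction */
};

static struct transaction *running;

/* either allocate a new transaction or hop into the existing one */
static struct transaction *join_transaction(void)
{
	if (!running) {
		running = calloc(1, sizeof(*running));
		running->id = 1;
	}
	running->num_writers++;
	return running;
}

static void end_transaction(struct transaction *t)
{
	t->num_writers--;	/* commit waits for this to reach zero */
}

int main(void)
{
	struct transaction *a = join_transaction();
	struct transaction *b = join_transaction();	/* same one as 'a' */

	printf("trans %lu has %d writers\n", a->id, a->num_writers);
	end_transaction(b);
	end_transaction(a);
	free(running);
	return 0;
}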
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b3bb5bbad76..6f57d0889b1 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,6 +23,10 @@
 #include "transaction.h"
 #include "locking.h"

+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order
+ * to better reflect disk order
+ */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {