Diffstat (limited to 'fs/ext4/extents_status.c')
-rw-r--r--	fs/ext4/extents_status.c	500
1 file changed, 500 insertions, 0 deletions
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 00000000000..564d981a2fc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
+/*
+ *  fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *	Allison Henderson <achender@linux.vnet.ibm.com>
+ *	Hugh Dickins <hughd@google.com>
+ *	Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include "ext4.h"
+#include "extents_status.h"
+#include "ext4_extents.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to previous discussion at the Ext4 Developer Workshop, we
+ * will introduce a new structure called an io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. the reservation space warning), and to provide extent-level
+ * locking.  The delay extent tree is the first step towards this goal.
+ * It was originally built by Yongqiang Yang.  At that time it was
+ * called a delay extent tree, whose only goal was to track delayed
+ * extents in memory to simplify the implementation of fiemap and
+ * bigalloc, and to introduce lseek SEEK_DATA/SEEK_HOLE support.  That
+ * is why it is still called a delay extent tree in the comments below.
+ * But to better describe what it does, it has been renamed to extent
+ * status tree.
+ *
+ * Currently the first step has been done.  All delayed extents are
+ * tracked in the tree.  The tree is maintained when a delayed
+ * allocation is issued, and when a delayed extent is written out or
+ * invalidated.  Therefore the implementations of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implementation of the extent
+ * status tree and future work.
+ */
+
+/*
+ * extents status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status encompasses delayed extents and extent locks
+ *
+ * 1. Why a delayed extent implementation ?
+ *
+ * Without delayed extents, ext4 identifies a delayed extent by looking
+ * up the page cache, which has several deficiencies: complicated,
+ * buggy, and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
+ * to know whether a block or a range of blocks belongs to a delayed
+ * extent.
+ *
+ * Let us have a look at how they work without the delayed extent
+ * implementation.
+ *   --	FIEMAP
+ *	FIEMAP looks up the page cache to identify delayed allocations
+ *	from holes.
+ *
+ *   --	SEEK_HOLE/DATA
+ *	SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ *   --	bigalloc
+ *	bigalloc looks up the page cache to figure out whether a block
+ *	is already under delayed allocation, to determine whether a
+ *	quota reservation is needed for the cluster.
+ *
+ *   -- punch hole
+ *	punch hole looks up the page cache to identify a delayed extent.
+ *
+ *   --	writeout
+ *	Writeout looks up the whole page cache to see if a buffer is
+ *	mapped; if there are not very many delayed buffers, this is
+ *	time consuming.
+ *
+ * With the delayed extent implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out whether a block or a range of
+ * blocks is under delayed allocation (i.e. belongs to a delayed
+ * extent) by searching the delayed extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. ext4 delayed extent implementation
+ *
+ *   --	delayed extent
+ *	A delayed extent is a range of blocks which are logically
+ *	contiguous and under delayed allocation.  Unlike an on-disk
+ *	extent, a delayed extent is an in-memory structure; there is
+ *	no corresponding on-disk data.  There is no limit on the length
+ *	of a delayed extent, so a delayed extent can contain as many
+ *	blocks as are logically contiguous.
+ *
+ *   --	delayed extent tree
+ *	Every inode has a delayed extent tree, and all blocks under
+ *	delayed allocation are added to the tree as delayed extents.
+ *	Delayed extents in the tree are ordered by logical block number.
+ *
+ *   --	operations on a delayed extent tree
+ *	There are three operations on a delayed extent tree: finding
+ *	the next delayed extent, adding a space (a range of blocks)
+ *	and removing a space.
+ *
+ *   --	race on a delayed extent tree
+ *	The delayed extent tree is protected by inode->i_es_lock.
+ *
+ *
+ * ==========================================================================
+ * 3. performance analysis
+ *   --	overhead
+ *	There is a cached extent for write access, so if writes are
+ *	not very random, adding-space operations run in O(1) time.
+ *
+ *   --	gain
+ *	The code is much simpler, more readable, more maintainable and
+ *	more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ *   -- Track all extent status
+ *
+ *   -- Improve the get block process
+ *
+ *   -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+int __init ext4_init_es(void)
+{
+	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+	if (ext4_es_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void ext4_exit_es(void)
+{
+	if (ext4_es_cachep)
+		kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+	tree->root = RB_ROOT;
+	tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+
+	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+	tree = &EXT4_I(inode)->i_es_tree;
+	node = rb_first(&tree->root);
+	while (node) {
+		struct extent_status *es;
+		es = rb_entry(node, struct extent_status, rb_node);
+		printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+		node = rb_next(node);
+	}
+	printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+{
+	BUG_ON(es->start + es->len < es->start);
+	return es->start + es->len - 1;
+}
+
+/*
+ * Search through the tree for a delayed extent with a given offset.  If
+ * it can't be found, try to find the next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+					      ext4_lblk_t offset)
+{
+	struct rb_node *node = root->rb_node;
+	struct extent_status *es = NULL;
+
+	while (node) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		if (offset < es->start)
+			node = node->rb_left;
+		else if (offset > extent_status_end(es))
+			node = node->rb_right;
+		else
+			return es;
+	}
+
+	if (es && offset < es->start)
+		return es;
+
+	if (es && offset > extent_status_end(es)) {
+		node = rb_next(&es->rb_node);
+		return node ? rb_entry(node, struct extent_status, rb_node) :
+			      NULL;
+	}
+
+	return NULL;
+}
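
To see the lookup rule of __es_tree_search() in isolation, here is a minimal userspace sketch, assuming non-overlapping extents ordered by start block; struct extent, extent_end() and search() are illustrative stand-ins (a binary search over a sorted array models the rbtree walk), not kernel API:

	/* Hypothetical userspace model of the __es_tree_search() rule. */
	#include <stdio.h>

	struct extent { unsigned start, len; };

	static unsigned extent_end(const struct extent *es)
	{
		return es->start + es->len - 1;
	}

	/*
	 * Return the extent covering @offset; if none covers it, return
	 * the next extent after @offset, or NULL past the last extent.
	 */
	static const struct extent *search(const struct extent *v, int n,
					   unsigned offset)
	{
		int lo = 0, hi = n - 1;

		while (lo <= hi) {
			int mid = lo + (hi - lo) / 2;

			if (offset < v[mid].start)
				hi = mid - 1;
			else if (offset > extent_end(&v[mid]))
				lo = mid + 1;
			else
				return &v[mid];	/* offset inside this extent */
		}
		return lo < n ? &v[lo] : NULL;	/* next extent, if any */
	}

	int main(void)
	{
		/* Two delayed extents: blocks 10..14 and 20..24. */
		const struct extent tree[] = { { 10, 5 }, { 20, 5 } };
		const struct extent *es;

		es = search(tree, 2, 12);
		printf("12 -> [%u/%u)\n", es->start, es->len);	/* [10/5) */
		es = search(tree, 2, 16);
		printf("16 -> [%u/%u)\n", es->start, es->len);	/* [20/5) */
		return 0;
	}

The same two outcomes, covered or next, are what let the callers below treat a miss as "first extent after this block" without a second pass.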
+/*
+ * ext4_es_find_extent: find the first delayed extent covering @es->start
+ * if it exists; otherwise, find the next extent after @es->start.
+ *
+ * @inode: the inode which owns delayed extents
+ * @es: delayed extent that we found
+ *
+ * Returns the first block of the next extent after @es, or
+ * EXT_MAX_BLOCKS if no delayed extent is found.
+ * The delayed extent found is returned via @es.
+ */
+ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+{
+	struct ext4_es_tree *tree = NULL;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+	ext4_lblk_t ret = EXT_MAX_BLOCKS;
+
+	trace_ext4_es_find_extent_enter(inode, es->start);
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+
+	/* try to find the delayed extent in the cache first */
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(es->start, es1->start, es1->len)) {
+			es_debug("%u cached by [%u/%u)\n",
+				 es->start, es1->start, es1->len);
+			goto out;
+		}
+	}
+
+	es->len = 0;
+	es1 = __es_tree_search(&tree->root, es->start);
+
+out:
+	if (es1) {
+		tree->cache_es = es1;
+		es->start = es1->start;
+		es->len = es1->len;
+		node = rb_next(&es1->rb_node);
+		if (node) {
+			es1 = rb_entry(node, struct extent_status, rb_node);
+			ret = es1->start;
+		}
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	trace_ext4_es_find_extent_exit(inode, es, ret);
+	return ret;
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+{
+	struct extent_status *es;
+	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+	if (es == NULL)
+		return NULL;
+	es->start = start;
+	es->len = len;
+	return es;
+}
+
+static void ext4_es_free_extent(struct extent_status *es)
+{
+	kmem_cache_free(ext4_es_cachep, es);
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+{
+	struct extent_status *es1;
+	struct rb_node *node;
+
+	node = rb_prev(&es->rb_node);
+	if (!node)
+		return es;
+
+	es1 = rb_entry(node, struct extent_status, rb_node);
+	if (es->start == extent_status_end(es1) + 1) {
+		es1->len += es->len;
+		rb_erase(&es->rb_node, &tree->root);
+		ext4_es_free_extent(es);
+		es = es1;
+	}
+
+	return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+{
+	struct extent_status *es1;
+	struct rb_node *node;
+
+	node = rb_next(&es->rb_node);
+	if (!node)
+		return es;
+
+	es1 = rb_entry(node, struct extent_status, rb_node);
+	if (es1->start == extent_status_end(es) + 1) {
+		es->len += es1->len;
+		rb_erase(node, &tree->root);
+		ext4_es_free_extent(es1);
+	}
+
+	return es;
+}
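
The two merge helpers above keep the tree canonical: whenever one extent ends exactly one block before its neighbour begins, the two nodes are coalesced into one. A minimal sketch of that adjacency rule, assuming the hypothetical try_merge() below in place of the kernel's rbtree plumbing:

	/* Hypothetical model of the coalescing rule, not kernel code. */
	#include <stdio.h>

	struct extent { unsigned start, len; };

	static unsigned extent_end(const struct extent *es)
	{
		return es->start + es->len - 1;
	}

	/*
	 * Fold b into a when b starts exactly one block past the end of a,
	 * mirroring ext4_es_try_to_merge_left()/_right():
	 * [10/5) + [15/3) -> [10/8).  Returns 1 when a merge happened.
	 */
	static int try_merge(struct extent *a, const struct extent *b)
	{
		if (b->start != extent_end(a) + 1)
			return 0;	/* a gap remains: keep both nodes */
		a->len += b->len;
		return 1;
	}

	int main(void)
	{
		struct extent a = { 10, 5 };	/* blocks 10..14 */
		struct extent b = { 15, 3 };	/* blocks 15..17, adjacent */
		struct extent c = { 19, 2 };	/* blocks 19..20, gap at 18 */

		printf("a+b: %d -> [%u/%u)\n", try_merge(&a, &b),
		       a.start, a.len);		/* 1 -> [10/8) */
		printf("a+c: %d -> [%u/%u)\n", try_merge(&a, &c),
		       a.start, a.len);		/* 0 -> [10/8) */
		return 0;
	}

In the kernel version the absorbed node is also erased from the rbtree and freed back to ext4_es_cachep, which is what keeps each maximal run of contiguous delayed blocks represented by exactly one node.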
+static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
+			      ext4_lblk_t len)
+{
+	struct rb_node **p = &tree->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_status *es;
+	ext4_lblk_t end = offset + len - 1;
+
+	BUG_ON(end < offset);
+	es = tree->cache_es;
+	if (es && offset == (extent_status_end(es) + 1)) {
+		es_debug("cached by [%u/%u)\n", es->start, es->len);
+		es->len += len;
+		es = ext4_es_try_to_merge_right(tree, es);
+		goto out;
+	} else if (es && es->start == end + 1) {
+		es_debug("cached by [%u/%u)\n", es->start, es->len);
+		es->start = offset;
+		es->len += len;
+		es = ext4_es_try_to_merge_left(tree, es);
+		goto out;
+	} else if (es && es->start <= offset &&
+		   end <= extent_status_end(es)) {
+		es_debug("cached by [%u/%u)\n", es->start, es->len);
+		goto out;
+	}
+
+	while (*p) {
+		parent = *p;
+		es = rb_entry(parent, struct extent_status, rb_node);
+
+		if (offset < es->start) {
+			if (es->start == end + 1) {
+				es->start = offset;
+				es->len += len;
+				es = ext4_es_try_to_merge_left(tree, es);
+				goto out;
+			}
+			p = &(*p)->rb_left;
+		} else if (offset > extent_status_end(es)) {
+			if (offset == extent_status_end(es) + 1) {
+				es->len += len;
+				es = ext4_es_try_to_merge_right(tree, es);
+				goto out;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			if (extent_status_end(es) <= end)
+				es->len = offset - es->start + len;
+			goto out;
+		}
+	}
+
+	es = ext4_es_alloc_extent(offset, len);
+	if (!es)
+		return -ENOMEM;
+	rb_link_node(&es->rb_node, parent, p);
+	rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+	tree->cache_es = es;
+	return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds a space to a delayed extent tree.
+ * inode->i_es_lock is taken internally, so the caller must not hold it.
+ *
+ * ext4_es_insert_extent is called by ext4_da_write_begin and
+ * ext4_es_remove_extent.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
+			  ext4_lblk_t len)
+{
+	struct ext4_es_tree *tree;
+	int err = 0;
+
+	trace_ext4_es_insert_extent(inode, offset, len);
+	es_debug("add [%u/%u) to extent status tree of inode %lu\n",
+		 offset, len, inode->i_ino);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+	err = __es_insert_extent(tree, offset, len);
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+
+	ext4_es_print_tree(inode);
+
+	return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a delayed extent tree.
+ * inode->i_es_lock is taken internally, so the caller must not hold it.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
+			  ext4_lblk_t len)
+{
+	struct rb_node *node;
+	struct ext4_es_tree *tree;
+	struct extent_status *es;
+	struct extent_status orig_es;
+	ext4_lblk_t len1, len2, end;
+	int err = 0;
+
+	trace_ext4_es_remove_extent(inode, offset, len);
+	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+		 offset, len, inode->i_ino);
+
+	end = offset + len - 1;
+	BUG_ON(end < offset);
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+	es = __es_tree_search(&tree->root, offset);
+	if (!es)
+		goto out;
+	if (es->start > end)
+		goto out;
+
+	/* Simply invalidate cache_es. */
+	tree->cache_es = NULL;
+
+	orig_es.start = es->start;
+	orig_es.len = es->len;
+	len1 = offset > es->start ? offset - es->start : 0;
+	len2 = extent_status_end(es) > end ?
+	       extent_status_end(es) - end : 0;
+	if (len1 > 0)
+		es->len = len1;
+	if (len2 > 0) {
+		if (len1 > 0) {
+			err = __es_insert_extent(tree, end + 1, len2);
+			if (err) {
+				es->start = orig_es.start;
+				es->len = orig_es.len;
+				goto out;
+			}
+		} else {
+			es->start = end + 1;
+			es->len = len2;
+		}
+		goto out;
+	}
+
+	if (len1 > 0) {
+		node = rb_next(&es->rb_node);
+		if (node)
+			es = rb_entry(node, struct extent_status, rb_node);
+		else
+			es = NULL;
+	}
+
+	while (es && extent_status_end(es) <= end) {
+		node = rb_next(&es->rb_node);
+		rb_erase(&es->rb_node, &tree->root);
+		ext4_es_free_extent(es);
+		if (!node) {
+			es = NULL;
+			break;
+		}
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	if (es && es->start < end + 1) {
+		len1 = extent_status_end(es) - end;
+		es->start = end + 1;
+		es->len = len1;
+	}
+
+out:
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+	ext4_es_print_tree(inode);
+	return err;
+}
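
The trickiest case in ext4_es_remove_extent() is a removal range landing in the middle of one extent, which leaves a piece before the hole (len1 blocks) and a piece after it (len2 blocks). A minimal userspace sketch of just that split arithmetic, with the hypothetical split() standing in for the kernel path that keeps the first piece in place and inserts the second as a new node:

	/* Hypothetical model of the len1/len2 split, not kernel code. */
	#include <stdio.h>

	struct extent { unsigned start, len; };

	static unsigned extent_end(const struct extent *es)
	{
		return es->start + es->len - 1;
	}

	/*
	 * Remove blocks [offset, offset+len-1] from *es.  *es keeps any
	 * piece before the hole (len1 blocks); *tail receives any piece
	 * after it (len2 blocks).  Returns how many pieces survive.
	 */
	static int split(struct extent *es, unsigned offset, unsigned len,
			 struct extent *tail)
	{
		unsigned end = offset + len - 1;
		unsigned len1 = offset > es->start ? offset - es->start : 0;
		unsigned len2 = extent_end(es) > end ?
				extent_end(es) - end : 0;
		int n = 0;

		if (len1 > 0) {		/* piece before the removed range */
			es->len = len1;
			n++;
		}
		if (len2 > 0) {		/* piece after the removed range */
			tail->start = end + 1;
			tail->len = len2;
			n++;
		}
		return n;
	}

	int main(void)
	{
		struct extent es = { 10, 10 };	/* blocks 10..19 */
		struct extent tail;

		/* Punch out blocks 13..15: [10/3) and [16/4) remain. */
		int n = split(&es, 13, 3, &tail);
		printf("%d pieces: [%u/%u) [%u/%u)\n",
		       n, es.start, es.len, tail.start, tail.len);
		return 0;
	}

This mirrors why the kernel version saves orig_es first: when both pieces survive, the second one needs a fresh node via __es_insert_extent(), and if that allocation fails the original extent must be restored intact.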