 fs/btrfs/async-thread.c |  230
 fs/btrfs/async-thread.h |   12
 fs/btrfs/compression.c  |    8
 fs/btrfs/ctree.h        |    2
 fs/btrfs/disk-io.c      |   36
 fs/btrfs/extent-tree.c  |    4
 fs/btrfs/extent_io.c    |  293
 fs/btrfs/extent_io.h    |   16
 fs/btrfs/extent_map.c   |   55
 fs/btrfs/extent_map.h   |    3
 fs/btrfs/file.c         |   35
 fs/btrfs/inode.c        |  112
 fs/btrfs/ioctl.c        |    5
 fs/btrfs/ordered-data.c |   33
 fs/btrfs/ordered-data.h |    3
 fs/btrfs/relocation.c   |    6
 fs/btrfs/tree-log.c     |    2
 fs/btrfs/volumes.c      |   42
 18 files changed, 580 insertions(+), 317 deletions(-)
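The largest cross-cutting change in this diff is the conversion of the extent map tree's lock from a spinlock to an rwlock: lookup_extent_mapping() callers now take read_lock(&em_tree->lock), while add_extent_mapping()/remove_extent_mapping() callers take write_lock(). The async-thread changes (per-worker refcounts, deferred worker starts for pools queued from interrupt context, a separate order_lock, and idle-worker shutdown after schedule_timeout(HZ * 120)) and the cached extent_state threading in extent_io.c follow in the hunks below. As a reading aid only, here is a minimal userspace sketch of the reader/writer locking pattern the extent map conversion adopts, using pthreads; the array-backed map and the em_lookup()/em_insert() helpers are illustrative stand-ins, not btrfs interfaces.

/*
 * Sketch: lookups run under a shared read lock so they no longer
 * serialize against each other; insert/remove take the exclusive
 * write lock. Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

struct em_entry {
	unsigned long long start;
	unsigned long long len;
};

struct em_tree {
	pthread_rwlock_t lock;
	struct em_entry entries[64];
	int nr;
};

static void em_tree_init(struct em_tree *t)
{
	pthread_rwlock_init(&t->lock, NULL);
	t->nr = 0;
}

/* readers only take the shared lock, so concurrent lookups can proceed */
static int em_lookup(struct em_tree *t, unsigned long long off,
		     struct em_entry *out)
{
	int i, found = 0;

	pthread_rwlock_rdlock(&t->lock);
	for (i = 0; i < t->nr; i++) {
		if (off >= t->entries[i].start &&
		    off < t->entries[i].start + t->entries[i].len) {
			*out = t->entries[i];
			found = 1;
			break;
		}
	}
	pthread_rwlock_unlock(&t->lock);
	return found;
}

/* insertion mutates the structure, so it takes the exclusive lock */
static int em_insert(struct em_tree *t, unsigned long long start,
		     unsigned long long len)
{
	int ret = 0;

	pthread_rwlock_wrlock(&t->lock);
	if (t->nr < 64) {
		t->entries[t->nr].start = start;
		t->entries[t->nr].len = len;
		t->nr++;
	} else {
		ret = -1;
	}
	pthread_rwlock_unlock(&t->lock);
	return ret;
}

int main(void)
{
	struct em_tree tree;
	struct em_entry em;

	em_tree_init(&tree);
	em_insert(&tree, 0, 4096);
	em_insert(&tree, 8192, 4096);

	if (em_lookup(&tree, 8200, &em))
		printf("found extent at %llu len %llu\n", em.start, em.len);
	return 0;
}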
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 019e8af449a..6ea5cd0a595 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -48,6 +48,9 @@ struct btrfs_worker_thread {  	/* number of things on the pending list */  	atomic_t num_pending; +	/* reference counter for this struct */ +	atomic_t refs; +  	unsigned long sequence;  	/* protects the pending list. */ @@ -93,17 +96,40 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)  	}  } -static noinline int run_ordered_completions(struct btrfs_workers *workers, -					    struct btrfs_work *work) +static void check_pending_worker_creates(struct btrfs_worker_thread *worker)  { +	struct btrfs_workers *workers = worker->workers;  	unsigned long flags; +	rmb(); +	if (!workers->atomic_start_pending) +		return; + +	spin_lock_irqsave(&workers->lock, flags); +	if (!workers->atomic_start_pending) +		goto out; + +	workers->atomic_start_pending = 0; +	if (workers->num_workers >= workers->max_workers) +		goto out; + +	spin_unlock_irqrestore(&workers->lock, flags); +	btrfs_start_workers(workers, 1); +	return; + +out: +	spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, +					    struct btrfs_work *work) +{  	if (!workers->ordered)  		return 0;  	set_bit(WORK_DONE_BIT, &work->flags); -	spin_lock_irqsave(&workers->lock, flags); +	spin_lock(&workers->order_lock);  	while (1) {  		if (!list_empty(&workers->prio_order_list)) { @@ -126,45 +152,117 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,  		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))  			break; -		spin_unlock_irqrestore(&workers->lock, flags); +		spin_unlock(&workers->order_lock);  		work->ordered_func(work);  		/* now take the lock again and call the freeing code */ -		spin_lock_irqsave(&workers->lock, flags); +		spin_lock(&workers->order_lock);  		list_del(&work->order_list);  		work->ordered_free(work);  	} -	spin_unlock_irqrestore(&workers->lock, flags); +	spin_unlock(&workers->order_lock);  	return 0;  } +static void put_worker(struct btrfs_worker_thread *worker) +{ +	if (atomic_dec_and_test(&worker->refs)) +		kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ +	int freeit = 0; + +	spin_lock_irq(&worker->lock); +	spin_lock_irq(&worker->workers->lock); +	if (worker->workers->num_workers > 1 && +	    worker->idle && +	    !worker->working && +	    !list_empty(&worker->worker_list) && +	    list_empty(&worker->prio_pending) && +	    list_empty(&worker->pending)) { +		freeit = 1; +		list_del_init(&worker->worker_list); +		worker->workers->num_workers--; +	} +	spin_unlock_irq(&worker->workers->lock); +	spin_unlock_irq(&worker->lock); + +	if (freeit) +		put_worker(worker); +	return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, +					struct list_head *prio_head, +					struct list_head *head) +{ +	struct btrfs_work *work = NULL; +	struct list_head *cur = NULL; + +	if(!list_empty(prio_head)) +		cur = prio_head->next; + +	smp_mb(); +	if (!list_empty(&worker->prio_pending)) +		goto refill; + +	if (!list_empty(head)) +		cur = head->next; + +	if (cur) +		goto out; + +refill: +	spin_lock_irq(&worker->lock); +	list_splice_tail_init(&worker->prio_pending, prio_head); +	list_splice_tail_init(&worker->pending, head); + +	if (!list_empty(prio_head)) +		cur = prio_head->next; +	else if (!list_empty(head)) +		cur = head->next; +	spin_unlock_irq(&worker->lock); + +	if 
(!cur) +		goto out_fail; + +out: +	work = list_entry(cur, struct btrfs_work, list); + +out_fail: +	return work; +} +  /*   * main loop for servicing work items   */  static int worker_loop(void *arg)  {  	struct btrfs_worker_thread *worker = arg; -	struct list_head *cur; +	struct list_head head; +	struct list_head prio_head;  	struct btrfs_work *work; + +	INIT_LIST_HEAD(&head); +	INIT_LIST_HEAD(&prio_head); +  	do { -		spin_lock_irq(&worker->lock); -again_locked: +again:  		while (1) { -			if (!list_empty(&worker->prio_pending)) -				cur = worker->prio_pending.next; -			else if (!list_empty(&worker->pending)) -				cur = worker->pending.next; -			else + + +			work = get_next_work(worker, &prio_head, &head); +			if (!work)  				break; -			work = list_entry(cur, struct btrfs_work, list);  			list_del(&work->list);  			clear_bit(WORK_QUEUED_BIT, &work->flags);  			work->worker = worker; -			spin_unlock_irq(&worker->lock);  			work->func(work); @@ -175,9 +273,13 @@ again_locked:  			 */  			run_ordered_completions(worker->workers, work); -			spin_lock_irq(&worker->lock); -			check_idle_worker(worker); +			check_pending_worker_creates(worker); +  		} + +		spin_lock_irq(&worker->lock); +		check_idle_worker(worker); +  		if (freezing(current)) {  			worker->working = 0;  			spin_unlock_irq(&worker->lock); @@ -216,8 +318,10 @@ again_locked:  				spin_lock_irq(&worker->lock);  				set_current_state(TASK_INTERRUPTIBLE);  				if (!list_empty(&worker->pending) || -				    !list_empty(&worker->prio_pending)) -					goto again_locked; +				    !list_empty(&worker->prio_pending)) { +					spin_unlock_irq(&worker->lock); +					goto again; +				}  				/*  				 * this makes sure we get a wakeup when someone @@ -226,8 +330,13 @@ again_locked:  				worker->working = 0;  				spin_unlock_irq(&worker->lock); -				if (!kthread_should_stop()) -					schedule(); +				if (!kthread_should_stop()) { +					schedule_timeout(HZ * 120); +					if (!worker->working && +					    try_worker_shutdown(worker)) { +						return 0; +					} +				}  			}  			__set_current_state(TASK_RUNNING);  		} @@ -242,16 +351,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)  {  	struct list_head *cur;  	struct btrfs_worker_thread *worker; +	int can_stop; +	spin_lock_irq(&workers->lock);  	list_splice_init(&workers->idle_list, &workers->worker_list);  	while (!list_empty(&workers->worker_list)) {  		cur = workers->worker_list.next;  		worker = list_entry(cur, struct btrfs_worker_thread,  				    worker_list); -		kthread_stop(worker->task); -		list_del(&worker->worker_list); -		kfree(worker); + +		atomic_inc(&worker->refs); +		workers->num_workers -= 1; +		if (!list_empty(&worker->worker_list)) { +			list_del_init(&worker->worker_list); +			put_worker(worker); +			can_stop = 1; +		} else +			can_stop = 0; +		spin_unlock_irq(&workers->lock); +		if (can_stop) +			kthread_stop(worker->task); +		spin_lock_irq(&workers->lock); +		put_worker(worker);  	} +	spin_unlock_irq(&workers->lock);  	return 0;  } @@ -266,10 +389,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)  	INIT_LIST_HEAD(&workers->order_list);  	INIT_LIST_HEAD(&workers->prio_order_list);  	spin_lock_init(&workers->lock); +	spin_lock_init(&workers->order_lock);  	workers->max_workers = max;  	workers->idle_thresh = 32;  	workers->name = name;  	workers->ordered = 0; +	workers->atomic_start_pending = 0; +	workers->atomic_worker_start = 0;  }  /* @@ -293,7 +419,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)  		
INIT_LIST_HEAD(&worker->prio_pending);  		INIT_LIST_HEAD(&worker->worker_list);  		spin_lock_init(&worker->lock); +  		atomic_set(&worker->num_pending, 0); +		atomic_set(&worker->refs, 1);  		worker->workers = workers;  		worker->task = kthread_run(worker_loop, worker,  					   "btrfs-%s-%d", workers->name, @@ -303,7 +431,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)  			kfree(worker);  			goto fail;  		} -  		spin_lock_irq(&workers->lock);  		list_add_tail(&worker->worker_list, &workers->idle_list);  		worker->idle = 1; @@ -367,28 +494,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)  {  	struct btrfs_worker_thread *worker;  	unsigned long flags; +	struct list_head *fallback;  again:  	spin_lock_irqsave(&workers->lock, flags);  	worker = next_worker(workers); -	spin_unlock_irqrestore(&workers->lock, flags);  	if (!worker) { -		spin_lock_irqsave(&workers->lock, flags);  		if (workers->num_workers >= workers->max_workers) { -			struct list_head *fallback = NULL; -			/* -			 * we have failed to find any workers, just -			 * return the force one -			 */ -			if (!list_empty(&workers->worker_list)) -				fallback = workers->worker_list.next; -			if (!list_empty(&workers->idle_list)) -				fallback = workers->idle_list.next; -			BUG_ON(!fallback); -			worker = list_entry(fallback, -				  struct btrfs_worker_thread, worker_list); -			spin_unlock_irqrestore(&workers->lock, flags); +			goto fallback; +		} else if (workers->atomic_worker_start) { +			workers->atomic_start_pending = 1; +			goto fallback;  		} else {  			spin_unlock_irqrestore(&workers->lock, flags);  			/* we're below the limit, start another worker */ @@ -396,6 +513,23 @@ again:  			goto again;  		}  	} +	spin_unlock_irqrestore(&workers->lock, flags); +	return worker; + +fallback: +	fallback = NULL; +	/* +	 * we have failed to find any workers, just +	 * return the first one we can find. 
+	 */ +	if (!list_empty(&workers->worker_list)) +		fallback = workers->worker_list.next; +	if (!list_empty(&workers->idle_list)) +		fallback = workers->idle_list.next; +	BUG_ON(!fallback); +	worker = list_entry(fallback, +		  struct btrfs_worker_thread, worker_list); +	spin_unlock_irqrestore(&workers->lock, flags);  	return worker;  } @@ -435,9 +569,9 @@ int btrfs_requeue_work(struct btrfs_work *work)  		worker->working = 1;  	} -	spin_unlock_irqrestore(&worker->lock, flags);  	if (wake)  		wake_up_process(worker->task); +	spin_unlock_irqrestore(&worker->lock, flags);  out:  	return 0; @@ -463,14 +597,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)  	worker = find_worker(workers);  	if (workers->ordered) { -		spin_lock_irqsave(&workers->lock, flags); +		/* +		 * you're not allowed to do ordered queues from an +		 * interrupt handler +		 */ +		spin_lock(&workers->order_lock);  		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {  			list_add_tail(&work->order_list,  				      &workers->prio_order_list);  		} else {  			list_add_tail(&work->order_list, &workers->order_list);  		} -		spin_unlock_irqrestore(&workers->lock, flags); +		spin_unlock(&workers->order_lock);  	} else {  		INIT_LIST_HEAD(&work->order_list);  	} @@ -492,10 +630,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)  		wake = 1;  	worker->working = 1; -	spin_unlock_irqrestore(&worker->lock, flags); -  	if (wake)  		wake_up_process(worker->task); +	spin_unlock_irqrestore(&worker->lock, flags); +  out:  	return 0;  } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1b511c109db..fc089b95ec1 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -73,6 +73,15 @@ struct btrfs_workers {  	/* force completions in the order they were queued */  	int ordered; +	/* more workers required, but in an interrupt handler */ +	int atomic_start_pending; + +	/* +	 * are we allowed to sleep while starting workers or are we required +	 * to start them at a later time? +	 */ +	int atomic_worker_start; +  	/* list with all the work threads.  The workers on the idle thread  	 * may be actively servicing jobs, but they haven't yet hit the  	 * idle thresh limit above. 
@@ -90,6 +99,9 @@ struct btrfs_workers {  	/* lock for finding the next worker thread to queue on */  	spinlock_t lock; +	/* lock for the ordered lists */ +	spinlock_t order_lock; +  	/* extra name for this worker, used for current->name */  	char *name;  }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9d8ba4d54a3..a11a32058b5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,  		 */  		set_page_extent_mapped(page);  		lock_extent(tree, last_offset, end, GFP_NOFS); -		spin_lock(&em_tree->lock); +		read_lock(&em_tree->lock);  		em = lookup_extent_mapping(em_tree, last_offset,  					   PAGE_CACHE_SIZE); -		spin_unlock(&em_tree->lock); +		read_unlock(&em_tree->lock);  		if (!em || last_offset < em->start ||  		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || @@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  	em_tree = &BTRFS_I(inode)->extent_tree;  	/* we need the actual starting offset of this extent in the file */ -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree,  				   page_offset(bio->bi_io_vec->bv_page),  				   PAGE_CACHE_SIZE); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	compressed_len = em->block_len;  	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 837435ce84c..732d5b884aa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2290,7 +2290,7 @@ extern struct file_operations btrfs_file_operations;  int btrfs_drop_extents(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct inode *inode,  		       u64 start, u64 end, u64 locked_end, -		       u64 inline_limit, u64 *hint_block); +		       u64 inline_limit, u64 *hint_block, int drop_cache);  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root,  			      struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e83be2e4602..253da7e01ab 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -123,15 +123,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,  	struct extent_map *em;  	int ret; -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, start, len);  	if (em) {  		em->bdev =  			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; -		spin_unlock(&em_tree->lock); +		read_unlock(&em_tree->lock);  		goto out;  	} -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	em = alloc_extent_map(GFP_NOFS);  	if (!em) { @@ -144,7 +144,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,  	em->block_start = 0;  	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; -	spin_lock(&em_tree->lock); +	write_lock(&em_tree->lock);  	ret = add_extent_mapping(em_tree, em);  	if (ret == -EEXIST) {  		u64 failed_start = em->start; @@ -163,7 +163,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,  		free_extent_map(em);  		em = NULL;  	} -	spin_unlock(&em_tree->lock); +	write_unlock(&em_tree->lock);  	if (ret)  		em = ERR_PTR(ret); @@ -1325,9 +1325,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)  	offset = page_offset(page);  	em_tree = &BTRFS_I(inode)->extent_tree; -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, offset, 
PAGE_CACHE_SIZE); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	if (!em) {  		__unplug_io_fn(bdi, page);  		return; @@ -1698,7 +1698,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  		err = -EINVAL;  		goto fail_iput;  	} - +printk("thread pool is %d\n", fs_info->thread_pool_size);  	/*  	 * we need to start all the end_io workers up front because the  	 * queue work function gets called at interrupt time, and so it @@ -1743,20 +1743,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	fs_info->endio_workers.idle_thresh = 4;  	fs_info->endio_meta_workers.idle_thresh = 4; -	fs_info->endio_write_workers.idle_thresh = 64; -	fs_info->endio_meta_write_workers.idle_thresh = 64; +	fs_info->endio_write_workers.idle_thresh = 2; +	fs_info->endio_meta_write_workers.idle_thresh = 2; + +	fs_info->endio_workers.atomic_worker_start = 1; +	fs_info->endio_meta_workers.atomic_worker_start = 1; +	fs_info->endio_write_workers.atomic_worker_start = 1; +	fs_info->endio_meta_write_workers.atomic_worker_start = 1;  	btrfs_start_workers(&fs_info->workers, 1);  	btrfs_start_workers(&fs_info->submit_workers, 1);  	btrfs_start_workers(&fs_info->delalloc_workers, 1);  	btrfs_start_workers(&fs_info->fixup_workers, 1); -	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); -	btrfs_start_workers(&fs_info->endio_meta_workers, -			    fs_info->thread_pool_size); -	btrfs_start_workers(&fs_info->endio_meta_write_workers, -			    fs_info->thread_pool_size); -	btrfs_start_workers(&fs_info->endio_write_workers, -			    fs_info->thread_pool_size); +	btrfs_start_workers(&fs_info->endio_workers, 1); +	btrfs_start_workers(&fs_info->endio_meta_workers, 1); +	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); +	btrfs_start_workers(&fs_info->endio_write_workers, 1);  	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);  	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72a2b9c28e9..edd86ae9e14 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5396,9 +5396,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,  	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);  	while (1) {  		int ret; -		spin_lock(&em_tree->lock); +		write_lock(&em_tree->lock);  		ret = add_extent_mapping(em_tree, em); -		spin_unlock(&em_tree->lock); +		write_unlock(&em_tree->lock);  		if (ret != -EEXIST) {  			free_extent_map(em);  			break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 68260180f58..a102422cd92 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree,  	}  	if (bits & EXTENT_DIRTY)  		tree->dirty_bytes += end - start + 1; -	set_state_cb(tree, state, bits); -	state->state |= bits;  	state->start = start;  	state->end = end; +	set_state_cb(tree, state, bits); +	state->state |= bits;  	node = tree_insert(&tree->state, end, &state->rb_node);  	if (node) {  		struct extent_state *found; @@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree,   * bits were already set, or zero if none of the bits were already set.   
*/  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, -		     int bits, int wake, int delete, gfp_t mask) +		     int bits, int wake, int delete, +		     struct extent_state **cached_state, +		     gfp_t mask)  {  	struct extent_state *state; +	struct extent_state *cached;  	struct extent_state *prealloc = NULL; +	struct rb_node *next_node;  	struct rb_node *node;  	u64 last_end;  	int err; @@ -488,6 +492,17 @@ again:  	}  	spin_lock(&tree->lock); +	if (cached_state) { +		cached = *cached_state; +		*cached_state = NULL; +		if (cached->tree && cached->start == start) { +			atomic_dec(&cached->refs); +			state = cached; +			last_end = state->end; +			goto found; +		} +		free_extent_state(cached); +	}  	/*  	 * this search will find the extents that end after  	 * our range starts @@ -496,6 +511,7 @@ again:  	if (!node)  		goto out;  	state = rb_entry(node, struct extent_state, rb_node); +hit_next:  	if (state->start > end)  		goto out;  	WARN_ON(state->end < start); @@ -555,11 +571,21 @@ again:  		prealloc = NULL;  		goto out;  	} - +found: +	if (state->end < end && prealloc && !need_resched()) +		next_node = rb_next(&state->rb_node); +	else +		next_node = NULL;  	set |= clear_state_bit(tree, state, bits, wake, delete);  	if (last_end == (u64)-1)  		goto out;  	start = last_end + 1; +	if (start <= end && next_node) { +		state = rb_entry(next_node, struct extent_state, +				 rb_node); +		if (state->start == start) +			goto hit_next; +	}  	goto search_again;  out: @@ -653,26 +679,37 @@ static void set_state_bits(struct extent_io_tree *tree,  	state->state |= bits;  } +static void cache_state(struct extent_state *state, +			struct extent_state **cached_ptr) +{ +	if (cached_ptr && !(*cached_ptr)) { +		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { +			*cached_ptr = state; +			atomic_inc(&state->refs); +		} +	} +} +  /* - * set some bits on a range in the tree.  This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. + * set some bits on a range in the tree.  This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed.   * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set.  The start of the existing - * range is returned in failed_start in this case. + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set.  The start of the + * existing range is returned in failed_start in this case.   * - * [start, end] is inclusive - * This takes the tree lock. + * [start, end] is inclusive This takes the tree lock.   */ +  static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, -			  int bits, int exclusive, u64 *failed_start, +			  int bits, int exclusive_bits, u64 *failed_start, +			  struct extent_state **cached_state,  			  gfp_t mask)  {  	struct extent_state *state;  	struct extent_state *prealloc = NULL;  	struct rb_node *node;  	int err = 0; -	int set;  	u64 last_start;  	u64 last_end;  again: @@ -683,6 +720,13 @@ again:  	}  	spin_lock(&tree->lock); +	if (cached_state && *cached_state) { +		state = *cached_state; +		if (state->start == start && state->tree) { +			node = &state->rb_node; +			goto hit_next; +		} +	}  	/*  	 * this search will find all the extents that end after  	 * our range starts. 
@@ -694,8 +738,8 @@ again:  		BUG_ON(err == -EEXIST);  		goto out;  	} -  	state = rb_entry(node, struct extent_state, rb_node); +hit_next:  	last_start = state->start;  	last_end = state->end; @@ -706,17 +750,28 @@ again:  	 * Just lock what we found and keep going  	 */  	if (state->start == start && state->end <= end) { -		set = state->state & bits; -		if (set && exclusive) { +		struct rb_node *next_node; +		if (state->state & exclusive_bits) {  			*failed_start = state->start;  			err = -EEXIST;  			goto out;  		}  		set_state_bits(tree, state, bits); +		cache_state(state, cached_state);  		merge_state(tree, state);  		if (last_end == (u64)-1)  			goto out; +  		start = last_end + 1; +		if (start < end && prealloc && !need_resched()) { +			next_node = rb_next(node); +			if (next_node) { +				state = rb_entry(next_node, struct extent_state, +						 rb_node); +				if (state->start == start) +					goto hit_next; +			} +		}  		goto search_again;  	} @@ -737,8 +792,7 @@ again:  	 * desired bit on it.  	 */  	if (state->start < start) { -		set = state->state & bits; -		if (exclusive && set) { +		if (state->state & exclusive_bits) {  			*failed_start = start;  			err = -EEXIST;  			goto out; @@ -750,6 +804,7 @@ again:  			goto out;  		if (state->end <= end) {  			set_state_bits(tree, state, bits); +			cache_state(state, cached_state);  			merge_state(tree, state);  			if (last_end == (u64)-1)  				goto out; @@ -774,6 +829,7 @@ again:  			this_end = last_start - 1;  		err = insert_state(tree, prealloc, start, this_end,  				   bits); +		cache_state(prealloc, cached_state);  		prealloc = NULL;  		BUG_ON(err == -EEXIST);  		if (err) @@ -788,8 +844,7 @@ again:  	 * on the first half  	 */  	if (state->start <= end && state->end > end) { -		set = state->state & bits; -		if (exclusive && set) { +		if (state->state & exclusive_bits) {  			*failed_start = start;  			err = -EEXIST;  			goto out; @@ -798,6 +853,7 @@ again:  		BUG_ON(err == -EEXIST);  		set_state_bits(tree, prealloc, bits); +		cache_state(prealloc, cached_state);  		merge_state(tree, prealloc);  		prealloc = NULL;  		goto out; @@ -826,86 +882,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		     gfp_t mask)  {  	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, -			      mask); -} - -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, -		       gfp_t mask) -{ -	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); +			      NULL, mask);  }  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,  		    int bits, gfp_t mask)  {  	return set_extent_bit(tree, start, end, bits, 0, NULL, -			      mask); +			      NULL, mask);  }  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,  		      int bits, gfp_t mask)  { -	return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);  }  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,  		     gfp_t mask)  {  	return set_extent_bit(tree, start, end, -			      EXTENT_DELALLOC | EXTENT_DIRTY, -			      0, NULL, mask); +			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, +			      0, NULL, NULL, mask);  }  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,  		       gfp_t mask)  {  	return clear_extent_bit(tree, start, end, -				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} - -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, -			 gfp_t mask) -{ 
-	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); +				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, +				NULL, mask);  }  int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,  		     gfp_t mask)  {  	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, -			      mask); +			      NULL, mask);  }  static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,  		       gfp_t mask)  { -	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, +				NULL, mask);  }  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,  			gfp_t mask)  {  	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, -			      mask); +			      NULL, mask);  }  static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,  				 u64 end, gfp_t mask)  { -	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); -} - -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, -			 gfp_t mask) -{ -	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, -			      0, NULL, mask); -} - -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, -				  u64 end, gfp_t mask) -{ -	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, +				NULL, mask);  }  int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) @@ -917,13 +951,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)   * either insert or lock state struct between start and end use mask to tell   * us if waiting is desired.   */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, +		     int bits, struct extent_state **cached_state, gfp_t mask)  {  	int err;  	u64 failed_start;  	while (1) { -		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, -				     &failed_start, mask); +		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, +				     EXTENT_LOCKED, &failed_start, +				     cached_state, mask);  		if (err == -EEXIST && (mask & __GFP_WAIT)) {  			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);  			start = failed_start; @@ -935,27 +971,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)  	return err;  } +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ +	return lock_extent_bits(tree, start, end, 0, NULL, mask); +} +  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,  		    gfp_t mask)  {  	int err;  	u64 failed_start; -	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, -			     &failed_start, mask); +	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, +			     &failed_start, NULL, mask);  	if (err == -EEXIST) {  		if (failed_start > start)  			clear_extent_bit(tree, start, failed_start - 1, -					 EXTENT_LOCKED, 1, 0, mask); +					 EXTENT_LOCKED, 1, 0, NULL, mask);  		return 0;  	}  	return 1;  } +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, +			 struct extent_state **cached, gfp_t mask) +{ +	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, +				mask); +} +  int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,  		  gfp_t mask)  { -	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +	return 
clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, +				mask);  }  /* @@ -974,7 +1023,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)  		page_cache_release(page);  		index++;  	} -	set_extent_dirty(tree, start, end, GFP_NOFS);  	return 0;  } @@ -994,7 +1042,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)  		page_cache_release(page);  		index++;  	} -	set_extent_writeback(tree, start, end, GFP_NOFS);  	return 0;  } @@ -1232,6 +1279,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,  	u64 delalloc_start;  	u64 delalloc_end;  	u64 found; +	struct extent_state *cached_state = NULL;  	int ret;  	int loops = 0; @@ -1269,6 +1317,7 @@ again:  		/* some of the pages are gone, lets avoid looping by  		 * shortening the size of the delalloc range we're searching  		 */ +		free_extent_state(cached_state);  		if (!loops) {  			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);  			max_bytes = PAGE_CACHE_SIZE - offset; @@ -1282,18 +1331,21 @@ again:  	BUG_ON(ret);  	/* step three, lock the state bits for the whole range */ -	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); +	lock_extent_bits(tree, delalloc_start, delalloc_end, +			 0, &cached_state, GFP_NOFS);  	/* then test to make sure it is all still delalloc */  	ret = test_range_bit(tree, delalloc_start, delalloc_end, -			     EXTENT_DELALLOC, 1); +			     EXTENT_DELALLOC, 1, cached_state);  	if (!ret) { -		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); +		unlock_extent_cached(tree, delalloc_start, delalloc_end, +				     &cached_state, GFP_NOFS);  		__unlock_for_delalloc(inode, locked_page,  			      delalloc_start, delalloc_end);  		cond_resched();  		goto again;  	} +	free_extent_state(cached_state);  	*start = delalloc_start;  	*end = delalloc_end;  out_failed: @@ -1307,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,  				int clear_unlock,  				int clear_delalloc, int clear_dirty,  				int set_writeback, -				int end_writeback) +				int end_writeback, +				int set_private2)  {  	int ret;  	struct page *pages[16]; @@ -1325,8 +1378,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,  	if (clear_delalloc)  		clear_bits |= EXTENT_DELALLOC; -	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); -	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) +	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); +	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || +	      set_private2))  		return 0;  	while (nr_pages > 0) { @@ -1334,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,  				     min_t(unsigned long,  				     nr_pages, ARRAY_SIZE(pages)), pages);  		for (i = 0; i < ret; i++) { + +			if (set_private2) +				SetPagePrivate2(pages[i]); +  			if (pages[i] == locked_page) {  				page_cache_release(pages[i]);  				continue; @@ -1476,14 +1534,17 @@ out:   * range is found set.   
*/  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, -		   int bits, int filled) +		   int bits, int filled, struct extent_state *cached)  {  	struct extent_state *state = NULL;  	struct rb_node *node;  	int bitset = 0;  	spin_lock(&tree->lock); -	node = tree_search(tree, start); +	if (cached && cached->tree && cached->start == start) +		node = &cached->rb_node; +	else +		node = tree_search(tree, start);  	while (node && start <= end) {  		state = rb_entry(node, struct extent_state, rb_node); @@ -1526,7 +1587,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,  {  	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;  	u64 end = start + PAGE_CACHE_SIZE - 1; -	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) +	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))  		SetPageUptodate(page);  	return 0;  } @@ -1540,7 +1601,7 @@ static int check_page_locked(struct extent_io_tree *tree,  {  	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;  	u64 end = start + PAGE_CACHE_SIZE - 1; -	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) +	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))  		unlock_page(page);  	return 0;  } @@ -1552,10 +1613,7 @@ static int check_page_locked(struct extent_io_tree *tree,  static int check_page_writeback(struct extent_io_tree *tree,  			     struct page *page)  { -	u64 start = (u64)page->index << PAGE_CACHE_SHIFT; -	u64 end = start + PAGE_CACHE_SIZE - 1; -	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) -		end_page_writeback(page); +	end_page_writeback(page);  	return 0;  } @@ -1613,13 +1671,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)  		}  		if (!uptodate) { -			clear_extent_uptodate(tree, start, end, GFP_ATOMIC); +			clear_extent_uptodate(tree, start, end, GFP_NOFS);  			ClearPageUptodate(page);  			SetPageError(page);  		} -		clear_extent_writeback(tree, start, end, GFP_ATOMIC); -  		if (whole_page)  			end_page_writeback(page);  		else @@ -1983,7 +2039,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  			continue;  		}  		/* the get_extent function already copied into the page */ -		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { +		if (test_range_bit(tree, cur, cur_end, +				   EXTENT_UPTODATE, 1, NULL)) {  			check_page_uptodate(tree, page);  			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);  			cur = cur + iosize; @@ -2078,6 +2135,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	u64 iosize;  	u64 unlock_start;  	sector_t sector; +	struct extent_state *cached_state = NULL;  	struct extent_map *em;  	struct block_device *bdev;  	int ret; @@ -2124,6 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	delalloc_end = 0;  	page_started = 0;  	if (!epd->extent_locked) { +		u64 delalloc_to_write;  		/*  		 * make sure the wbc mapping index is at least updated  		 * to this page. 
@@ -2143,6 +2202,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  			tree->ops->fill_delalloc(inode, page, delalloc_start,  						 delalloc_end, &page_started,  						 &nr_written); +			delalloc_to_write = (delalloc_end - +					max_t(u64, page_offset(page), +					      delalloc_start) + 1) >> +				        PAGE_CACHE_SHIFT; +			if (wbc->nr_to_write < delalloc_to_write) { +				wbc->nr_to_write = min_t(long, 8192, +						 delalloc_to_write); +			}  			delalloc_start = delalloc_end + 1;  		} @@ -2160,15 +2227,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  			goto done_unlocked;  		}  	} -	lock_extent(tree, start, page_end, GFP_NOFS); - -	unlock_start = start; -  	if (tree->ops && tree->ops->writepage_start_hook) {  		ret = tree->ops->writepage_start_hook(page, start,  						      page_end);  		if (ret == -EAGAIN) { -			unlock_extent(tree, start, page_end, GFP_NOFS);  			redirty_page_for_writepage(wbc, page);  			update_nr_written(page, wbc, nr_written);  			unlock_page(page); @@ -2184,12 +2246,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	update_nr_written(page, wbc, nr_written + 1);  	end = page_end; -	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) -		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); -  	if (last_byte <= start) { -		clear_extent_dirty(tree, start, page_end, GFP_NOFS); -		unlock_extent(tree, start, page_end, GFP_NOFS);  		if (tree->ops && tree->ops->writepage_end_io_hook)  			tree->ops->writepage_end_io_hook(page, start,  							 page_end, NULL, 1); @@ -2197,13 +2254,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		goto done;  	} -	set_extent_uptodate(tree, start, page_end, GFP_NOFS);  	blocksize = inode->i_sb->s_blocksize;  	while (cur <= end) {  		if (cur >= last_byte) { -			clear_extent_dirty(tree, cur, page_end, GFP_NOFS); -			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);  			if (tree->ops && tree->ops->writepage_end_io_hook)  				tree->ops->writepage_end_io_hook(page, cur,  							 page_end, NULL, 1); @@ -2235,12 +2289,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		 */  		if (compressed || block_start == EXTENT_MAP_HOLE ||  		    block_start == EXTENT_MAP_INLINE) { -			clear_extent_dirty(tree, cur, -					   cur + iosize - 1, GFP_NOFS); - -			unlock_extent(tree, unlock_start, cur + iosize - 1, -				      GFP_NOFS); -  			/*  			 * end_io notification does not happen here for  			 * compressed extents @@ -2265,13 +2313,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		}  		/* leave this out until we have a page_mkwrite call */  		if (0 && !test_range_bit(tree, cur, cur + iosize - 1, -				   EXTENT_DIRTY, 0)) { +				   EXTENT_DIRTY, 0, NULL)) {  			cur = cur + iosize;  			pg_offset += iosize;  			continue;  		} -		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);  		if (tree->ops && tree->ops->writepage_io_hook) {  			ret = tree->ops->writepage_io_hook(page, cur,  						cur + iosize - 1); @@ -2309,12 +2356,12 @@ done:  		set_page_writeback(page);  		end_page_writeback(page);  	} -	if (unlock_start <= page_end) -		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);  	unlock_page(page);  done_unlocked: +	/* drop our reference on any cached states */ +	free_extent_state(cached_state);  	return 0;  } @@ -2339,7 +2386,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  			     
writepage_t writepage, void *data,  			     void (*flush_fn)(void *))  { -	struct backing_dev_info *bdi = mapping->backing_dev_info;  	int ret = 0;  	int done = 0;  	struct pagevec pvec; @@ -2414,10 +2460,6 @@ retry:  			}  			if (ret || wbc->nr_to_write <= 0)  				done = 1; -			if (wbc->nonblocking && bdi_write_congested(bdi)) { -				wbc->encountered_congestion = 1; -				done = 1; -			}  		}  		pagevec_release(&pvec);  		cond_resched(); @@ -2604,10 +2646,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,  		return 0;  	lock_extent(tree, start, end, GFP_NOFS); -	wait_on_extent_writeback(tree, start, end); +	wait_on_page_writeback(page);  	clear_extent_bit(tree, start, end,  			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, -			 1, 1, GFP_NOFS); +			 1, 1, NULL, GFP_NOFS);  	return 0;  } @@ -2687,7 +2729,7 @@ int extent_prepare_write(struct extent_io_tree *tree,  		    !isnew && !PageUptodate(page) &&  		    (block_off_end > to || block_off_start < from) &&  		    !test_range_bit(tree, block_start, cur_end, -				    EXTENT_UPTODATE, 1)) { +				    EXTENT_UPTODATE, 1, NULL)) {  			u64 sector;  			u64 extent_offset = block_start - em->start;  			size_t iosize; @@ -2701,7 +2743,7 @@ int extent_prepare_write(struct extent_io_tree *tree,  			 */  			set_extent_bit(tree, block_start,  				       block_start + iosize - 1, -				       EXTENT_LOCKED, 0, NULL, GFP_NOFS); +				       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);  			ret = submit_extent_page(READ, tree, page,  					 sector, iosize, page_offset, em->bdev,  					 NULL, 1, @@ -2742,13 +2784,13 @@ int try_release_extent_state(struct extent_map_tree *map,  	int ret = 1;  	if (test_range_bit(tree, start, end, -			   EXTENT_IOBITS | EXTENT_ORDERED, 0)) +			   EXTENT_IOBITS, 0, NULL))  		ret = 0;  	else {  		if ((mask & GFP_NOFS) == GFP_NOFS)  			mask = GFP_NOFS;  		clear_extent_bit(tree, start, end, EXTENT_UPTODATE, -				 1, 1, mask); +				 1, 1, NULL, mask);  	}  	return ret;  } @@ -2771,29 +2813,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,  		u64 len;  		while (start <= end) {  			len = end - start + 1; -			spin_lock(&map->lock); +			write_lock(&map->lock);  			em = lookup_extent_mapping(map, start, len);  			if (!em || IS_ERR(em)) { -				spin_unlock(&map->lock); +				write_unlock(&map->lock);  				break;  			}  			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||  			    em->start != start) { -				spin_unlock(&map->lock); +				write_unlock(&map->lock);  				free_extent_map(em);  				break;  			}  			if (!test_range_bit(tree, em->start,  					    extent_map_end(em) - 1, -					    EXTENT_LOCKED | EXTENT_WRITEBACK | -					    EXTENT_ORDERED, -					    0)) { +					    EXTENT_LOCKED | EXTENT_WRITEBACK, +					    0, NULL)) {  				remove_extent_mapping(map, em);  				/* once for the rb tree */  				free_extent_map(em);  			}  			start = extent_map_end(em); -			spin_unlock(&map->lock); +			write_unlock(&map->lock);  			/* once for us */  			free_extent_map(em); @@ -3203,7 +3244,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,  	int uptodate;  	unsigned long index; -	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); +	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);  	if (ret)  		return 1;  	while (start <= end) { @@ -3233,7 +3274,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,  		return 1;  	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, -			   EXTENT_UPTODATE, 1); +			   EXTENT_UPTODATE, 1, NULL);  	if (ret)  		return ret; @@ -3269,7 +3310,7 @@ int 
read_extent_buffer_pages(struct extent_io_tree *tree,  		return 0;  	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, -			   EXTENT_UPTODATE, 1)) { +			   EXTENT_UPTODATE, 1, NULL)) {  		return 0;  	} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5bc20abf3f3..14ed16fd862 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -13,10 +13,8 @@  #define EXTENT_DEFRAG (1 << 6)  #define EXTENT_DEFRAG_DONE (1 << 7)  #define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_ORDERED (1 << 9) -#define EXTENT_ORDERED_METADATA (1 << 10) -#define EXTENT_BOUNDARY (1 << 11) -#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10)  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)  /* flags for bio submission */ @@ -142,6 +140,8 @@ int try_release_extent_state(struct extent_map_tree *map,  			     struct extent_io_tree *tree, struct page *page,  			     gfp_t mask);  int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, +		     int bits, struct extent_state **cached, gfp_t mask);  int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,  		    gfp_t mask); @@ -155,11 +155,12 @@ u64 count_range_bits(struct extent_io_tree *tree,  		     u64 max_bytes, unsigned long bits);  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, -		   int bits, int filled); +		   int bits, int filled, struct extent_state *cached_state);  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,  		      int bits, gfp_t mask);  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, -		     int bits, int wake, int delete, gfp_t mask); +		     int bits, int wake, int delete, struct extent_state **cached, +		     gfp_t mask);  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,  		    int bits, gfp_t mask);  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, @@ -282,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,  				int clear_unlock,  				int clear_delalloc, int clear_dirty,  				int set_writeback, -				int end_writeback); +				int end_writeback, +				int set_private2);  #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 30c9365861e..5bc7a0d325e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -36,7 +36,7 @@ void extent_map_exit(void)  void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)  {  	tree->map.rb_node = NULL; -	spin_lock_init(&tree->lock); +	rwlock_init(&tree->lock);  }  /** @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)  	return 0;  } +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ +	int ret = 0; +	struct extent_map *merge = NULL; +	struct rb_node *rb; +	struct extent_map *em; + +	write_lock(&tree->lock); +	em = lookup_extent_mapping(tree, start, len); + +	WARN_ON(em->start != start || !em); + +	if (!em) +		goto out; + +	clear_bit(EXTENT_FLAG_PINNED, &em->flags); + +	if (em->start != 0) { +		rb = rb_prev(&em->rb_node); +		if (rb) +			merge = rb_entry(rb, struct extent_map, rb_node); +		if (rb && mergable_maps(merge, em)) { +			em->start = merge->start; +			em->len += merge->len; +			em->block_len += merge->block_len; +			em->block_start = merge->block_start; +			merge->in_tree = 0; +			rb_erase(&merge->rb_node, 
&tree->map); +			free_extent_map(merge); +		} +	} + +	rb = rb_next(&em->rb_node); +	if (rb) +		merge = rb_entry(rb, struct extent_map, rb_node); +	if (rb && mergable_maps(em, merge)) { +		em->len += merge->len; +		em->block_len += merge->len; +		rb_erase(&merge->rb_node, &tree->map); +		merge->in_tree = 0; +		free_extent_map(merge); +	} + +	free_extent_map(em); +out: +	write_unlock(&tree->lock); +	return ret; + +} +  /**   * add_extent_mapping - add new extent map to the extent tree   * @tree:	tree to insert new map in @@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,  		ret = -EEXIST;  		goto out;  	} -	assert_spin_locked(&tree->lock);  	rb = tree_insert(&tree->map, em->start, &em->rb_node);  	if (rb) {  		ret = -EEXIST; @@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,  	struct rb_node *next = NULL;  	u64 end = range_end(start, len); -	assert_spin_locked(&tree->lock);  	rb_node = __tree_search(&tree->map, start, &prev, &next);  	if (!rb_node && prev) {  		em = rb_entry(prev, struct extent_map, rb_node); @@ -331,7 +379,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)  	int ret = 0;  	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); -	assert_spin_locked(&tree->lock);  	rb_erase(&em->rb_node, &tree->map);  	em->in_tree = 0;  	return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index fb6eeef06bb..d3d442f4bbb 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -31,7 +31,7 @@ struct extent_map {  struct extent_map_tree {  	struct rb_root map; -	spinlock_t lock; +	rwlock_t lock;  };  static inline u64 extent_map_end(struct extent_map *em) @@ -59,4 +59,5 @@ struct extent_map *alloc_extent_map(gfp_t mask);  void free_extent_map(struct extent_map *em);  int __init extent_map_init(void);  void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);  #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b833972273..571ad3c13b4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,  	int err = 0;  	int i;  	struct inode *inode = fdentry(file)->d_inode; -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -	u64 hint_byte;  	u64 num_bytes;  	u64 start_pos;  	u64 end_of_last_block; @@ -125,22 +123,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,  		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);  	end_of_last_block = start_pos + num_bytes - 1; - -	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); -	trans = btrfs_join_transaction(root, 1); -	if (!trans) { -		err = -ENOMEM; -		goto out_unlock; -	} -	btrfs_set_trans_block_group(trans, inode); -	hint_byte = 0; - -	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); - -	/* check for reserved extents on each page, we don't want -	 * to reset the delalloc bit on things that already have -	 * extents reserved. -	 */  	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);  	for (i = 0; i < num_pages; i++) {  		struct page *p = pages[i]; @@ -155,9 +137,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,  		 * at this time.  		 
*/  	} -	err = btrfs_end_transaction(trans, root); -out_unlock: -	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);  	return err;  } @@ -189,18 +168,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  		if (!split2)  			split2 = alloc_extent_map(GFP_NOFS); -		spin_lock(&em_tree->lock); +		write_lock(&em_tree->lock);  		em = lookup_extent_mapping(em_tree, start, len);  		if (!em) { -			spin_unlock(&em_tree->lock); +			write_unlock(&em_tree->lock);  			break;  		}  		flags = em->flags;  		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { -			spin_unlock(&em_tree->lock);  			if (em->start <= start &&  			    (!testend || em->start + em->len >= start + len)) {  				free_extent_map(em); +				write_unlock(&em_tree->lock);  				break;  			}  			if (start < em->start) { @@ -210,6 +189,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  				start = em->start + em->len;  			}  			free_extent_map(em); +			write_unlock(&em_tree->lock);  			continue;  		}  		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -260,7 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			free_extent_map(split);  			split = NULL;  		} -		spin_unlock(&em_tree->lock); +		write_unlock(&em_tree->lock);  		/* once for us */  		free_extent_map(em); @@ -289,7 +269,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct inode *inode,  		       u64 start, u64 end, u64 locked_end, -		       u64 inline_limit, u64 *hint_byte) +		       u64 inline_limit, u64 *hint_byte, int drop_cache)  {  	u64 extent_end = 0;  	u64 search_start = start; @@ -314,7 +294,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,  	int ret;  	inline_limit = 0; -	btrfs_drop_extent_cache(inode, start, end - 1, 0); +	if (drop_cache) +		btrfs_drop_extent_cache(inode, start, end - 1, 0);  	path = btrfs_alloc_path();  	if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59cba180fe8..941f1b71cd2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  	}  	ret = btrfs_drop_extents(trans, root, inode, start, -				 aligned_end, aligned_end, start, &hint_byte); +				 aligned_end, aligned_end, start, +				 &hint_byte, 1);  	BUG_ON(ret);  	if (isize > actual_end) @@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  				   inline_len, compressed_size,  				   compressed_pages);  	BUG_ON(ret); -	btrfs_drop_extent_cache(inode, start, aligned_end, 0); +	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);  	return 0;  } @@ -425,7 +426,7 @@ again:  			extent_clear_unlock_delalloc(inode,  						     &BTRFS_I(inode)->io_tree,  						     start, end, NULL, 1, 0, -						     0, 1, 1, 1); +						     0, 1, 1, 1, 0);  			ret = 0;  			goto free_pages_out;  		} @@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,  		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		while (1) { -			spin_lock(&em_tree->lock); +			write_lock(&em_tree->lock);  			ret = add_extent_mapping(em_tree, em); -			spin_unlock(&em_tree->lock); +			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em);  				break; @@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,  					     async_extent->start,  					     
async_extent->start +  					     async_extent->ram_size - 1, -					     NULL, 1, 1, 0, 1, 1, 0); +					     NULL, 1, 1, 0, 1, 1, 0, 0);  		ret = btrfs_submit_compressed_write(inode,  				    async_extent->start, @@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,  			extent_clear_unlock_delalloc(inode,  						     &BTRFS_I(inode)->io_tree,  						     start, end, NULL, 1, 1, -						     1, 1, 1, 1); +						     1, 1, 1, 1, 0);  			*nr_written = *nr_written +  			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;  			*page_started = 1; @@ -747,9 +748,9 @@ static noinline int cow_file_range(struct inode *inode,  		set_bit(EXTENT_FLAG_PINNED, &em->flags);  		while (1) { -			spin_lock(&em_tree->lock); +			write_lock(&em_tree->lock);  			ret = add_extent_mapping(em_tree, em); -			spin_unlock(&em_tree->lock); +			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em);  				break; @@ -776,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode,  		/* we're not doing compressed IO, don't unlock the first  		 * page (which the caller expects to stay locked), don't  		 * clear any dirty bits and don't set any writeback bits +		 * +		 * Do set the Private2 bit so we know this page was properly +		 * setup for writepage  		 */  		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,  					     start, start + ram_size - 1,  					     locked_page, unlock, 1, -					     1, 0, 0, 0); +					     1, 0, 0, 0, 1);  		disk_num_bytes -= cur_alloc_size;  		num_bytes -= cur_alloc_size;  		alloc_hint = ins.objectid + ins.offset; @@ -853,7 +857,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  	int limit = 10 * 1024 * 1042;  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | -			 EXTENT_DELALLOC, 1, 0, GFP_NOFS); +			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);  	while (start < end) {  		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);  		async_cow->inode = inode; @@ -1080,9 +1084,9 @@ out_check:  			em->bdev = root->fs_info->fs_devices->latest_bdev;  			set_bit(EXTENT_FLAG_PINNED, &em->flags);  			while (1) { -				spin_lock(&em_tree->lock); +				write_lock(&em_tree->lock);  				ret = add_extent_mapping(em_tree, em); -				spin_unlock(&em_tree->lock); +				write_unlock(&em_tree->lock);  				if (ret != -EEXIST) {  					free_extent_map(em);  					break; @@ -1101,7 +1105,7 @@ out_check:  		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,  					cur_offset, cur_offset + num_bytes - 1, -					locked_page, 1, 1, 1, 0, 0, 0); +					locked_page, 1, 1, 1, 0, 0, 0, 1);  		cur_offset = extent_end;  		if (cur_offset > end)  			break; @@ -1374,10 +1378,8 @@ again:  	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);  	/* already ordered? 
We're done */ -	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, -			     EXTENT_ORDERED, 0)) { +	if (PagePrivate2(page))  		goto out; -	}  	ordered = btrfs_lookup_ordered_extent(inode, page_start);  	if (ordered) { @@ -1413,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)  	struct inode *inode = page->mapping->host;  	struct btrfs_writepage_fixup *fixup;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	int ret; -	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, -			     EXTENT_ORDERED, 0); -	if (ret) +	/* this page is properly in the ordered list */ +	if (TestClearPagePrivate2(page))  		return 0;  	if (PageChecked(page)) @@ -1455,9 +1455,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	BUG_ON(!path);  	path->leave_spinning = 1; + +	/* +	 * we may be replacing one extent in the tree with another. +	 * The new extent is pinned in the extent map, and we don't want +	 * to drop it from the cache until it is completely in the btree. +	 * +	 * So, tell btrfs_drop_extents to leave this extent in the cache. +	 * the caller is expected to unpin it and allow it to be merged +	 * with the others. +	 */  	ret = btrfs_drop_extents(trans, root, inode, file_pos,  				 file_pos + num_bytes, locked_end, -				 file_pos, &hint); +				 file_pos, &hint, 0);  	BUG_ON(ret);  	ins.objectid = inode->i_ino; @@ -1485,7 +1495,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	inode_add_bytes(inode, num_bytes); -	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);  	ins.objectid = disk_bytenr;  	ins.offset = disk_num_bytes; @@ -1596,6 +1605,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  						ordered_extent->len,  						compressed, 0, 0,  						BTRFS_FILE_EXTENT_REG); +		unpin_extent_cache(&BTRFS_I(inode)->extent_tree, +				   ordered_extent->file_offset, +				   ordered_extent->len);  		BUG_ON(ret);  	}  	unlock_extent(io_tree, ordered_extent->file_offset, @@ -1623,6 +1635,7 @@ nocow:  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,  				struct extent_state *state, int uptodate)  { +	ClearPagePrivate2(page);  	return btrfs_finish_ordered_io(page->mapping->host, start, end);  } @@ -1669,13 +1682,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,  		failrec->last_mirror = 0;  		failrec->bio_flags = 0; -		spin_lock(&em_tree->lock); +		read_lock(&em_tree->lock);  		em = lookup_extent_mapping(em_tree, start, failrec->len);  		if (em->start > start || em->start + em->len < start) {  			free_extent_map(em);  			em = NULL;  		} -		spin_unlock(&em_tree->lock); +		read_unlock(&em_tree->lock);  		if (!em || IS_ERR(em)) {  			kfree(failrec); @@ -1794,7 +1807,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  		return 0;  	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && -	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { +	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {  		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,  				  GFP_NOFS);  		return 0; @@ -2935,7 +2948,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)  						 cur_offset,  						 cur_offset + hole_size,  						 block_end, -						 cur_offset, &hint_byte); +						 cur_offset, &hint_byte, 1);  			if (err)  				break;  			err = btrfs_insert_file_extent(trans, root, @@ -4064,11 +4077,11 @@ struct extent_map 
*btrfs_get_extent(struct inode *inode, struct page *page,  	int compressed;  again: -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, start, len);  	if (em)  		em->bdev = root->fs_info->fs_devices->latest_bdev; -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	if (em) {  		if (em->start > start || em->start + em->len <= start) @@ -4215,6 +4228,11 @@ again:  				map = kmap(page);  				read_extent_buffer(leaf, map + pg_offset, ptr,  						   copy_size); +				if (pg_offset + copy_size < PAGE_CACHE_SIZE) { +					memset(map + pg_offset + copy_size, 0, +					       PAGE_CACHE_SIZE - pg_offset - +					       copy_size); +				}  				kunmap(page);  			}  			flush_dcache_page(page); @@ -4259,7 +4277,7 @@ insert:  	}  	err = 0; -	spin_lock(&em_tree->lock); +	write_lock(&em_tree->lock);  	ret = add_extent_mapping(em_tree, em);  	/* it is possible that someone inserted the extent into the tree  	 * while we had the lock dropped.  It is also possible that @@ -4299,7 +4317,7 @@ insert:  			err = 0;  		}  	} -	spin_unlock(&em_tree->lock); +	write_unlock(&em_tree->lock);  out:  	if (path)  		btrfs_free_path(path); @@ -4398,13 +4416,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  	u64 page_start = page_offset(page);  	u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + +	/* +	 * we have the page locked, so new writeback can't start, +	 * and the dirty bit won't be cleared while we are here. +	 * +	 * Wait for IO on this page so that we can safely clear +	 * the PagePrivate2 bit and do ordered accounting +	 */  	wait_on_page_writeback(page); +  	tree = &BTRFS_I(page->mapping->host)->io_tree;  	if (offset) {  		btrfs_releasepage(page, GFP_NOFS);  		return;  	} -  	lock_extent(tree, page_start, page_end, GFP_NOFS);  	ordered = btrfs_lookup_ordered_extent(page->mapping->host,  					   page_offset(page)); @@ -4415,16 +4441,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  		 */  		clear_extent_bit(tree, page_start, page_end,  				 EXTENT_DIRTY | EXTENT_DELALLOC | -				 EXTENT_LOCKED, 1, 0, GFP_NOFS); -		btrfs_finish_ordered_io(page->mapping->host, -					page_start, page_end); +				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); +		/* +		 * whoever cleared the private bit is responsible +		 * for the finish_ordered_io +		 */ +		if (TestClearPagePrivate2(page)) { +			btrfs_finish_ordered_io(page->mapping->host, +						page_start, page_end); +		}  		btrfs_put_ordered_extent(ordered);  		lock_extent(tree, page_start, page_end, GFP_NOFS);  	}  	clear_extent_bit(tree, page_start, page_end, -		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | -		 EXTENT_ORDERED, -		 1, 1, GFP_NOFS); +		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, +		 1, 1, NULL, GFP_NOFS);  	__btrfs_releasepage(page, GFP_NOFS);  	ClearPageChecked(page); @@ -4521,11 +4552,14 @@ again:  	}  	ClearPageChecked(page);  	set_page_dirty(page); +	SetPageUptodate(page);  	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;  	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);  out_unlock: +	if (!ret) +		return VM_FAULT_LOCKED;  	unlock_page(page);  out:  	return ret; @@ -5058,6 +5092,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,  						  0, 0, 0,  						  BTRFS_FILE_EXTENT_PREALLOC);  		BUG_ON(ret); +		btrfs_drop_extent_cache(inode, cur_offset, +					cur_offset + ins.offset -1, 0);  		num_bytes -= ins.offset;  		cur_offset += ins.offset;  		alloc_hint = ins.objectid + ins.offset; diff --git a/fs/btrfs/ioctl.c 
b/fs/btrfs/ioctl.c index bd88f25889f..ef0188fb3cc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,9 +596,8 @@ again:  		clear_page_dirty_for_io(page);  		btrfs_set_extent_delalloc(inode, page_start, page_end); - -		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);  		set_page_dirty(page); +		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);  		unlock_page(page);  		page_cache_release(page);  		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); @@ -976,7 +975,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  	/* punch hole in destination first */  	btrfs_drop_extents(trans, root, inode, off, off + len, -			   off + len, 0, &hint_byte); +			   off + len, 0, &hint_byte, 1);  	/* clone data */  	key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806c682..4a9c8c4cec2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,   *   * len is the length of the extent   * - * This also sets the EXTENT_ORDERED bit on the range in the inode. - *   * The tree is given a single reference on the ordered extent that was   * inserted.   */ @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	entry->start = start;  	entry->len = len;  	entry->disk_len = disk_len; +	entry->bytes_left = len;  	entry->inode = inode;  	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)  		set_bit(type, &entry->flags); @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  			   &entry->rb_node);  	BUG_ON(node); -	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, -			   entry_end(entry) - 1, GFP_NOFS); -  	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);  	list_add_tail(&entry->root_extent_list,  		      &BTRFS_I(inode)->root->fs_info->ordered_extents); @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node;  	struct btrfs_ordered_extent *entry; -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	int ret;  	tree = &BTRFS_I(inode)->ordered_tree;  	mutex_lock(&tree->mutex); -	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, -			     GFP_NOFS);  	node = tree_search(tree, file_offset);  	if (!node) {  		ret = 1; @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,  		goto out;  	} -	ret = test_range_bit(io_tree, entry->file_offset, -			     entry->file_offset + entry->len - 1, -			     EXTENT_ORDERED, 0); -	if (ret == 0) +	if (io_size > entry->bytes_left) { +		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", +		       (unsigned long long)entry->bytes_left, +		       (unsigned long long)io_size); +	} +	entry->bytes_left -= io_size; +	if (entry->bytes_left == 0)  		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +	else +		ret = 1;  out:  	mutex_unlock(&tree->mutex);  	return ret == 0; @@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  	u64 orig_end;  	u64 wait_end;  	struct btrfs_ordered_extent *ordered; +	int found;  	if (start + len < start) {  		orig_end = INT_LIMIT(loff_t); @@ -502,6 +501,7 @@ again:  					   orig_end >> PAGE_CACHE_SHIFT);  	end = orig_end; +	found = 0;  	while (1) {  		ordered = btrfs_lookup_first_ordered_extent(inode, end);  		if (!ordered) @@ -514,6 +514,7 @@ again:  			
btrfs_put_ordered_extent(ordered);  			break;  		} +		found++;  		btrfs_start_ordered_extent(inode, ordered, 1);  		end = ordered->file_offset;  		btrfs_put_ordered_extent(ordered); @@ -521,8 +522,8 @@ again:  			break;  		end--;  	} -	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, -			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { +	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, +			   EXTENT_DELALLOC, 0, NULL)) {  		schedule_timeout(1);  		goto again;  	} @@ -613,7 +614,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,  	 */  	if (test_range_bit(io_tree, disk_i_size,  			   ordered->file_offset + ordered->len - 1, -			   EXTENT_DELALLOC, 0)) { +			   EXTENT_DELALLOC, 0, NULL)) {  		goto out;  	}  	/* @@ -664,7 +665,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,  	 */  	if (i_size_test > entry_end(ordered) &&  	    !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, -			   EXTENT_DELALLOC, 0)) { +			   EXTENT_DELALLOC, 0, NULL)) {  		new_i_size = min_t(u64, i_size_test, i_size_read(inode));  	}  	BTRFS_I(inode)->disk_i_size = new_i_size; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3d31c8827b0..993a7ea45c7 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -85,6 +85,9 @@ struct btrfs_ordered_extent {  	/* extent length on disk */  	u64 disk_len; +	/* number of bytes that still need writing */ +	u64 bytes_left; +  	/* flags (described above) */  	unsigned long flags; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c04f7f21260..3be16ccc7ee 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2180,7 +2180,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,  				struct reloc_control *rc)  {  	if (test_range_bit(&rc->processed_blocks, bytenr, -			   bytenr + blocksize - 1, EXTENT_DIRTY, 1)) +			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))  		return 1;  	return 0;  } @@ -2646,9 +2646,9 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)  	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);  	while (1) {  		int ret; -		spin_lock(&em_tree->lock); +		write_lock(&em_tree->lock);  		ret = add_extent_mapping(em_tree, em); -		spin_unlock(&em_tree->lock); +		write_unlock(&em_tree->lock);  		if (ret != -EEXIST) {  			free_extent_map(em);  			break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d91b0de7c50..8661a7381b3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  	saved_nbytes = inode_get_bytes(inode);  	/* drop any overlapping extents */  	ret = btrfs_drop_extents(trans, root, inode, -			 start, extent_end, extent_end, start, &alloc_hint); +			 start, extent_end, extent_end, start, &alloc_hint, 1);  	BUG_ON(ret);  	if (found_type == BTRFS_FILE_EXTENT_REG || diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5dbefd11b4a..d2358c06bbd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -276,7 +276,7 @@ loop_lock:  		 * is now congested.  
Back off and let other work structs  		 * run instead  		 */ -		if (pending && bdi_write_congested(bdi) && batch_run > 32 && +		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&  		    fs_info->fs_devices->open_devices > 1) {  			struct io_context *ioc; @@ -1749,9 +1749,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,  	 * step two, delete the device extents and the  	 * chunk tree entries  	 */ -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, chunk_offset, 1); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	BUG_ON(em->start > chunk_offset ||  	       em->start + em->len < chunk_offset); @@ -1780,9 +1780,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,  	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);  	BUG_ON(ret); -	spin_lock(&em_tree->lock); +	write_lock(&em_tree->lock);  	remove_extent_mapping(em_tree, em); -	spin_unlock(&em_tree->lock); +	write_unlock(&em_tree->lock);  	kfree(map);  	em->bdev = NULL; @@ -2294,9 +2294,9 @@ again:  	em->block_len = em->len;  	em_tree = &extent_root->fs_info->mapping_tree.map_tree; -	spin_lock(&em_tree->lock); +	write_lock(&em_tree->lock);  	ret = add_extent_mapping(em_tree, em); -	spin_unlock(&em_tree->lock); +	write_unlock(&em_tree->lock);  	BUG_ON(ret);  	free_extent_map(em); @@ -2491,9 +2491,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)  	int readonly = 0;  	int i; -	spin_lock(&map_tree->map_tree.lock); +	read_lock(&map_tree->map_tree.lock);  	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); -	spin_unlock(&map_tree->map_tree.lock); +	read_unlock(&map_tree->map_tree.lock);  	if (!em)  		return 1; @@ -2518,11 +2518,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)  	struct extent_map *em;  	while (1) { -		spin_lock(&tree->map_tree.lock); +		write_lock(&tree->map_tree.lock);  		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);  		if (em)  			remove_extent_mapping(&tree->map_tree, em); -		spin_unlock(&tree->map_tree.lock); +		write_unlock(&tree->map_tree.lock);  		if (!em)  			break;  		kfree(em->bdev); @@ -2540,9 +2540,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)  	struct extent_map_tree *em_tree = &map_tree->map_tree;  	int ret; -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, len); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	BUG_ON(!em);  	BUG_ON(em->start > logical || em->start + em->len < logical); @@ -2604,9 +2604,9 @@ again:  		atomic_set(&multi->error, 0);  	} -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, *length); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	if (!em && unplug_page)  		return 0; @@ -2763,9 +2763,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  	u64 stripe_nr;  	int i, j, nr = 0; -	spin_lock(&em_tree->lock); +	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, chunk_start, 1); -	spin_unlock(&em_tree->lock); +	read_unlock(&em_tree->lock);  	BUG_ON(!em || em->start != chunk_start);  	map = (struct map_lookup *)em->bdev; @@ -3053,9 +3053,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	logical = key->offset;  	length = btrfs_chunk_length(leaf, chunk); -	spin_lock(&map_tree->map_tree.lock); +	read_lock(&map_tree->map_tree.lock);  	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); -	
spin_unlock(&map_tree->map_tree.lock); +	read_unlock(&map_tree->map_tree.lock);  	/* already mapped? */  	if (em && em->start <= logical && em->start + em->len > logical) { @@ -3114,9 +3114,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  		map->stripes[i].dev->in_fs_metadata = 1;  	} -	spin_lock(&map_tree->map_tree.lock); +	write_lock(&map_tree->map_tree.lock);  	ret = add_extent_mapping(&map_tree->map_tree, em); -	spin_unlock(&map_tree->map_tree.lock); +	write_unlock(&map_tree->map_tree.lock);  	BUG_ON(ret);  	free_extent_map(em);  |
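
For illustration only, a minimal userspace model of the per-ordered-extent byte accounting that the fs/btrfs/ordered-data.[ch] hunks above introduce in place of the old EXTENT_ORDERED bit: each write completion subtracts its size from bytes_left, and only the completion that drives the counter to zero runs the ordered finish path. This sketch is not part of the patch; every identifier in it is invented, and the real logic lives in btrfs_add_ordered_extent() and btrfs_dec_test_ordered_pending() under the ordered tree mutex.

/*
 * Standalone model of the bytes_left accounting added by this diff.
 * Names are illustrative; the kernel code uses test_and_set_bit() on
 * entry->flags and the ordered inode tree mutex instead of this struct.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ordered_extent_model {
	uint64_t file_offset;	/* logical start of the ordered range */
	uint64_t len;		/* total length of the ordered range */
	uint64_t bytes_left;	/* bytes of IO still outstanding */
	bool io_done;		/* models BTRFS_ORDERED_IO_DONE */
};

static void ordered_init(struct ordered_extent_model *e,
			 uint64_t file_offset, uint64_t len)
{
	e->file_offset = file_offset;
	e->len = len;
	e->bytes_left = len;	/* mirrors "entry->bytes_left = len" */
	e->io_done = false;
}

/*
 * Returns true when this completion finished the whole ordered extent,
 * i.e. the caller should run the equivalent of btrfs_finish_ordered_io().
 */
static bool ordered_dec_test_pending(struct ordered_extent_model *e,
				     uint64_t io_size)
{
	if (io_size > e->bytes_left) {
		fprintf(stderr, "bad ordered accounting left %llu size %llu\n",
			(unsigned long long)e->bytes_left,
			(unsigned long long)io_size);
		io_size = e->bytes_left;	/* sketch-only clamp; the kernel only warns */
	}
	e->bytes_left -= io_size;
	if (e->bytes_left != 0 || e->io_done)
		return false;
	e->io_done = true;	/* models test_and_set_bit(BTRFS_ORDERED_IO_DONE) */
	return true;
}

int main(void)
{
	struct ordered_extent_model e;

	ordered_init(&e, 0, 16384);
	/* four 4KiB page completions; only the last one reports "done" */
	assert(!ordered_dec_test_pending(&e, 4096));
	assert(!ordered_dec_test_pending(&e, 4096));
	assert(!ordered_dec_test_pending(&e, 4096));
	assert(ordered_dec_test_pending(&e, 4096));
	printf("ordered extent [%llu, %llu) complete\n",
	       (unsigned long long)e.file_offset,
	       (unsigned long long)(e.file_offset + e.len));
	return 0;
}

As in btrfs_dec_test_ordered_pending(), whichever completion drives bytes_left to zero owns the ordered-IO finish work; the page path in the inode.c hunks applies the same ownership rule, where TestClearPagePrivate2() selects the single caller of btrfs_finish_ordered_io().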