 fs/btrfs/btrfs_inode.h      |   16
 fs/btrfs/ctree.c            |  457
 fs/btrfs/ctree.h            |   14
 fs/btrfs/delayed-inode.c    |    2
 fs/btrfs/dir-item.c         |    9
 fs/btrfs/disk-io.c          |  116
 fs/btrfs/disk-io.h          |   10
 fs/btrfs/extent-tree.c      |  285
 fs/btrfs/extent_io.c        |  168
 fs/btrfs/extent_io.h        |   35
 fs/btrfs/file-item.c        |   41
 fs/btrfs/file.c             |   11
 fs/btrfs/free-space-cache.c |  173
 fs/btrfs/inode.c            |   90
 fs/btrfs/ioctl.c            |    8
 fs/btrfs/locking.c          |  274
 fs/btrfs/locking.h          |   36
 fs/btrfs/relocation.c       |    3
 fs/btrfs/struct-funcs.c     |  100
 fs/btrfs/transaction.c      |   47
 fs/btrfs/tree-log.c         |    6
 fs/btrfs/volumes.c          |    2
 fs/btrfs/xattr.c            |   66
 23 files changed, 962 insertions(+), 1007 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca8c7b..502b9e98867 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
+	/* Lock for counters */
+	spinlock_t lock;
+
 	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
 
@@ -134,8 +137,8 @@ struct btrfs_inode {
 	 * items we think we'll end up using, and reserved_extents is the number
 	 * of extent items we've reserved metadata for.
 	 */
-	atomic_t outstanding_extents;
-	atomic_t reserved_extents;
+	unsigned outstanding_extents;
+	unsigned reserved_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -184,4 +187,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
+static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
+				       struct inode *inode)
+{
+	if (root == root->fs_info->tree_root ||
+	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+		return true;
+	return false;
+}
+
 #endif
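The switch from two atomic_t counters to plain unsigned fields guarded by the new per-inode spinlock is about atomicity of the pair: outstanding_extents and reserved_extents are always compared and adjusted together, which two independent atomics cannot do without cmpxchg retry loops (see the loop removed from btrfs_delalloc_release_metadata further down). A minimal userspace sketch of the idea, using pthreads; the names are illustrative, not the kernel API:

```c
#include <pthread.h>

/* Both counters are read and written as a pair, so one lock around two
 * plain integers replaces two atomics plus a cmpxchg retry loop. */
struct inode_counters {
	pthread_spinlock_t lock;
	unsigned outstanding;	/* extent items we expect to insert */
	unsigned reserved;	/* extent items we reserved metadata for */
};

/* Drop one outstanding extent and report how many reservations can be
 * returned; mirrors the drop_outstanding_extent() helper added below. */
static unsigned drop_outstanding(struct inode_counters *c)
{
	unsigned dropped = 0;

	pthread_spin_lock(&c->lock);
	c->outstanding--;
	if (c->outstanding < c->reserved) {
		dropped = c->reserved - c->outstanding;
		c->reserved -= dropped;
	}
	pthread_spin_unlock(&c->lock);
	return dropped;
}
```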
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e667868e0d..011cab3aca8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 {
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_set_lock_blocking(p->nodes[i]);
+		if (!p->nodes[i] || !p->locks[i])
+			continue;
+		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+		if (p->locks[i] == BTRFS_READ_LOCK)
+			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
 	}
 }
 
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  * for held
  */
 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held)
+					struct extent_buffer *held, int held_rw)
 {
 	int i;
 
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 	 * really sure by forcing the path to blocking before we clear
 	 * the path blocking.
 	 */
-	if (held)
-		btrfs_set_lock_blocking(held);
+	if (held) {
+		btrfs_set_lock_blocking_rw(held, held_rw);
+		if (held_rw == BTRFS_WRITE_LOCK)
+			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+		else if (held_rw == BTRFS_READ_LOCK)
+			held_rw = BTRFS_READ_LOCK_BLOCKING;
+	}
 	btrfs_set_path_blocking(p);
 #endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_clear_lock_blocking(p->nodes[i]);
+		if (p->nodes[i] && p->locks[i]) {
+			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_WRITE_LOCK;
+			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_READ_LOCK;
+		}
 	}
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (held)
-		btrfs_clear_lock_blocking(held);
+		btrfs_clear_lock_blocking_rw(held, held_rw);
 #endif
 }
 
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
 		if (!p->nodes[i])
 			continue;
 		if (p->locks[i]) {
-			btrfs_tree_unlock(p->nodes[i]);
+			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 			p->locks[i] = 0;
 		}
 		free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_read_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 /* cowonly root (everything not a reference counted cow subvolume), just get
  * put onto a simple dirty list.  transaction.c walks this to make sure they
  * get properly updated on disk.
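btrfs_read_lock_root_node mirrors the existing btrfs_lock_root_node: the root block can be replaced by a concurrent COW between fetching root->node and acquiring its lock, so the helper locks, re-checks, and retries. A userspace model of the pattern; grab_node/put_node are hypothetical refcount helpers standing in for extent_buffer_get/free_extent_buffer:

```c
#include <pthread.h>

struct node {
	pthread_rwlock_t lock;
	int refs;
};

struct tree {
	struct node *root;	/* may be swapped under us by COW */
};

static struct node *grab_node(struct node *n)
{
	__atomic_add_fetch(&n->refs, 1, __ATOMIC_RELAXED);
	return n;
}

static void put_node(struct node *n)
{
	__atomic_sub_fetch(&n->refs, 1, __ATOMIC_RELAXED);
}

/* Lock whatever is the root right now: take a reference, lock it, and
 * only accept the node if it is still the root once the lock is held. */
static struct node *read_lock_root(struct tree *t)
{
	struct node *n;

	for (;;) {
		n = grab_node(__atomic_load_n(&t->root, __ATOMIC_ACQUIRE));
		pthread_rwlock_rdlock(&n->lock);
		if (n == __atomic_load_n(&t->root, __ATOMIC_ACQUIRE))
			return n;	/* still the root, we win */
		pthread_rwlock_unlock(&n->lock);
		put_node(n);		/* lost a race with a root swap */
	}
}
```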
@@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
-		if (!parent->map_token) {
-			map_extent_buffer(parent,
-					btrfs_node_key_ptr_offset(i),
-					sizeof(struct btrfs_key_ptr),
-					&parent->map_token, &parent->kaddr,
-					&parent->map_start, &parent->map_len,
-					KM_USER1);
-		}
 		btrfs_node_key(parent, &disk_key, i);
 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
 			continue;
@@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			last_block = blocknr;
 			continue;
 		}
-		if (parent->map_token) {
-			unmap_extent_buffer(parent, parent->map_token,
-					    KM_USER1);
-			parent->map_token = NULL;
-		}
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
@@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(cur);
 		free_extent_buffer(cur);
 	}
-	if (parent->map_token) {
-		unmap_extent_buffer(parent, parent->map_token,
-				    KM_USER1);
-		parent->map_token = NULL;
-	}
 	return err;
 }
 
@@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	struct btrfs_disk_key *tmp = NULL;
 	struct btrfs_disk_key unaligned;
 	unsigned long offset;
-	char *map_token = NULL;
 	char *kaddr = NULL;
 	unsigned long map_start = 0;
 	unsigned long map_len = 0;
@@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
 
-		if (!map_token || offset < map_start ||
+		if (!kaddr || offset < map_start ||
 		    (offset + sizeof(struct btrfs_disk_key)) >
 		    map_start + map_len) {
-			if (map_token) {
-				unmap_extent_buffer(eb, map_token, KM_USER0);
-				map_token = NULL;
-			}
 
 			err = map_private_extent_buffer(eb, offset,
 						sizeof(struct btrfs_disk_key),
-						&map_token, &kaddr,
-						&map_start, &map_len, KM_USER0);
+						&kaddr, &map_start, &map_len);
 
 			if (!err) {
 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 			high = mid;
 		else {
 			*slot = mid;
-			if (map_token)
-				unmap_extent_buffer(eb, map_token, KM_USER0);
 			return 0;
 		}
 	}
 	*slot = low;
-	if (map_token)
-		unmap_extent_buffer(eb, map_token, KM_USER0);
 	return 1;
 }
 
@@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	mid = path->nodes[level];
 
-	WARN_ON(!path->locks[level]);
+	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
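A large share of this file's churn is the removal of the map_token/KM_USERn bookkeeping: callers of map_private_extent_buffer no longer carry a token to unmap, they just remember which window of the buffer the returned kaddr covers and re-map when an offset falls outside it, as generic_bin_search above now does. A self-contained model of that caching pattern; window_of is a stand-in for map_private_extent_buffer and the sizes are illustrative:

```c
#define WINDOW 4096UL

static char ebuf[8 * WINDOW];	/* stands in for the extent buffer pages */

static void window_of(unsigned long offset, char **kaddr,
		      unsigned long *map_start, unsigned long *map_len)
{
	*map_start = offset & ~(WINDOW - 1);	/* page-sized window */
	*map_len = WINDOW;
	*kaddr = ebuf + *map_start;
}

/* Re-map only when the requested range leaves the cached window; note
 * there is no unmap step left, which is the point of the change. */
static char read_byte(unsigned long offset)
{
	static char *kaddr;
	static unsigned long map_start, map_len;

	if (!kaddr || offset < map_start || offset + 1 > map_start + map_len)
		window_of(offset, &kaddr, &map_start, &map_len);
	return kaddr[offset - map_start];
}
```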
@@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root,
 	u32 nr;
 	u32 blocksize;
 	u32 nscan = 0;
-	bool map = true;
 
 	if (level != 1)
 		return;
@@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
-	if (node->map_token || path->skip_locking)
-		map = false;
 
 	while (1) {
-		if (map && !node->map_token) {
-			unsigned long offset = btrfs_node_key_ptr_offset(nr);
-			map_private_extent_buffer(node, offset,
-						  sizeof(struct btrfs_key_ptr),
-						  &node->map_token,
-						  &node->kaddr,
-						  &node->map_start,
-						  &node->map_len, KM_USER1);
-		}
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root,
 		if ((search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			gen = btrfs_node_ptr_generation(node, nr);
-			if (map && node->map_token) {
-				unmap_extent_buffer(node, node->map_token,
-						    KM_USER1);
-				node->map_token = NULL;
-			}
 			readahead_tree_block(root, search, blocksize, gen);
 			nread += blocksize;
 		}
@@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root,
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
-	if (map && node->map_token) {
-		unmap_extent_buffer(node, node->map_token, KM_USER1);
-		node->map_token = NULL;
-	}
 }
 
 /*
@@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-			btrfs_tree_unlock(t);
+			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
 		}
 	}
@@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 			continue;
 		if (!path->locks[i])
 			continue;
-		btrfs_tree_unlock(path->nodes[i]);
+		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
 		path->locks[i] = 0;
 	}
 }
@@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 			 * we can trust our generation number
 			 */
 			free_extent_buffer(tmp);
+			btrfs_set_path_blocking(p);
+
 			tmp = read_tree_block(root, blocknr, blocksize, gen);
 			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
 				*eb_ret = tmp;
@@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 static int
 setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct btrfs_path *p,
-		       struct extent_buffer *b, int level, int ins_len)
+		       struct extent_buffer *b, int level, int ins_len,
+		       int *write_lock_level)
 {
 	int ret;
 
 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
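The new write_lock_level parameter threads one rule through the whole search: the moment a node turns out to need modification, the safe move is to drop the entire path and redo the search with write locks from that level down, because upgrading a held read lock in place can deadlock against other readers. A compact, compilable model of that retry loop, simplified to one rwlock per level where the real code locks individual tree blocks:

```c
#include <pthread.h>

#define MAX_LEVEL 8

static pthread_rwlock_t level_lock[MAX_LEVEL];

static void lock_level(int level, int write_lock_level)
{
	if (level <= write_lock_level)
		pthread_rwlock_wrlock(&level_lock[level]);
	else
		pthread_rwlock_rdlock(&level_lock[level]);
}

/* Descend read-locked; on discovering a needed modification at
 * modify_level, release everything and retry with write locks held
 * low in the tree (including the parent of the modified level). */
static void search(int height, int modify_level)
{
	int write_lock_level = -1;	/* pure read search to start with */
	int level;

again:
	for (level = height - 1; level >= 0; level--) {
		lock_level(level, write_lock_level);
		if (level <= modify_level && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			while (level < height)	/* drop the partial path */
				pthread_rwlock_unlock(&level_lock[level++]);
			goto again;
		}
	}
	/* ... modify the leaf, then unwind ... */
	for (level = 0; level < height; level++)
		pthread_rwlock_unlock(&level_lock[level]);
}
```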
@@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int err;
 	int level;
 	int lowest_unlock = 1;
+	int root_lock;
+	/* everything at write_lock_level or lower must be write locked */
+	int write_lock_level = 0;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 
-	if (ins_len < 0)
+	if (ins_len < 0) {
 		lowest_unlock = 2;
 
+		/* when we are removing items, we might have to go up to level
+		 * two as we update tree pointers.  Make sure we keep write
+		 * for those levels as well
+		 */
+		write_lock_level = 2;
+	} else if (ins_len > 0) {
+		/*
+		 * for inserting items, make sure we have a write lock on
+		 * level 1 so we can update keys
+		 */
+		write_lock_level = 1;
+	}
+
+	if (!cow)
+		write_lock_level = -1;
+
+	if (cow && (p->keep_locks || p->lowest_level))
+		write_lock_level = BTRFS_MAX_LEVEL;
+
 again:
+	/*
+	 * we try very hard to do read locks on the root
+	 */
+	root_lock = BTRFS_READ_LOCK;
+	level = 0;
 	if (p->search_commit_root) {
+		/*
+		 * the commit roots are read only
+		 * so we always do read locks
+		 */
 		b = root->commit_root;
 		extent_buffer_get(b);
+		level = btrfs_header_level(b);
 		if (!p->skip_locking)
-			btrfs_tree_lock(b);
+			btrfs_tree_read_lock(b);
 	} else {
-		if (p->skip_locking)
+		if (p->skip_locking) {
 			b = btrfs_root_node(root);
-		else
-			b = btrfs_lock_root_node(root);
+			level = btrfs_header_level(b);
+		} else {
+			/* we don't know the level of the root node
+			 * until we actually have it read locked
+			 */
+			b = btrfs_read_lock_root_node(root);
+			level = btrfs_header_level(b);
+			if (level <= write_lock_level) {
+				/* whoops, must trade for write lock */
+				btrfs_tree_read_unlock(b);
+				free_extent_buffer(b);
+				b = btrfs_lock_root_node(root);
+				root_lock = BTRFS_WRITE_LOCK;
+
+				/* the level might have changed, check again */
+				level = btrfs_header_level(b);
+			}
+		}
 	}
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1644,10 +1696,6 @@ again:
 		 * setup the path here so we can release it under lock
 		 * contention with the cow code
 		 */
-		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
 		if (cow) {
 			/*
 			 * if we don't really need to cow this block
@@ -1659,6 +1707,16 @@ again:
 
 			btrfs_set_path_blocking(p);
 
+			/*
+			 * must have write locks on this node and the
+			 * parent
+			 */
+			if (level + 1 > write_lock_level) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
@@ -1671,10 +1729,7 @@ cow_done:
 		BUG_ON(!cow && ins_len);
 
 		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1700,7 +1755,7 @@ cow_done:
 			}
 			p->slots[level] = slot;
 			err = setup_nodes_for_search(trans, root, p, b, level,
-						     ins_len);
+					     ins_len, &write_lock_level);
 			if (err == -EAGAIN)
 				goto again;
 			if (err) {
@@ -1710,6 +1765,19 @@ cow_done:
 			b = p->nodes[level];
 			slot = p->slots[level];
 
+			/*
+			 * slot 0 is special, if we change the key
+			 * we have to update the parent pointer
+			 * which means we must have a write lock
+			 * on the parent
+			 */
+			if (slot == 0 && cow &&
+			    write_lock_level < level + 1) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			unlock_up(p, level, lowest_unlock);
 
 			if (level == lowest_level) {
@@ -1728,23 +1796,42 @@ cow_done:
 			}
 
 			if (!p->skip_locking) {
-				btrfs_clear_path_blocking(p, NULL);
-				err = btrfs_try_spin_lock(b);
-
-				if (!err) {
-					btrfs_set_path_blocking(p);
-					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p, b);
+				level = btrfs_header_level(b);
+				if (level <= write_lock_level) {
+					err = btrfs_try_tree_write_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_WRITE_LOCK);
+					}
+					p->locks[level] = BTRFS_WRITE_LOCK;
+				} else {
+					err = btrfs_try_tree_read_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_read_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_READ_LOCK);
+					}
+					p->locks[level] = BTRFS_READ_LOCK;
 				}
+				p->nodes[level] = b;
 			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
+				if (write_lock_level < 1) {
+					write_lock_level = 1;
+					btrfs_release_path(p);
+					goto again;
+				}
+
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL);
+				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
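The descent's fast path is unchanged in spirit but now exists in read and write flavours: try the lock without blocking, and only on contention convert the already-held path locks to their blocking form, sleep on the contended lock, then convert back. Note that the btrfs try-lock helpers return nonzero on success, so `if (!err)` above is the contention path. A stripped-down illustration; set_path_blocking/clear_path_blocking are no-op stand-ins for the btrfs helpers:

```c
#include <pthread.h>

struct path;	/* opaque; the held locks live here in the real code */

static void set_path_blocking(struct path *p)   { (void)p; /* would convert held locks */ }
static void clear_path_blocking(struct path *p) { (void)p; /* back to spinning mode */ }

/* Take a read lock on a child block without sleeping while the rest of
 * the path is still in its fast, spinning-lock state. */
static void lock_child(struct path *p, pthread_rwlock_t *child)
{
	if (pthread_rwlock_tryrdlock(child) != 0) {	/* contended */
		set_path_blocking(p);	/* let waiters on our locks stop spinning */
		pthread_rwlock_rdlock(child);	/* now sleeping is safe */
		clear_path_blocking(p);
	}
}
```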
@@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK;
 	path->slots[level] = 0;
 	return 0;
 }
@@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		if (!left->map_token) {
-			map_extent_buffer(left, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&left->map_token, &left->kaddr,
-					&left->map_start, &left->map_len,
-					KM_USER1);
-		}
-
 		this_item_size = btrfs_item_size(left, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
@@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 			break;
 		i--;
 	}
-	if (left->map_token) {
-		unmap_extent_buffer(left, left->map_token, KM_USER1);
-		left->map_token = NULL;
-	}
 
 	if (push_items == 0)
 		goto out_unlock;
@@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
 		push_space -= btrfs_item_size(right, item);
 		btrfs_set_item_offset(right, item, push_space);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(left, left_nritems);
 
@@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] < i)
@@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		push_space += this_item_size + sizeof(*item);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
-
 	if (push_items == 0) {
 		ret = 1;
 		goto out;
@@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		u32 ioff;
 
 		item = btrfs_item_nr(left, i);
-		if (!left->map_token) {
-			map_extent_buffer(left, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&left->map_token, &left->kaddr,
-					&left->map_start, &left->map_len,
-					KM_USER1);
-		}
 
 		ioff = btrfs_item_offset(left, item);
 		btrfs_set_item_offset(left, item,
 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
-	if (left->map_token) {
-		unmap_extent_buffer(left, left->map_token, KM_USER1);
-		left->map_token = NULL;
-	}
 
 	/* fixup right node */
 	if (push_items > right_nritems) {
@@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
 		push_space = push_space - btrfs_item_size(right, item);
 		btrfs_set_item_offset(right, item, push_space);
 	}
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
 
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
@@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 		struct btrfs_item *item = btrfs_item_nr(right, i);
 		u32 ioff;
 
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
 		ioff = btrfs_item_offset(right, item);
 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
-
 	btrfs_set_header_nritems(l, mid);
 	ret = 0;
 	btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 		u32 ioff;
 
 		item = btrfs_item_nr(leaf, i);
-		if (!leaf->map_token) {
-			map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-		}
-
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff + size_diff);
 	}
 
-	if (leaf->map_token) {
-		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-		leaf->map_token = NULL;
-	}
-
 	/* shift the data */
 	if (from_end) {
 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 		u32 ioff;
 
 		item = btrfs_item_nr(leaf, i);
-		if (!leaf->map_token) {
-			map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-		}
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff - data_size);
 	}
 
-	if (leaf->map_token) {
-		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-		leaf->map_token = NULL;
-	}
-
 	/* shift the data */
 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
-		WARN_ON(leaf->map_token);
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-			}
-
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff - total_data);
 		}
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
 			      btrfs_item_nr_offset(slot),
@@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
-		WARN_ON(leaf->map_token);
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-			}
-
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff - total_data);
 		}
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
 			      btrfs_item_nr_offset(slot),
@@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-			}
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff + dsize);
 		}
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
 			      btrfs_item_nr_offset(slot + nr),
 			      sizeof(struct btrfs_item) *
@@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	WARN_ON(!path->keep_locks);
 again:
-	cur = btrfs_lock_root_node(root);
+	cur = btrfs_read_lock_root_node(root);
 	level = btrfs_header_level(cur);
 	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_READ_LOCK;
 
 	if (btrfs_header_generation(cur) < min_trans) {
 		ret = 1;
@@ -4098,12 +4049,12 @@ find_next_key:
 		cur = read_node_slot(root, cur, slot);
 		BUG_ON(!cur);
 
-		btrfs_tree_lock(cur);
+		btrfs_tree_read_lock(cur);
 
-		path->locks[level - 1] = 1;
+		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path, NULL);
+		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
 	if (ret == 0)
@@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	u32 nritems;
 	int ret;
 	int old_spinning = path->leave_spinning;
-	int force_blocking = 0;
+	int next_rw_lock = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
 		return 1;
 
-	/*
-	 * we take the blocks in an order that upsets lockdep.  Using
-	 * blocking mode is the only way around it.
-	 */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	force_blocking = 1;
-#endif
-
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
 	level = 1;
 	next = NULL;
+	next_rw_lock = 0;
 	btrfs_release_path(path);
 
 	path->keep_locks = 1;
-
-	if (!force_blocking)
-		path->leave_spinning = 1;
+	path->leave_spinning = 1;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
@@ -4281,11 +4223,12 @@ again:
 		}
 
 		if (next) {
-			btrfs_tree_unlock(next);
+			btrfs_tree_unlock_rw(next, next_rw_lock);
 			free_extent_buffer(next);
 		}
 
 		next = c;
+		next_rw_lock = path->locks[level];
 		ret = read_block_for_search(NULL, root, path, &next, level,
 					    slot, &key);
 		if (ret == -EAGAIN)
@@ -4297,15 +4240,14 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
+			next_rw_lock = BTRFS_READ_LOCK;
 		}
 		break;
 	}
@@ -4314,14 +4256,13 @@ again:
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
-			btrfs_tree_unlock(c);
+			btrfs_tree_unlock_rw(c, path->locks[level]);
 
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
-			path->locks[level] = 1;
-
+			path->locks[level] = next_rw_lock;
 		if (!level)
 			break;
 
@@ -4336,16 +4277,14 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(path->nodes[level]);
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
+			next_rw_lock = BTRFS_READ_LOCK;
 		}
 	}
 	ret = 0;
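btrfs_next_leaf used to force blocking locks whenever lockdep was enabled, because it acquires blocks in an order lockdep could not model; with the per-root, per-level lock classes added in disk-io.c below, that workaround goes away, and the function instead records in next_rw_lock which flavour of lock it holds on `next` so the eventual unlock can match. The unlock side, btrfs_tree_unlock_rw, lives in locking.c (not shown in this section) and is essentially a dispatch on that recorded value; a sketch of the shape with stand-in names:

```c
enum {
	LOCK_NONE = 0,
	READ_LOCK,
	WRITE_LOCK,
	READ_LOCK_BLOCKING,
	WRITE_LOCK_BLOCKING,
};

struct eb;			/* opaque tree block */

void read_unlock(struct eb *b);			/* spinning readers */
void write_unlock(struct eb *b);		/* spinning writer */
void read_unlock_blocking(struct eb *b);	/* sleeping readers */
void write_unlock_blocking(struct eb *b);	/* sleeping writer */

/* path->locks[level] remembers what was taken, so generic unwind code
 * like unlock_up() no longer has to assume a write lock. */
static void tree_unlock_rw(struct eb *b, int rw)
{
	switch (rw) {
	case READ_LOCK:            read_unlock(b); break;
	case WRITE_LOCK:           write_unlock(b); break;
	case READ_LOCK_BLOCKING:   read_unlock_blocking(b); break;
	case WRITE_LOCK_BLOCKING:  write_unlock_blocking(b); break;
	}
}
```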
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b859a3e6a0..3be57c61104 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -755,6 +755,8 @@ struct btrfs_space_info {
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
 
+	unsigned int flush:1;		/* set if we are trying to make space */
+
 	unsigned int force_alloc;	/* set if we need to force a chunk
 					   alloc for this space */
 
@@ -764,7 +766,7 @@ struct btrfs_space_info {
 	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
-	atomic_t caching_threads;
+	wait_queue_head_t wait;
 };
 
 struct btrfs_block_rsv {
@@ -824,6 +826,7 @@ struct btrfs_caching_control {
 	struct list_head list;
 	struct mutex mutex;
 	wait_queue_head_t wait;
+	struct btrfs_work work;
 	struct btrfs_block_group_cache *block_group;
 	u64 progress;
 	atomic_t count;
@@ -1032,6 +1035,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
+	struct btrfs_workers caching_workers;
+
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
@@ -2128,7 +2133,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
-						 int num_items)
+						 unsigned num_items)
 {
 	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
 		3 * num_items;
@@ -2222,9 +2227,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2330,7 +2332,7 @@ struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held);
+			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9..b52c672f4c1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
 	}
 
 	/* reset all the locked nodes in the patch to spinning locks. */
-	btrfs_clear_path_blocking(path, NULL);
+	btrfs_clear_path_blocking(path, NULL, 0);
 
 	/* insert the keys of the items */
 	ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f2593c4f..c360a848d97 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	data_size = sizeof(*dir_item) + name_len + data_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
-	/*
-	 * FIXME: at some point we should handle xattr's that are larger than
-	 * what we can fit in our leaf.  We set location to NULL b/c we arent
-	 * pointing at anything else, that will change if we store the xattr
-	 * data in a separate inode.
-	 */
-	BUG_ON(IS_ERR(dir_item));
+	if (IS_ERR(dir_item))
+		return PTR_ERR(dir_item);
 
 	memset(&location, 0, sizeof(location));
 	leaf = path->nodes[0];
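The btrfs_calc_trans_metadata_size change in the ctree.h hunk above only widens num_items to unsigned, but the sizing rule it implements is worth a worked number: each item is billed for a full worst-case path, one leaf plus a node per remaining level, and then tripled, roughly to cover COW plus splits along the way (the factor of three is my reading, not spelled out in the source). A standalone calculation with illustrative 4K block sizes:

```c
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

static unsigned long long calc_trans_metadata_size(unsigned long long leafsize,
						   unsigned long long nodesize,
						   unsigned num_items)
{
	/* one leaf + (BTRFS_MAX_LEVEL - 1) nodes per item, tripled */
	return (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items;
}

int main(void)
{
	/* (4096 + 4096 * 7) * 3 = 98304 bytes, i.e. 96 KiB per item */
	printf("%llu\n", calc_trans_metadata_size(4096, 4096, 1));
	return 0;
}
```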
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ac8db5dc0a..94ecac33cf2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
-/* These are used to set the lockdep class on the extent buffer locks.
- * The class is set by the readpage_end_io_hook after the buffer has
- * passed csum validation but before the pages are unlocked.
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root.  For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets.  As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid.  This ensures that all special purpose roots
+ * have separate keysets.
  *
- * The lockdep class is also set by btrfs_init_new_buffer on freshly
- * allocated blocks.
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock.  As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
  *
- * The class is based on the level in the tree block, which allows lockdep
- * to know that lower nodes nest inside the locks of higher nodes.
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked.  It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
  *
- * We also add a check to make sure the highest level of the tree is
- * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
- * code needs update as well.
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # if BTRFS_MAX_LEVEL != 8
 #  error
 # endif
-static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
-static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
-	/* leaf */
-	"btrfs-extent-00",
-	"btrfs-extent-01",
-	"btrfs-extent-02",
-	"btrfs-extent-03",
-	"btrfs-extent-04",
-	"btrfs-extent-05",
-	"btrfs-extent-06",
-	"btrfs-extent-07",
-	/* highest possible level */
-	"btrfs-extent-08",
+
+static struct btrfs_lockdep_keyset {
+	u64			id;		/* root objectid */
+	const char		*name_stem;	/* lock name stem */
+	char			names[BTRFS_MAX_LEVEL + 1][20];
+	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
+	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
+	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
+	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
+	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
+	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
+	{ .id = BTRFS_ORPHAN_OBJECTID,		.name_stem = "orphan"	},
+	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
+	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
+	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
+	{ .id = 0,				.name_stem = "tree"	},
 };
+
+void __init btrfs_init_lockdep(void)
+{
+	int i, j;
+
+	/* initialize lockdep class names */
+	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+			snprintf(ks->names[j], sizeof(ks->names[j]),
+				 "btrfs-%s-%02d", ks->name_stem, j);
+	}
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+				    int level)
+{
+	struct btrfs_lockdep_keyset *ks;
+
+	BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+	/* find the matching keyset, id 0 is the default entry */
+	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+		if (ks->id == objectid)
+			break;
+
+	lockdep_set_class_and_name(&eb->lock,
+				   &ks->keys[level], ks->names[level]);
+}
+
 #endif
 
 /*
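The generated class names follow the "btrfs-%s-%02d" pattern, one per (root purpose, level) pair, replacing the single hard-coded btrfs-extent-NN table. A quick standalone demonstration of the names btrfs_init_lockdep produces (stem list abbreviated):

```c
#include <stdio.h>

int main(void)
{
	const char *stems[] = { "root", "extent", "chunk", "fs", "log", "tree" };
	char name[20];

	for (unsigned i = 0; i < sizeof(stems) / sizeof(stems[0]); i++)
		for (int level = 0; level < 9; level++) {	/* BTRFS_MAX_LEVEL + 1 */
			snprintf(name, sizeof(name), "btrfs-%s-%02d",
				 stems[i], level);
			printf("%s\n", name);	/* e.g. "btrfs-extent-00" */
		}
	return 0;
}
```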
@@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long len;
 	unsigned long cur_len;
 	unsigned long offset = BTRFS_CSUM_SIZE;
-	char *map_token = NULL;
 	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
@@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	len = buf->len - offset;
 	while (len > 0) {
 		err = map_private_extent_buffer(buf, offset, 32,
-					&map_token, &kaddr,
-					&map_start, &map_len, KM_USER0);
+					&kaddr, &map_start, &map_len);
 		if (err)
 			return 1;
 		cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 				      crc, cur_len);
 		len -= cur_len;
 		offset += cur_len;
-		unmap_extent_buffer(buf, map_token, KM_USER0);
 	}
 	if (csum_size > sizeof(inline_result)) {
 		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
-{
-	lockdep_set_class_and_name(&eb->lock,
-			   &btrfs_eb_class[level],
-			   btrfs_eb_name[level]);
-}
-#endif
-
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	found_level = btrfs_header_level(eb);
 
-	btrfs_set_buffer_lockdep_class(eb, found_level);
+	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+				       eb, found_level);
 
 	ret = csum_tree_block(root, eb, 1);
 	if (ret) {
@@ -1603,7 +1637,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_bdi;
 	}
 
-	fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1807,6 +1841,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   fs_info->thread_pool_size),
 			   &fs_info->generic_worker);
 
+	btrfs_init_workers(&fs_info->caching_workers, "cache",
+			   2, &fs_info->generic_worker);
+
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
 	 * devices
@@ -1860,6 +1897,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 	btrfs_start_workers(&fs_info->delayed_workers, 1);
+	btrfs_start_workers(&fs_info->caching_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2117,6 +2155,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 	btrfs_stop_workers(&fs_info->delayed_workers);
+	btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
 	kfree(fs_info->delayed_root);
fail_iput:
@@ -2584,6 +2623,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 	btrfs_stop_workers(&fs_info->delayed_workers);
+	btrfs_stop_workers(&fs_info->caching_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a67aa..bec3ea4bd67 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+void btrfs_init_lockdep(void);
+void btrfs_set_buffer_lockdep_class(u64 objectid,
+			            struct extent_buffer *eb, int level);
 #else
-static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
-						 int level)
+static inline void btrfs_init_lockdep(void)
+{ }
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+					struct extent_buffer *eb, int level)
 {
 }
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456fdb6..4d08ed79405 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return total_added;
 }
 
-static int caching_kthread(void *data)
+static noinline void caching_thread(struct btrfs_work *work)
 {
-	struct btrfs_block_group_cache *block_group = data;
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_root *extent_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
 	u32 nritems;
 	int ret = 0;
 
+	caching_ctl = container_of(work, struct btrfs_caching_control, work);
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+	extent_root = fs_info->extent_root;
+
 	path = btrfs_alloc_path();
 	if (!path)
-		return -ENOMEM;
+		goto out;
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
@@ -433,13 +438,11 @@ err:
 	free_excluded_extents(extent_root, block_group);
 	mutex_unlock(&caching_ctl->mutex);
+out:
 	wake_up(&caching_ctl->wait);
 
 	put_caching_control(caching_ctl);
-	atomic_dec(&block_group->space_info->caching_threads);
 	btrfs_put_block_group(block_group);
-
-	return 0;
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
-	struct task_struct *tsk;
 	int ret = 0;
 
 	smp_mb();
@@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->progress = cache->key.objectid;
 	/* one for caching kthread, one for caching block group list */
 	atomic_set(&caching_ctl->count, 2);
+	caching_ctl->work.func = caching_thread;
 
 	spin_lock(&cache->lock);
 	if (cache->cached != BTRFS_CACHE_NO) {
@@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 	up_write(&fs_info->extent_commit_sem);
 
-	atomic_inc(&cache->space_info->caching_threads);
 	btrfs_get_block_group(cache);
 
-	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
-			  cache->key.objectid);
-	if (IS_ERR(tsk)) {
-		ret = PTR_ERR(tsk);
-		printk(KERN_ERR "error running thread %d\n", ret);
-		BUG();
-	}
+	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
 
 	return ret;
 }
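The caching rework trades one kthread per uncached block group for a work item queued on the shared "cache" worker pool started in open_ctree above; the handler recovers its context from the embedded work struct with container_of. A self-contained model of that embedding, where the types are stand-ins for struct btrfs_work and struct btrfs_caching_control:

```c
#include <stddef.h>

struct work {
	void (*func)(struct work *work);
};

struct caching_control {
	int progress;
	struct work work;	/* embedded, queued on a shared worker pool */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* The handler only receives the work pointer; the surrounding context
 * is recovered from the embedding, exactly as caching_thread() does. */
static void caching_thread(struct work *work)
{
	struct caching_control *ctl =
		container_of(work, struct caching_control, work);

	ctl->progress++;	/* ... do the actual caching ... */
}

int main(void)
{
	struct caching_control ctl = { .progress = 0,
				       .work = { .func = caching_thread } };

	ctl.work.func(&ctl.work);	/* a worker pool would make this call */
	return ctl.progress == 1 ? 0 : 1;
}
```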
@@ -2932,9 +2928,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->full = 0;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
+	found->flush = 0;
+	init_waitqueue_head(&found->wait);
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
-	atomic_set(&found->caching_threads, 0);
 	return 0;
 }
 
@@ -3314,6 +3311,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	if (reserved == 0)
 		return 0;
 
+	smp_mb();
+	if (root->fs_info->delalloc_bytes == 0) {
+		if (trans)
+			return 0;
+		btrfs_wait_ordered_extents(root, 0, 0);
+		return 0;
+	}
+
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (loops < 1024) {
@@ -3356,6 +3361,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
+	if (reclaimed >= to_reclaim && !trans)
+		btrfs_wait_ordered_extents(root, 0, 0);
 	return reclaimed >= to_reclaim;
 }
 
@@ -3380,15 +3387,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
-	bool reserved = false;
 	bool committed = false;
+	bool flushing = false;
 
again:
-	ret = -ENOSPC;
-	if (reserved)
-		num_bytes = 0;
-
+	ret = 0;
 	spin_lock(&space_info->lock);
+	/*
+	 * We only want to wait if somebody other than us is flushing and we
+	 * are actually allowed to flush.
+	 */
+	while (flush && !flushing && space_info->flush) {
+		spin_unlock(&space_info->lock);
+		/*
+		 * If we have a trans handle we can't wait because the flusher
+		 * may have to commit the transaction, which would mean we would
+		 * deadlock since we are waiting for the flusher to finish, but
+		 * hold the current transaction open.
+		 */
+		if (trans)
+			return -EAGAIN;
+		ret = wait_event_interruptible(space_info->wait,
+					       !space_info->flush);
+		/* Must have been interrupted, return */
+		if (ret)
+			return -EINTR;
+
+		spin_lock(&space_info->lock);
+	}
+
+	ret = -ENOSPC;
 	unused = space_info->bytes_used + space_info->bytes_reserved +
 		 space_info->bytes_pinned + space_info->bytes_readonly +
 		 space_info->bytes_may_use;
@@ -3403,8 +3431,7 @@ again:
 	if (unused <= space_info->total_bytes) {
 		unused = space_info->total_bytes - unused;
 		if (unused >= num_bytes) {
-			if (!reserved)
-				space_info->bytes_reserved += orig_bytes;
+			space_info->bytes_reserved += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3429,17 +3456,14 @@ again:
 	 * to reclaim space we can actually use it instead of somebody else
 	 * stealing it from us.
	 */
-	if (ret && !reserved) {
-		space_info->bytes_reserved += orig_bytes;
-		reserved = true;
+	if (ret && flush) {
+		flushing = true;
+		space_info->flush = 1;
 	}
 
 	spin_unlock(&space_info->lock);
 
-	if (!ret)
-		return 0;
-
-	if (!flush)
+	if (!ret || !flush)
 		goto out;
 
 	/*
@@ -3447,11 +3471,11 @@ again:
 	 * metadata until after the IO is completed.
 	 */
 	ret = shrink_delalloc(trans, root, num_bytes, 1);
-	if (ret > 0)
-		return 0;
-	else if (ret < 0)
+	if (ret < 0)
 		goto out;
 
+	ret = 0;
+
 	/*
 	 * So if we were overcommitted it's possible that somebody else flushed
 	 * out enough space and we simply didn't have enough space to reclaim,
@@ -3462,11 +3486,11 @@ again:
 		goto again;
 	}
 
-	spin_lock(&space_info->lock);
 	/*
 	 * Not enough space to be reclaimed, don't bother committing the
 	 * transaction.
 	 */
+	spin_lock(&space_info->lock);
 	if (space_info->bytes_pinned < orig_bytes)
 		ret = -ENOSPC;
 	spin_unlock(&space_info->lock);
@@ -3474,10 +3498,13 @@ again:
 		goto out;
 
 	ret = -EAGAIN;
-	if (trans || committed)
+	if (trans)
 		goto out;
 
 	ret = -ENOSPC;
+	if (committed)
+		goto out;
+
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		goto out;
@@ -3489,12 +3516,12 @@ again:
 	}
 
 out:
-	if (reserved) {
+	if (flushing) {
 		spin_lock(&space_info->lock);
-		space_info->bytes_reserved -= orig_bytes;
+		space_info->flush = 0;
+		wake_up_all(&space_info->wait);
 		spin_unlock(&space_info->lock);
 	}
-
 	return ret;
 }
 
@@ -3704,7 +3731,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (commit_trans) {
 		if (trans)
 			return -EAGAIN;
-
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
@@ -3874,26 +3900,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 int num_items)
-{
-	u64 num_bytes;
-	int ret;
-
-	if (num_items == 0 || root->fs_info->chunk_root == root)
-		return 0;
-
-	num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-				  num_bytes);
-	if (!ret) {
-		trans->bytes_reserved += num_bytes;
-		trans->block_rsv = &root->fs_info->trans_block_rsv;
-	}
-	return ret;
-}
-
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
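The reserve_metadata_bytes rework above replaces the old trick of temporarily inflating bytes_reserved with an explicit single-flusher protocol: the first task that fails a reservation and is allowed to flush sets space_info->flush, and everyone else sleeps on the new wait queue until it clears, unless they hold a transaction, in which case waiting could deadlock and they get -EAGAIN instead. A userspace rendering of the protocol, with a condition variable standing in for the kernel wait queue:

```c
#include <pthread.h>
#include <stdbool.h>

struct space_info {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	bool flush;		/* someone is already reclaiming space */
};

static void reserve(struct space_info *s)
{
	pthread_mutex_lock(&s->lock);
	while (s->flush)			/* another task is flushing */
		pthread_cond_wait(&s->wait, &s->lock);

	/* ... try to reserve; on failure become the single flusher ... */
	s->flush = true;
	pthread_mutex_unlock(&s->lock);

	/* ... write back delalloc, maybe commit the transaction ... */

	pthread_mutex_lock(&s->lock);
	s->flush = false;
	pthread_cond_broadcast(&s->wait);	/* wake_up_all() equivalent */
	pthread_mutex_unlock(&s->lock);
}
```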
@@ -3944,6 +3950,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
+static unsigned drop_outstanding_extent(struct inode *inode)
+{
+	unsigned dropped_extents = 0;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
+	BTRFS_I(inode)->outstanding_extents--;
+
+	/*
+	 * If we have more or the same amount of outstanding extents than we have
+	 * reserved then we need to leave the reserved extents count alone.
+	 */
+	if (BTRFS_I(inode)->outstanding_extents >=
+	    BTRFS_I(inode)->reserved_extents)
+		goto out;
+
+	dropped_extents = BTRFS_I(inode)->reserved_extents -
+		BTRFS_I(inode)->outstanding_extents;
+	BTRFS_I(inode)->reserved_extents -= dropped_extents;
+out:
+	spin_unlock(&BTRFS_I(inode)->lock);
+	return dropped_extents;
+}
+
 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
 {
 	return num_bytes >>= 3;
 }
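calc_csum_metadata_size, left intact in the context above, sizes the checksum part of a delalloc reservation at one eighth of the byte count (the `>>= 3`). A worked example; this sketch drops the inode argument, which the real helper takes but does not yet use:

```c
#include <assert.h>

static unsigned long long calc_csum_metadata_size(unsigned long long num_bytes)
{
	return num_bytes >> 3;	/* one eighth of the data size */
}

int main(void)
{
	/* a 1 MiB delalloc range reserves 128 KiB for checksum items */
	assert(calc_csum_metadata_size(1024 * 1024) == 128 * 1024);
	return 0;
}
```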
@@ -3953,9 +3983,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
-	u64 to_reserve;
-	int nr_extents;
-	int reserved_extents;
+	u64 to_reserve = 0;
+	unsigned nr_extents = 0;
 	int ret;
 
 	if (btrfs_transaction_in_commit(root->fs_info))
@@ -3963,66 +3992,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 
-	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
-	reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
+	spin_lock(&BTRFS_I(inode)->lock);
+	BTRFS_I(inode)->outstanding_extents++;
+
+	if (BTRFS_I(inode)->outstanding_extents >
+	    BTRFS_I(inode)->reserved_extents) {
+		nr_extents = BTRFS_I(inode)->outstanding_extents -
+			BTRFS_I(inode)->reserved_extents;
+		BTRFS_I(inode)->reserved_extents += nr_extents;
 
-	if (nr_extents > reserved_extents) {
-		nr_extents -= reserved_extents;
 		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
-	} else {
-		nr_extents = 0;
-		to_reserve = 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->lock);
 
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
 	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
-	if (ret)
+	if (ret) {
+		unsigned dropped;
+		/*
+		 * We don't need the return value since our reservation failed,
+		 * we just need to clean up our counter.
+		 */
+		dropped = drop_outstanding_extent(inode);
+		WARN_ON(dropped > 1);
 		return ret;
-
-	atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
-	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+	}
 
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
-	if (block_rsv->size > 512 * 1024 * 1024)
-		shrink_delalloc(NULL, root, to_reserve, 0);
-
 	return 0;
 }
 
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 to_free;
-	int nr_extents;
-	int reserved_extents;
+	u64 to_free = 0;
+	unsigned dropped;
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 
-	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
-	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
-
-	reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
-	do {
-		int old, new;
-
-		nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
-		if (nr_extents >= reserved_extents) {
-			nr_extents = 0;
-			break;
-		}
-		old = reserved_extents;
-		nr_extents = reserved_extents - nr_extents;
-		new = reserved_extents - nr_extents;
-		old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
-				     reserved_extents, new);
-		if (likely(old == reserved_extents))
-			break;
-		reserved_extents = old;
-	} while (1);
+	dropped = drop_outstanding_extent(inode);
 
 	to_free = calc_csum_metadata_size(inode, num_bytes);
-	if (nr_extents > 0)
-		to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
+	if (dropped > 0)
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
@@ -4990,14 +5002,10 @@ have_block_group:
 			}
 
 			/*
-			 * We only want to start kthread caching if we are at
-			 * the point where we will wait for caching to make
-			 * progress, or if our ideal search is over and we've
-			 * found somebody to start caching.
+			 * The caching workers are limited to 2 threads, so we
+			 * can queue as much work as we care to.
 			 */
-			if (loop > LOOP_CACHING_NOWAIT ||
-			    (loop > LOOP_FIND_IDEAL &&
-			     atomic_read(&space_info->caching_threads) < 2)) {
+			if (loop > LOOP_FIND_IDEAL) {
 				ret = cache_block_group(block_group, trans,
 							orig_root, 0);
 				BUG_ON(ret);
@@ -5219,8 +5227,7 @@ loop:
 		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
 			found_uncached_bg = false;
 			loop++;
-			if (!ideal_cache_percent &&
-			    atomic_read(&space_info->caching_threads))
+			if (!ideal_cache_percent)
 				goto search;
 
 			/*
@@ -5623,7 +5630,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
 	btrfs_set_header_generation(buf, trans->transid);
-	btrfs_set_buffer_lockdep_class(buf, level);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
@@ -5910,7 +5917,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 			return 1;
 
 		if (path->locks[level] && !wc->keep_locks) {
-			btrfs_tree_unlock(eb);
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
 			path->locks[level] = 0;
 		}
 		return 0;
@@ -5934,7 +5941,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 	 * keep the tree lock
 	 */
 	if (path->locks[level] && level > 0) {
-		btrfs_tree_unlock(eb);
+		btrfs_tree_unlock_rw(eb, path->locks[level]);
 		path->locks[level] = 0;
 	}
 	return 0;
@@ -6047,7 +6054,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	BUG_ON(level != btrfs_header_level(next));
 	path->nodes[level] = next;
 	path->slots[level] = 0;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 	wc->level = level;
 	if (wc->level == 1)
 		wc->reada_slot = 0;
@@ -6118,7 +6125,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			BUG_ON(level == 0);
 			btrfs_tree_lock(eb);
 			btrfs_set_lock_blocking(eb);
-			path->locks[level] = 1;
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, root,
 						       eb->start, eb->len,
@@ -6127,8 +6134,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			BUG_ON(wc->refs[level] == 0);
 			if (wc->refs[level] == 1) {
-				btrfs_tree_unlock(eb);
-				path->locks[level] = 0;
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
 				return 1;
 			}
 		}
@@ -6150,7 +6156,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		    btrfs_header_generation(eb) == trans->transid) {
 			btrfs_tree_lock(eb);
 			btrfs_set_lock_blocking(eb);
-			path->locks[level] = 1;
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		}
 		clean_tree_block(trans, root, eb);
 	}
@@ -6229,7 +6235,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				return 0;
 
 			if (path->locks[level]) {
-				btrfs_tree_unlock(path->nodes[level]);
+				btrfs_tree_unlock_rw(path->nodes[level],
+						     path->locks[level]);
 				path->locks[level] = 0;
 			}
 			free_extent_buffer(path->nodes[level]);
@@ -6281,7 +6288,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 		path->nodes[level] = btrfs_lock_root_node(root);
 		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
-		path->locks[level] = 1;
+		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		memset(&wc->update_progress, 0,
 		       sizeof(wc->update_progress));
 	} else {
btrfs_header_level(node);  	path->nodes[level] = node;  	path->slots[level] = 0; -	path->locks[level] = 1; +	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  	wc->refs[parent_level] = 1;  	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -6524,15 +6531,28 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  	return flags;  } -static int set_block_group_ro(struct btrfs_block_group_cache *cache) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)  {  	struct btrfs_space_info *sinfo = cache->space_info;  	u64 num_bytes; +	u64 min_allocable_bytes;  	int ret = -ENOSPC;  	if (cache->ro)  		return 0; +	/* +	 * We need some metadata space and system metadata space for +	 * allocating chunks in some corner cases until we force to set +	 * it to be readonly. +	 */ +	if ((sinfo->flags & +	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && +	    !force) +		min_allocable_bytes = 1 * 1024 * 1024; +	else +		min_allocable_bytes = 0; +  	spin_lock(&sinfo->lock);  	spin_lock(&cache->lock);  	num_bytes = cache->key.offset - cache->reserved - cache->pinned - @@ -6540,7 +6560,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)  	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +  	    sinfo->bytes_may_use + sinfo->bytes_readonly + -	    cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { +	    cache->reserved_pinned + num_bytes + min_allocable_bytes <= +	    sinfo->total_bytes) {  		sinfo->bytes_readonly += num_bytes;  		sinfo->bytes_reserved += cache->reserved_pinned;  		cache->reserved_pinned = 0; @@ -6571,7 +6592,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,  		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,  			       CHUNK_ALLOC_FORCE); -	ret = set_block_group_ro(cache); +	ret = set_block_group_ro(cache, 0);  	if (!ret)  		goto out;  	alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -6579,7 +6600,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,  			     CHUNK_ALLOC_FORCE);  	if (ret < 0)  		goto out; -	ret = set_block_group_ro(cache); +	ret = set_block_group_ro(cache, 0);  out:  	btrfs_end_transaction(trans, root);  	return ret; @@ -7016,7 +7037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		set_avail_alloc_bits(root->fs_info, cache->flags);  		if (btrfs_chunk_readonly(root, cache->key.objectid)) -			set_block_group_ro(cache); +			set_block_group_ro(cache, 1);  	}  	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -7030,9 +7051,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		 * mirrored block groups.  		 
*/  		list_for_each_entry(cache, &space_info->block_groups[3], list) -			set_block_group_ro(cache); +			set_block_group_ro(cache, 1);  		list_for_each_entry(cache, &space_info->block_groups[4], list) -			set_block_group_ro(cache); +			set_block_group_ro(cache, 1);  	}  	init_global_block_rsv(info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1ef..5bbdb243bb6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -281,11 +281,10 @@ static int merge_state(struct extent_io_tree *tree,  		if (other->start == state->end + 1 &&  		    other->state == state->state) {  			merge_cb(tree, state, other); -			other->start = state->start; -			state->tree = NULL; -			rb_erase(&state->rb_node, &tree->state); -			free_extent_state(state); -			state = NULL; +			state->end = other->end; +			other->tree = NULL; +			rb_erase(&other->rb_node, &tree->state); +			free_extent_state(other);  		}  	} @@ -351,7 +350,6 @@ static int insert_state(struct extent_io_tree *tree,  		       "%llu %llu\n", (unsigned long long)found->start,  		       (unsigned long long)found->end,  		       (unsigned long long)start, (unsigned long long)end); -		free_extent_state(state);  		return -EEXIST;  	}  	state->tree = tree; @@ -500,7 +498,8 @@ again:  			cached_state = NULL;  		} -		if (cached && cached->tree && cached->start == start) { +		if (cached && cached->tree && cached->start <= start && +		    cached->end > start) {  			if (clear)  				atomic_dec(&cached->refs);  			state = cached; @@ -742,7 +741,8 @@ again:  	spin_lock(&tree->lock);  	if (cached_state && *cached_state) {  		state = *cached_state; -		if (state->start == start && state->tree) { +		if (state->start <= start && state->end > start && +		    state->tree) {  			node = &state->rb_node;  			goto hit_next;  		} @@ -783,13 +783,13 @@ hit_next:  		if (err)  			goto out; -		next_node = rb_next(node);  		cache_state(state, cached_state);  		merge_state(tree, state);  		if (last_end == (u64)-1)  			goto out;  		start = last_end + 1; +		next_node = rb_next(&state->rb_node);  		if (next_node && start < end && prealloc && !need_resched()) {  			state = rb_entry(next_node, struct extent_state,  					 rb_node); @@ -862,7 +862,6 @@ hit_next:  		 * Avoid to free 'prealloc' if it can be merged with  		 * the later extent.  		 
*/ -		atomic_inc(&prealloc->refs);  		err = insert_state(tree, prealloc, start, this_end,  				   &bits);  		BUG_ON(err == -EEXIST); @@ -872,7 +871,6 @@ hit_next:  			goto out;  		}  		cache_state(prealloc, cached_state); -		free_extent_state(prealloc);  		prealloc = NULL;  		start = this_end + 1;  		goto search_again; @@ -1564,7 +1562,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,  	int bitset = 0;  	spin_lock(&tree->lock); -	if (cached && cached->tree && cached->start == start) +	if (cached && cached->tree && cached->start <= start && +	    cached->end > start)  		node = &cached->rb_node;  	else  		node = tree_search(tree, start); @@ -2432,6 +2431,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  	pgoff_t index;  	pgoff_t end;		/* Inclusive */  	int scanned = 0; +	int tag;  	pagevec_init(&pvec, 0);  	if (wbc->range_cyclic) { @@ -2442,11 +2442,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  		end = wbc->range_end >> PAGE_CACHE_SHIFT;  		scanned = 1;  	} +	if (wbc->sync_mode == WB_SYNC_ALL) +		tag = PAGECACHE_TAG_TOWRITE; +	else +		tag = PAGECACHE_TAG_DIRTY;  retry: +	if (wbc->sync_mode == WB_SYNC_ALL) +		tag_pages_for_writeback(mapping, index, end);  	while (!done && !nr_to_write_done && (index <= end) && -	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, -			      PAGECACHE_TAG_DIRTY, min(end - index, -				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, +			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {  		unsigned i;  		scanned = 1; @@ -3022,8 +3027,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,  		return NULL;  	eb->start = start;  	eb->len = len; -	spin_lock_init(&eb->lock); -	init_waitqueue_head(&eb->lock_wq); +	rwlock_init(&eb->lock); +	atomic_set(&eb->write_locks, 0); +	atomic_set(&eb->read_locks, 0); +	atomic_set(&eb->blocking_readers, 0); +	atomic_set(&eb->blocking_writers, 0); +	atomic_set(&eb->spinning_readers, 0); +	atomic_set(&eb->spinning_writers, 0); +	init_waitqueue_head(&eb->write_lock_wq); +	init_waitqueue_head(&eb->read_lock_wq);  #if LEAK_DEBUG  	spin_lock_irqsave(&leak_lock, flags); @@ -3119,7 +3131,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,  		i = 0;  	}  	for (; i < num_pages; i++, index++) { -		p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); +		p = find_or_create_page(mapping, index, GFP_NOFS);  		if (!p) {  			WARN_ON(1);  			goto free_eb; @@ -3266,6 +3278,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,  	return was_dirty;  } +static int __eb_straddles_pages(u64 start, u64 len) +{ +	if (len < PAGE_CACHE_SIZE) +		return 1; +	if (start & (PAGE_CACHE_SIZE - 1)) +		return 1; +	if ((start + len) & (PAGE_CACHE_SIZE - 1)) +		return 1; +	return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ +	return __eb_straddles_pages(eb->start, eb->len); +} +  int clear_extent_buffer_uptodate(struct extent_io_tree *tree,  				struct extent_buffer *eb,  				struct extent_state **cached_state) @@ -3277,8 +3305,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,  	num_pages = num_extent_pages(eb->start, eb->len);  	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, -			      cached_state, GFP_NOFS); +	if (eb_straddles_pages(eb)) { +		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, +				      cached_state, GFP_NOFS); +	}  	for (i 
= 0; i < num_pages; i++) {  		page = extent_buffer_page(eb, i);  		if (page) @@ -3296,8 +3326,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,  	num_pages = num_extent_pages(eb->start, eb->len); -	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, -			    NULL, GFP_NOFS); +	if (eb_straddles_pages(eb)) { +		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, +				    NULL, GFP_NOFS); +	}  	for (i = 0; i < num_pages; i++) {  		page = extent_buffer_page(eb, i);  		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3320,9 +3352,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,  	int uptodate;  	unsigned long index; -	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); -	if (ret) -		return 1; +	if (__eb_straddles_pages(start, end - start + 1)) { +		ret = test_range_bit(tree, start, end, +				     EXTENT_UPTODATE, 1, NULL); +		if (ret) +			return 1; +	}  	while (start <= end) {  		index = start >> PAGE_CACHE_SHIFT;  		page = find_get_page(tree->mapping, index); @@ -3350,10 +3385,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 1; -	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, -			   EXTENT_UPTODATE, 1, cached_state); -	if (ret) -		return ret; +	if (eb_straddles_pages(eb)) { +		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, +				   EXTENT_UPTODATE, 1, cached_state); +		if (ret) +			return ret; +	}  	num_pages = num_extent_pages(eb->start, eb->len);  	for (i = 0; i < num_pages; i++) { @@ -3386,9 +3423,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 0; -	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, -			   EXTENT_UPTODATE, 1, NULL)) { -		return 0; +	if (eb_straddles_pages(eb)) { +		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, +				   EXTENT_UPTODATE, 1, NULL)) { +			return 0; +		}  	}  	if (start) { @@ -3492,9 +3531,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,  		page = extent_buffer_page(eb, i);  		cur = min(len, (PAGE_CACHE_SIZE - offset)); -		kaddr = kmap_atomic(page, KM_USER1); +		kaddr = page_address(page);  		memcpy(dst, kaddr + offset, cur); -		kunmap_atomic(kaddr, KM_USER1);  		dst += cur;  		len -= cur; @@ -3504,9 +3542,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,  }  int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, -			       unsigned long min_len, char **token, char **map, +			       unsigned long min_len, char **map,  			       unsigned long *map_start, -			       unsigned long *map_len, int km) +			       unsigned long *map_len)  {  	size_t offset = start & (PAGE_CACHE_SIZE - 1);  	char *kaddr; @@ -3536,42 +3574,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,  	}  	p = extent_buffer_page(eb, i); -	kaddr = kmap_atomic(p, km); -	*token = kaddr; +	kaddr = page_address(p);  	*map = kaddr + offset;  	*map_len = PAGE_CACHE_SIZE - offset;  	return 0;  } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, -		      unsigned long min_len, -		      char **token, char **map, -		      unsigned long *map_start, -		      unsigned long *map_len, int km) -{ -	int err; -	int save = 0; -	if (eb->map_token) { -		unmap_extent_buffer(eb, eb->map_token, km); -		eb->map_token = NULL; -		save = 1; -	} -	err = map_private_extent_buffer(eb, start, min_len, token, map, -				       map_start, map_len, km); 
-	if (!err && save) { -		eb->map_token = *token; -		eb->kaddr = *map; -		eb->map_start = *map_start; -		eb->map_len = *map_len; -	} -	return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ -	kunmap_atomic(token, km); -} -  int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,  			  unsigned long start,  			  unsigned long len) @@ -3595,9 +3603,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,  		cur = min(len, (PAGE_CACHE_SIZE - offset)); -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = page_address(page);  		ret = memcmp(ptr, kaddr + offset, cur); -		kunmap_atomic(kaddr, KM_USER0);  		if (ret)  			break; @@ -3630,9 +3637,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,  		WARN_ON(!PageUptodate(page));  		cur = min(len, PAGE_CACHE_SIZE - offset); -		kaddr = kmap_atomic(page, KM_USER1); +		kaddr = page_address(page);  		memcpy(kaddr + offset, src, cur); -		kunmap_atomic(kaddr, KM_USER1);  		src += cur;  		len -= cur; @@ -3661,9 +3667,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,  		WARN_ON(!PageUptodate(page));  		cur = min(len, PAGE_CACHE_SIZE - offset); -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = page_address(page);  		memset(kaddr + offset, c, cur); -		kunmap_atomic(kaddr, KM_USER0);  		len -= cur;  		offset = 0; @@ -3694,9 +3699,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,  		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = page_address(page);  		read_extent_buffer(src, kaddr + offset, src_offset, cur); -		kunmap_atomic(kaddr, KM_USER0);  		src_offset += cur;  		len -= cur; @@ -3709,20 +3713,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,  		       unsigned long dst_off, unsigned long src_off,  		       unsigned long len)  { -	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); +	char *dst_kaddr = page_address(dst_page);  	if (dst_page == src_page) {  		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);  	} else { -		char *src_kaddr = kmap_atomic(src_page, KM_USER1); +		char *src_kaddr = page_address(src_page);  		char *p = dst_kaddr + dst_off + len;  		char *s = src_kaddr + src_off + len;  		while (len--)  			*--p = *--s; - -		kunmap_atomic(src_kaddr, KM_USER1);  	} -	kunmap_atomic(dst_kaddr, KM_USER0);  }  static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -3735,20 +3736,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,  		       unsigned long dst_off, unsigned long src_off,  		       unsigned long len)  { -	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); +	char *dst_kaddr = page_address(dst_page);  	char *src_kaddr;  	if (dst_page != src_page) { -		src_kaddr = kmap_atomic(src_page, KM_USER1); +		src_kaddr = page_address(src_page);  	} else {  		src_kaddr = dst_kaddr;  		BUG_ON(areas_overlap(src_off, dst_off, len));  	}  	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -	kunmap_atomic(dst_kaddr, KM_USER0); -	if (dst_page != src_page) -		kunmap_atomic(src_kaddr, KM_USER1);  }  void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a11a92ee2d3..21a7ca9e728 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -120,8 +120,6 @@ struct extent_state {  struct extent_buffer {  	u64 start;  	unsigned long len; -	char *map_token; -	char *kaddr;  	unsigned long 
map_start;  	unsigned long map_len;  	struct page *first_page; @@ -130,14 +128,26 @@ struct extent_buffer {  	struct rcu_head rcu_head;  	atomic_t refs; -	/* the spinlock is used to protect most operations */ -	spinlock_t lock; +	/* count of read lock holders on the extent buffer */ +	atomic_t write_locks; +	atomic_t read_locks; +	atomic_t blocking_writers; +	atomic_t blocking_readers; +	atomic_t spinning_readers; +	atomic_t spinning_writers; + +	/* protects write locks */ +	rwlock_t lock; -	/* -	 * when we keep the lock held while blocking, waiters go onto -	 * the wq +	/* readers use lock_wq while they wait for the write +	 * lock holders to unlock  	 */ -	wait_queue_head_t lock_wq; +	wait_queue_head_t write_lock_wq; + +	/* writers use read_lock_wq while they wait for readers +	 * to unlock +	 */ +	wait_queue_head_t read_lock_wq;  };  static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -279,15 +289,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,  int extent_buffer_uptodate(struct extent_io_tree *tree,  			   struct extent_buffer *eb,  			   struct extent_state *cached_state); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, -		      unsigned long min_len, char **token, char **map, -		      unsigned long *map_start, -		      unsigned long *map_len, int km);  int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, -		      unsigned long min_len, char **token, char **map, +		      unsigned long min_len, char **map,  		      unsigned long *map_start, -		      unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +		      unsigned long *map_len);  int extent_range_uptodate(struct extent_io_tree *tree,  			  u64 start, u64 end);  int extent_clear_unlock_delalloc(struct inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90d4ee52cd4..08bcfa92a22 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,6 +177,15 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,  	WARN_ON(bio->bi_vcnt <= 0); +	/* +	 * the free space stuff is only read when it hasn't been +	 * updated in the current transaction.  So, we can safely +	 * read from the commit root and sidestep a nasty deadlock +	 * between reading the free space cache and updating the csum tree. 
+	 */
+	if (btrfs_is_free_space_inode(root, inode))
+		path->search_commit_root = 1;
+
 	disk_bytenr = (u64)bio->bi_sector << 9;
 	if (dio)
 		offset = logical_offset;
@@ -664,10 +673,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	char *eb_map;
-	char *eb_token;
-	unsigned long map_len;
-	unsigned long map_start;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
@@ -814,30 +819,9 @@ found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
-	eb_token = NULL;
 next_sector:
-	if (!eb_token ||
-	   (unsigned long)item + csum_size >= map_start + map_len) {
-		int err;
-
-		if (eb_token)
-			unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-		err = map_private_extent_buffer(leaf, (unsigned long)item,
-						csum_size,
-						&eb_token, &eb_map,
-						&map_start, &map_len, KM_USER1);
-		if (err)
-			eb_token = NULL;
-	}
-	if (eb_token) {
-		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &sector_sum->sum, csum_size);
-	} else {
-		write_extent_buffer(leaf, &sector_sum->sum,
-				    (unsigned long)item, csum_size);
-	}
+	write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
 
 	total_bytes += root->sectorsize;
 	sector_sum++;
@@ -850,10 +834,7 @@ next_sector:
 			goto next_sector;
 		}
 	}
-	if (eb_token) {
-		unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-	}
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
 		btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fa4ef18b66b..6e56a468d1f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1081,7 +1081,8 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 again:
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = grab_cache_page(inode->i_mapping, index + i);
+		pages[i] = find_or_create_page(inode->i_mapping, index + i,
+					       GFP_NOFS);
 		if (!pages[i]) {
 			faili = i - 1;
 			err = -ENOMEM;
@@ -1238,9 +1239,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * managed to copy.
 		 
*/  		if (num_pages > dirty_pages) { -			if (copied > 0) -				atomic_inc( -					&BTRFS_I(inode)->outstanding_extents); +			if (copied > 0) { +				spin_lock(&BTRFS_I(inode)->lock); +				BTRFS_I(inode)->outstanding_extents++; +				spin_unlock(&BTRFS_I(inode)->lock); +			}  			btrfs_delalloc_release_space(inode,  					(num_pages - dirty_pages) <<  					PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bf0d61567f3..6377713f639 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  		return inode;  	spin_lock(&block_group->lock); +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { +		printk(KERN_INFO "Old style space inode found, converting.\n"); +		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; +		block_group->disk_cache_state = BTRFS_DC_CLEAR; +	} +  	if (!btrfs_fs_closing(root->fs_info)) {  		block_group->inode = igrab(inode);  		block_group->iref = 1; @@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,  	btrfs_set_inode_gid(leaf, inode_item, 0);  	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);  	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | -			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); +			      BTRFS_INODE_PREALLOC);  	btrfs_set_inode_nlink(leaf, inode_item, 1);  	btrfs_set_inode_transid(leaf, inode_item, trans->transid);  	btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  	struct btrfs_free_space_header *header;  	struct extent_buffer *leaf;  	struct page *page; -	u32 *checksums = NULL, *crc; -	char *disk_crcs = NULL;  	struct btrfs_key key;  	struct list_head bitmaps;  	u64 num_entries;  	u64 num_bitmaps;  	u64 generation; -	u32 cur_crc = ~(u32)0;  	pgoff_t index = 0; -	unsigned long first_page_offset; -	int num_checksums;  	int ret = 0;  	INIT_LIST_HEAD(&bitmaps); @@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  	if (!num_entries)  		goto out; -	/* Setup everything for doing checksumming */ -	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; -	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); -	if (!checksums) -		goto out; -	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); -	disk_crcs = kzalloc(first_page_offset, GFP_NOFS); -	if (!disk_crcs) -		goto out; -  	ret = readahead_cache(inode);  	if (ret)  		goto out; @@ -311,18 +302,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  		struct btrfs_free_space *e;  		void *addr;  		unsigned long offset = 0; -		unsigned long start_offset = 0;  		int need_loop = 0;  		if (!num_entries && !num_bitmaps)  			break; -		if (index == 0) { -			start_offset = first_page_offset; -			offset = start_offset; -		} - -		page = grab_cache_page(inode->i_mapping, index); +		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);  		if (!page)  			goto free_cache; @@ -342,8 +327,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  		if (index == 0) {  			u64 *gen; -			memcpy(disk_crcs, addr, first_page_offset); -			gen = addr + (sizeof(u32) * num_checksums); +			/* +			 * We put a bogus crc in the front of the first page in +			 * case old kernels try to mount a fs with the new +			 * format to make sure they discard the cache. 
+			 */ +			addr += sizeof(u64); +			offset += sizeof(u64); + +			gen = addr;  			if (*gen != BTRFS_I(inode)->generation) {  				printk(KERN_ERR "btrfs: space cache generation"  				       " (%llu) does not match inode (%llu)\n", @@ -355,24 +347,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  				page_cache_release(page);  				goto free_cache;  			} -			crc = (u32 *)disk_crcs; -		} -		entry = addr + start_offset; - -		/* First lets check our crc before we do anything fun */ -		cur_crc = ~(u32)0; -		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, -					  PAGE_CACHE_SIZE - start_offset); -		btrfs_csum_final(cur_crc, (char *)&cur_crc); -		if (cur_crc != *crc) { -			printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", -			       index); -			kunmap(page); -			unlock_page(page); -			page_cache_release(page); -			goto free_cache; +			addr += sizeof(u64); +			offset += sizeof(u64);  		} -		crc++; +		entry = addr;  		while (1) {  			if (!num_entries) @@ -470,8 +448,6 @@ next:  	ret = 1;  out: -	kfree(checksums); -	kfree(disk_crcs);  	return ret;  free_cache:  	__btrfs_remove_free_space_cache(ctl); @@ -569,8 +545,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	struct btrfs_key key;  	u64 start, end, len;  	u64 bytes = 0; -	u32 *crc, *checksums; -	unsigned long first_page_offset; +	u32 crc = ~(u32)0;  	int index = 0, num_pages = 0;  	int entries = 0;  	int bitmaps = 0; @@ -590,34 +565,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>  		PAGE_CACHE_SHIFT; -	/* Since the first page has all of our checksums and our generation we -	 * need to calculate the offset into the page that we can start writing -	 * our entries. -	 */ -	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); -  	filemap_write_and_wait(inode->i_mapping);  	btrfs_wait_ordered_range(inode, inode->i_size &  				 ~(root->sectorsize - 1), (u64)-1); -	/* make sure we don't overflow that first page */ -	if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { -		/* this is really the same as running out of space, where we also return 0 */ -		printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); -		ret = 0; -		goto out_update; -	} - -	/* We need a checksum per page. */ -	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); -	if (!crc) -		return -1; -  	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); -	if (!pages) { -		kfree(crc); +	if (!pages)  		return -1; -	}  	/* Get the cluster for this block_group if it exists */  	if (block_group && !list_empty(&block_group->cluster_list)) @@ -640,7 +594,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	 * know and don't freak out.  	 
*/  	while (index < num_pages) { -		page = grab_cache_page(inode->i_mapping, index); +		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);  		if (!page) {  			int i; @@ -648,7 +602,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  				unlock_page(pages[i]);  				page_cache_release(pages[i]);  			} -			goto out_free; +			goto out;  		}  		pages[index] = page;  		index++; @@ -668,17 +622,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	/* Write out the extent entries */  	do {  		struct btrfs_free_space_entry *entry; -		void *addr; +		void *addr, *orig;  		unsigned long offset = 0; -		unsigned long start_offset = 0;  		next_page = false; -		if (index == 0) { -			start_offset = first_page_offset; -			offset = start_offset; -		} -  		if (index >= num_pages) {  			out_of_space = true;  			break; @@ -686,10 +634,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		page = pages[index]; -		addr = kmap(page); -		entry = addr + start_offset; +		orig = addr = kmap(page); +		if (index == 0) { +			u64 *gen; -		memset(addr, 0, PAGE_CACHE_SIZE); +			/* +			 * We're going to put in a bogus crc for this page to +			 * make sure that old kernels who aren't aware of this +			 * format will be sure to discard the cache. +			 */ +			addr += sizeof(u64); +			offset += sizeof(u64); + +			gen = addr; +			*gen = trans->transid; +			addr += sizeof(u64); +			offset += sizeof(u64); +		} +		entry = addr; + +		memset(addr, 0, PAGE_CACHE_SIZE - offset);  		while (node && !next_page) {  			struct btrfs_free_space *e; @@ -752,13 +716,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  				next_page = true;  			entry++;  		} -		*crc = ~(u32)0; -		*crc = btrfs_csum_data(root, addr + start_offset, *crc, -				       PAGE_CACHE_SIZE - start_offset); -		kunmap(page); -		btrfs_csum_final(*crc, (char *)crc); -		crc++; +		/* Generate bogus crc value */ +		if (index == 0) { +			u32 *tmp; +			crc = btrfs_csum_data(root, orig + sizeof(u64), crc, +					      PAGE_CACHE_SIZE - sizeof(u64)); +			btrfs_csum_final(crc, (char *)&crc); +			crc++; +			tmp = orig; +			*tmp = crc; +		} + +		kunmap(page);  		bytes += PAGE_CACHE_SIZE; @@ -779,11 +749,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		addr = kmap(page);  		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); -		*crc = ~(u32)0; -		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);  		kunmap(page); -		btrfs_csum_final(*crc, (char *)crc); -		crc++;  		bytes += PAGE_CACHE_SIZE;  		list_del_init(&entry->list); @@ -796,7 +762,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  				     i_size_read(inode) - 1, &cached_state,  				     GFP_NOFS);  		ret = 0; -		goto out_free; +		goto out;  	}  	/* Zero out the rest of the pages just to make sure */ @@ -811,20 +777,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		index++;  	} -	/* Write the checksums and trans id to the first page */ -	{ -		void *addr; -		u64 *gen; - -		page = pages[0]; - -		addr = kmap(page); -		memcpy(addr, checksums, sizeof(u32) * num_pages); -		gen = addr + (sizeof(u32) * num_pages); -		*gen = trans->transid; -		kunmap(page); -	} -  	ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,  					    bytes, &cached_state);  	btrfs_drop_pages(pages, num_pages); @@ -833,7 +785,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	if (ret) {  		ret = 0; -		goto out_free; 
+		goto out;  	}  	BTRFS_I(inode)->generation = trans->transid; @@ -850,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,  				 EXTENT_DIRTY | EXTENT_DELALLOC |  				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); -		goto out_free; +		goto out;  	}  	leaf = path->nodes[0];  	if (ret > 0) { @@ -866,7 +818,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,  					 GFP_NOFS);  			btrfs_release_path(path); -			goto out_free; +			goto out;  		}  	}  	header = btrfs_item_ptr(leaf, path->slots[0], @@ -879,11 +831,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	ret = 1; -out_free: -	kfree(checksums); +out:  	kfree(pages); - -out_update:  	if (ret != 1) {  		invalidate_inode_pages2_range(inode->i_mapping, 0, index);  		BTRFS_I(inode)->generation = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3601f0aebdd..586cf6a4385 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,  	return alloc_hint;  } -static inline bool is_free_space_inode(struct btrfs_root *root, -				       struct inode *inode) -{ -	if (root == root->fs_info->tree_root || -	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) -		return true; -	return false; -} -  /*   * when extent_io.c finds a delayed allocation range in the file,   * the call backs end up in this code.  The basic idea is to @@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode,  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	int ret = 0; -	BUG_ON(is_free_space_inode(root, inode)); +	BUG_ON(btrfs_is_free_space_inode(root, inode));  	trans = btrfs_join_transaction(root);  	BUG_ON(IS_ERR(trans));  	trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1072,7 +1063,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,  	path = btrfs_alloc_path();  	BUG_ON(!path); -	nolock = is_free_space_inode(root, inode); +	nolock = btrfs_is_free_space_inode(root, inode);  	if (nolock)  		trans = btrfs_join_transaction_nolock(root); @@ -1298,7 +1289,9 @@ static int btrfs_split_extent_hook(struct inode *inode,  	if (!(orig->state & EXTENT_DELALLOC))  		return 0; -	atomic_inc(&BTRFS_I(inode)->outstanding_extents); +	spin_lock(&BTRFS_I(inode)->lock); +	BTRFS_I(inode)->outstanding_extents++; +	spin_unlock(&BTRFS_I(inode)->lock);  	return 0;  } @@ -1316,7 +1309,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,  	if (!(other->state & EXTENT_DELALLOC))  		return 0; -	atomic_dec(&BTRFS_I(inode)->outstanding_extents); +	spin_lock(&BTRFS_I(inode)->lock); +	BTRFS_I(inode)->outstanding_extents--; +	spin_unlock(&BTRFS_I(inode)->lock);  	return 0;  } @@ -1337,12 +1332,15 @@ static int btrfs_set_bit_hook(struct inode *inode,  	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {  		struct btrfs_root *root = BTRFS_I(inode)->root;  		u64 len = state->end + 1 - state->start; -		bool do_list = !is_free_space_inode(root, inode); +		bool do_list = !btrfs_is_free_space_inode(root, inode); -		if (*bits & EXTENT_FIRST_DELALLOC) +		if (*bits & EXTENT_FIRST_DELALLOC) {  			*bits &= ~EXTENT_FIRST_DELALLOC; -		else -			atomic_inc(&BTRFS_I(inode)->outstanding_extents); +		} else { +			spin_lock(&BTRFS_I(inode)->lock); +			BTRFS_I(inode)->outstanding_extents++; +			spin_unlock(&BTRFS_I(inode)->lock); +		}  		
spin_lock(&root->fs_info->delalloc_lock);  		BTRFS_I(inode)->delalloc_bytes += len; @@ -1370,12 +1368,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,  	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {  		struct btrfs_root *root = BTRFS_I(inode)->root;  		u64 len = state->end + 1 - state->start; -		bool do_list = !is_free_space_inode(root, inode); +		bool do_list = !btrfs_is_free_space_inode(root, inode); -		if (*bits & EXTENT_FIRST_DELALLOC) +		if (*bits & EXTENT_FIRST_DELALLOC) {  			*bits &= ~EXTENT_FIRST_DELALLOC; -		else if (!(*bits & EXTENT_DO_ACCOUNTING)) -			atomic_dec(&BTRFS_I(inode)->outstanding_extents); +		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) { +			spin_lock(&BTRFS_I(inode)->lock); +			BTRFS_I(inode)->outstanding_extents--; +			spin_unlock(&BTRFS_I(inode)->lock); +		}  		if (*bits & EXTENT_DO_ACCOUNTING)  			btrfs_delalloc_release_metadata(inode, len); @@ -1477,7 +1478,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -	if (is_free_space_inode(root, inode)) +	if (btrfs_is_free_space_inode(root, inode))  		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);  	else  		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); @@ -1726,7 +1727,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  		return 0;  	BUG_ON(!ordered_extent); -	nolock = is_free_space_inode(root, inode); +	nolock = btrfs_is_free_space_inode(root, inode);  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {  		BUG_ON(!list_empty(&ordered_extent->list)); @@ -2531,13 +2532,6 @@ static void btrfs_read_locked_inode(struct inode *inode)  	inode_item = btrfs_item_ptr(leaf, path->slots[0],  				    struct btrfs_inode_item); -	if (!leaf->map_token) -		map_private_extent_buffer(leaf, (unsigned long)inode_item, -					  sizeof(struct btrfs_inode_item), -					  &leaf->map_token, &leaf->kaddr, -					  &leaf->map_start, &leaf->map_len, -					  KM_USER1); -  	inode->i_mode = btrfs_inode_mode(leaf, inode_item);  	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);  	inode->i_uid = btrfs_inode_uid(leaf, inode_item); @@ -2575,11 +2569,6 @@ cache_acl:  	if (!maybe_acls)  		cache_no_acl(inode); -	if (leaf->map_token) { -		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); -		leaf->map_token = NULL; -	} -  	btrfs_free_path(path);  	switch (inode->i_mode & S_IFMT) { @@ -2624,13 +2613,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct btrfs_inode_item *item,  			    struct inode *inode)  { -	if (!leaf->map_token) -		map_private_extent_buffer(leaf, (unsigned long)item, -					  sizeof(struct btrfs_inode_item), -					  &leaf->map_token, &leaf->kaddr, -					  &leaf->map_start, &leaf->map_len, -					  KM_USER1); -  	btrfs_set_inode_uid(leaf, item, inode->i_uid);  	btrfs_set_inode_gid(leaf, item, inode->i_gid);  	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2659,11 +2641,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);  	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);  	btrfs_set_inode_block_group(leaf, item, 0); - -	if (leaf->map_token) { -		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); -		leaf->map_token = NULL; -	}  }  /* @@ -2684,7 +2661,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,  	 * The data relocation inode should also be directly updated  	 * without delay  	 */ -	if (!is_free_space_inode(root, inode) +	
if (!btrfs_is_free_space_inode(root, inode)  	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {  		ret = btrfs_delayed_update_inode(trans, root, inode);  		if (!ret) @@ -3398,7 +3375,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)  	ret = -ENOMEM;  again: -	page = grab_cache_page(mapping, index); +	page = find_or_create_page(mapping, index, GFP_NOFS);  	if (!page) {  		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);  		goto out; @@ -3634,7 +3611,7 @@ void btrfs_evict_inode(struct inode *inode)  	truncate_inode_pages(&inode->i_data, 0);  	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || -			       is_free_space_inode(root, inode))) +			       btrfs_is_free_space_inode(root, inode)))  		goto no_delete;  	if (is_bad_inode(inode)) { @@ -4277,7 +4254,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)  	if (BTRFS_I(inode)->dummy_inode)  		return 0; -	if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) +	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))  		nolock = true;  	if (wbc->sync_mode == WB_SYNC_ALL) { @@ -6735,8 +6712,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->index_cnt = (u64)-1;  	ei->last_unlink_trans = 0; -	atomic_set(&ei->outstanding_extents, 0); -	atomic_set(&ei->reserved_extents, 0); +	spin_lock_init(&ei->lock); +	ei->outstanding_extents = 0; +	ei->reserved_extents = 0;  	ei->ordered_data_close = 0;  	ei->orphan_meta_reserved = 0; @@ -6774,8 +6752,8 @@ void btrfs_destroy_inode(struct inode *inode)  	WARN_ON(!list_empty(&inode->i_dentry));  	WARN_ON(inode->i_data.nrpages); -	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); -	WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); +	WARN_ON(BTRFS_I(inode)->outstanding_extents); +	WARN_ON(BTRFS_I(inode)->reserved_extents);  	/*  	 * This can happen where we create an inode, but somebody else also @@ -6830,7 +6808,7 @@ int btrfs_drop_inode(struct inode *inode)  	struct btrfs_root *root = BTRFS_I(inode)->root;  	if (btrfs_root_refs(&root->root_item) == 0 && -	    !is_free_space_inode(root, inode)) +	    !btrfs_is_free_space_inode(root, inode))  		return 1;  	else  		return generic_drop_inode(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a3c4751e07d..fd252fff4c6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -867,8 +867,8 @@ again:  	/* step one, lock all the pages */  	for (i = 0; i < num_pages; i++) {  		struct page *page; -		page = grab_cache_page(inode->i_mapping, -					    start_index + i); +		page = find_or_create_page(inode->i_mapping, +					    start_index + i, GFP_NOFS);  		if (!page)  			break; @@ -938,7 +938,9 @@ again:  			  GFP_NOFS);  	if (i_done != num_pages) { -		atomic_inc(&BTRFS_I(inode)->outstanding_extents); +		spin_lock(&BTRFS_I(inode)->lock); +		BTRFS_I(inode)->outstanding_extents++; +		spin_unlock(&BTRFS_I(inode)->lock);  		btrfs_delalloc_release_space(inode,  				     (num_pages - i_done) << PAGE_CACHE_SHIFT);  	} diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66fa43dc3f0..d77b67c4b27 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,185 +24,197 @@  #include "extent_io.h"  #include "locking.h" -static inline void spin_nested(struct extent_buffer *eb) +void btrfs_assert_tree_read_locked(struct extent_buffer *eb); + +/* + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. 
+ */ +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)  { -	spin_lock(&eb->lock); +	if (rw == BTRFS_WRITE_LOCK) { +		if (atomic_read(&eb->blocking_writers) == 0) { +			WARN_ON(atomic_read(&eb->spinning_writers) != 1); +			atomic_dec(&eb->spinning_writers); +			btrfs_assert_tree_locked(eb); +			atomic_inc(&eb->blocking_writers); +			write_unlock(&eb->lock); +		} +	} else if (rw == BTRFS_READ_LOCK) { +		btrfs_assert_tree_read_locked(eb); +		atomic_inc(&eb->blocking_readers); +		WARN_ON(atomic_read(&eb->spinning_readers) == 0); +		atomic_dec(&eb->spinning_readers); +		read_unlock(&eb->lock); +	} +	return;  }  /* - * Setting a lock to blocking will drop the spinlock and set the - * flag that forces other procs who want the lock to wait.  After - * this you can safely schedule with the lock held. + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count   */ -void btrfs_set_lock_blocking(struct extent_buffer *eb) +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)  { -	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { -		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); -		spin_unlock(&eb->lock); +	if (rw == BTRFS_WRITE_LOCK_BLOCKING) { +		BUG_ON(atomic_read(&eb->blocking_writers) != 1); +		write_lock(&eb->lock); +		WARN_ON(atomic_read(&eb->spinning_writers)); +		atomic_inc(&eb->spinning_writers); +		if (atomic_dec_and_test(&eb->blocking_writers)) +			wake_up(&eb->write_lock_wq); +	} else if (rw == BTRFS_READ_LOCK_BLOCKING) { +		BUG_ON(atomic_read(&eb->blocking_readers) == 0); +		read_lock(&eb->lock); +		atomic_inc(&eb->spinning_readers); +		if (atomic_dec_and_test(&eb->blocking_readers)) +			wake_up(&eb->read_lock_wq);  	} -	/* exit with the spin lock released and the bit set */ +	return;  }  /* - * clearing the blocking flag will take the spinlock again. - * After this you can't safely schedule + * take a spinning read lock.  This will wait for any blocking + * writers   */ -void btrfs_clear_lock_blocking(struct extent_buffer *eb) +void btrfs_tree_read_lock(struct extent_buffer *eb)  { -	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { -		spin_nested(eb); -		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); -		smp_mb__after_clear_bit(); +again: +	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); +	read_lock(&eb->lock); +	if (atomic_read(&eb->blocking_writers)) { +		read_unlock(&eb->lock); +		wait_event(eb->write_lock_wq, +			   atomic_read(&eb->blocking_writers) == 0); +		goto again;  	} -	/* exit with the spin lock held */ +	atomic_inc(&eb->read_locks); +	atomic_inc(&eb->spinning_readers);  }  /* - * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for very long, and often they don't block - * at all.  For a dbench 50 run, if we don't spin on the blocking bit - * at all, the context switch rate can jump up to 400,000/sec or more. - * - * So, we're still stuck with this crummy spin on the blocking bit, - * at least until the most common causes of the short blocks - * can be dealt with. 
+ * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers   */ -static int btrfs_spin_on_block(struct extent_buffer *eb) +int btrfs_try_tree_read_lock(struct extent_buffer *eb)  { -	int i; +	if (atomic_read(&eb->blocking_writers)) +		return 0; -	for (i = 0; i < 512; i++) { -		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -			return 1; -		if (need_resched()) -			break; -		cpu_relax(); +	read_lock(&eb->lock); +	if (atomic_read(&eb->blocking_writers)) { +		read_unlock(&eb->lock); +		return 0;  	} -	return 0; +	atomic_inc(&eb->read_locks); +	atomic_inc(&eb->spinning_readers); +	return 1;  }  /* - * This is somewhat different from trylock.  It will take the - * spinlock but if it finds the lock is set to blocking, it will - * return without the lock held. - * - * returns 1 if it was able to take the lock and zero otherwise - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers   */ -int btrfs_try_spin_lock(struct extent_buffer *eb) +int btrfs_try_tree_write_lock(struct extent_buffer *eb)  { -	int i; - -	if (btrfs_spin_on_block(eb)) { -		spin_nested(eb); -		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -			return 1; -		spin_unlock(&eb->lock); +	if (atomic_read(&eb->blocking_writers) || +	    atomic_read(&eb->blocking_readers)) +		return 0; +	write_lock(&eb->lock); +	if (atomic_read(&eb->blocking_writers) || +	    atomic_read(&eb->blocking_readers)) { +		write_unlock(&eb->lock); +		return 0;  	} -	/* spin for a bit on the BLOCKING flag */ -	for (i = 0; i < 2; i++) { -		cpu_relax(); -		if (!btrfs_spin_on_block(eb)) -			break; +	atomic_inc(&eb->write_locks); +	atomic_inc(&eb->spinning_writers); +	return 1; +} -		spin_nested(eb); -		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -			return 1; -		spin_unlock(&eb->lock); -	} -	return 0; +/* + * drop a spinning read lock + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ +	btrfs_assert_tree_read_locked(eb); +	WARN_ON(atomic_read(&eb->spinning_readers) == 0); +	atomic_dec(&eb->spinning_readers); +	atomic_dec(&eb->read_locks); +	read_unlock(&eb->lock);  }  /* - * the autoremove wake function will return 0 if it tried to wake up - * a process that was already awake, which means that process won't - * count as an exclusive wakeup.  The waitq code will continue waking - * procs until it finds one that was actually sleeping. - * - * For btrfs, this isn't quite what we want.  We want a single proc - * to be notified that the lock is ready for taking.  If that proc - * already happen to be awake, great, it will loop around and try for - * the lock. - * - * So, btrfs_wake_function always returns 1, even when the proc that we - * tried to wake up was already awake. + * drop a blocking read lock   */ -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, -			       int sync, void *key) +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)  { -	autoremove_wake_function(wait, mode, sync, key); -	return 1; +	btrfs_assert_tree_read_locked(eb); +	WARN_ON(atomic_read(&eb->blocking_readers) == 0); +	if (atomic_dec_and_test(&eb->blocking_readers)) +		wake_up(&eb->read_lock_wq); +	atomic_dec(&eb->read_locks);  }  /* - * returns with the extent buffer spinlocked. - * - * This will spin and/or wait as required to take the lock, and then - * return with the spinlock held. 
- * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * take a spinning write lock.  This will wait for both + * blocking readers or writers   */  int btrfs_tree_lock(struct extent_buffer *eb)  { -	DEFINE_WAIT(wait); -	wait.func = btrfs_wake_function; - -	if (!btrfs_spin_on_block(eb)) -		goto sleep; - -	while(1) { -		spin_nested(eb); - -		/* nobody is blocking, exit with the spinlock held */ -		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -			return 0; - -		/* -		 * we have the spinlock, but the real owner is blocking. -		 * wait for them -		 */ -		spin_unlock(&eb->lock); - -		/* -		 * spin for a bit, and if the blocking flag goes away, -		 * loop around -		 */ -		cpu_relax(); -		if (btrfs_spin_on_block(eb)) -			continue; -sleep: -		prepare_to_wait_exclusive(&eb->lock_wq, &wait, -					  TASK_UNINTERRUPTIBLE); - -		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -			schedule(); - -		finish_wait(&eb->lock_wq, &wait); +again: +	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); +	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); +	write_lock(&eb->lock); +	if (atomic_read(&eb->blocking_readers)) { +		write_unlock(&eb->lock); +		wait_event(eb->read_lock_wq, +			   atomic_read(&eb->blocking_readers) == 0); +		goto again;  	} +	if (atomic_read(&eb->blocking_writers)) { +		write_unlock(&eb->lock); +		wait_event(eb->write_lock_wq, +			   atomic_read(&eb->blocking_writers) == 0); +		goto again; +	} +	WARN_ON(atomic_read(&eb->spinning_writers)); +	atomic_inc(&eb->spinning_writers); +	atomic_inc(&eb->write_locks);  	return 0;  } +/* + * drop a spinning or a blocking write lock. + */  int btrfs_tree_unlock(struct extent_buffer *eb)  { -	/* -	 * if we were a blocking owner, we don't have the spinlock held -	 * just clear the bit and look for waiters -	 */ -	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -		smp_mb__after_clear_bit(); -	else -		spin_unlock(&eb->lock); +	int blockers = atomic_read(&eb->blocking_writers); + +	BUG_ON(blockers > 1); + +	btrfs_assert_tree_locked(eb); +	atomic_dec(&eb->write_locks); -	if (waitqueue_active(&eb->lock_wq)) -		wake_up(&eb->lock_wq); +	if (blockers) { +		WARN_ON(atomic_read(&eb->spinning_writers)); +		atomic_dec(&eb->blocking_writers); +		smp_wmb(); +		wake_up(&eb->write_lock_wq); +	} else { +		WARN_ON(atomic_read(&eb->spinning_writers) != 1); +		atomic_dec(&eb->spinning_writers); +		write_unlock(&eb->lock); +	}  	return 0;  }  void btrfs_assert_tree_locked(struct extent_buffer *eb)  { -	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) -		assert_spin_locked(&eb->lock); +	BUG_ON(!atomic_read(&eb->write_locks)); +} + +void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ +	BUG_ON(!atomic_read(&eb->read_locks));  } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5c33a560a2f..17247ddb81a 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,11 +19,43 @@  #ifndef __BTRFS_LOCKING_  #define __BTRFS_LOCKING_ +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 +  int btrfs_tree_lock(struct extent_buffer *eb);  int btrfs_tree_unlock(struct extent_buffer *eb);  int btrfs_try_spin_lock(struct extent_buffer *eb); -void btrfs_set_lock_blocking(struct extent_buffer *eb); -void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void 
btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);  void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ +	if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) +		btrfs_tree_unlock(eb); +	else if (rw == BTRFS_READ_LOCK_BLOCKING) +		btrfs_tree_read_unlock_blocking(eb); +	else if (rw == BTRFS_READ_LOCK) +		btrfs_tree_read_unlock(eb); +	else +		BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ +	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ +	btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +}  #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5e0a3dc79a4..59bb1764273 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode,  			page_cache_sync_readahead(inode->i_mapping,  						  ra, NULL, index,  						  last_index + 1 - index); -			page = grab_cache_page(inode->i_mapping, index); +			page = find_or_create_page(inode->i_mapping, index, +						   GFP_NOFS);  			if (!page) {  				btrfs_delalloc_release_metadata(inode,  							PAGE_CACHE_SIZE); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c0f7ecaf1e7..bc1f6ad1844 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\  	unsigned long part_offset = (unsigned long)s;			\  	unsigned long offset = part_offset + offsetof(type, member);	\  	type *p;							\ -	/* ugly, but we want the fast path here */			\ -	if (eb->map_token && offset >= eb->map_start &&			\ -	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\ -	    eb->map_len) {						\ -		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\ -		return le##bits##_to_cpu(p->member);			\ -	}								\ -	{								\ -		int err;						\ -		char *map_token;					\ -		char *kaddr;						\ -		int unmap_on_exit = (eb->map_token == NULL);		\ -		unsigned long map_start;				\ -		unsigned long map_len;					\ -		u##bits res;						\ -		err = map_extent_buffer(eb, offset,			\ -				sizeof(((type *)0)->member),		\ -				&map_token, &kaddr,			\ -				&map_start, &map_len, KM_USER1);	\ -		if (err) {						\ -			__le##bits leres;				\ -			read_eb_member(eb, s, type, member, &leres);	\ -			return le##bits##_to_cpu(leres);		\ -		}							\ -		p = (type *)(kaddr + part_offset - map_start);		\ -		res = le##bits##_to_cpu(p->member);			\ -		if (unmap_on_exit)					\ -			unmap_extent_buffer(eb, map_token, KM_USER1);	\ -		return res;						\ -	}								\ +	int err;						\ +	char *kaddr;						\ +	unsigned long map_start;				\ +	unsigned long map_len;					\ +	u##bits res;						\ +	err = map_private_extent_buffer(eb, offset,		\ +			sizeof(((type *)0)->member),		\ +			&kaddr, &map_start, &map_len);		\ +	if (err) {						\ +		__le##bits leres;				\ +		read_eb_member(eb, s, type, member, &leres);	\ +		return le##bits##_to_cpu(leres);		\ +	}							\ +	p = (type *)(kaddr + part_offset - map_start);		\ +	res = le##bits##_to_cpu(p->member);			\ +	return res;						\  }									\  void btrfs_set_##name(struct extent_buffer *eb,				\  				    type 
*s, u##bits val)		\ @@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb,				\  	unsigned long part_offset = (unsigned long)s;			\  	unsigned long offset = part_offset + offsetof(type, member);	\  	type *p;							\ -	/* ugly, but we want the fast path here */			\ -	if (eb->map_token && offset >= eb->map_start &&			\ -	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\ -	    eb->map_len) {						\ -		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\ -		p->member = cpu_to_le##bits(val);			\ -		return;							\ -	}								\ -	{								\ -		int err;						\ -		char *map_token;					\ -		char *kaddr;						\ -		int unmap_on_exit = (eb->map_token == NULL);		\ -		unsigned long map_start;				\ -		unsigned long map_len;					\ -		err = map_extent_buffer(eb, offset,			\ -				sizeof(((type *)0)->member),		\ -				&map_token, &kaddr,			\ -				&map_start, &map_len, KM_USER1);	\ -		if (err) {						\ -			__le##bits val2;				\ -			val2 = cpu_to_le##bits(val);			\ -			write_eb_member(eb, s, type, member, &val2);	\ -			return;						\ -		}							\ -		p = (type *)(kaddr + part_offset - map_start);		\ -		p->member = cpu_to_le##bits(val);			\ -		if (unmap_on_exit)					\ -			unmap_extent_buffer(eb, map_token, KM_USER1);	\ -	}								\ +	int err;						\ +	char *kaddr;						\ +	unsigned long map_start;				\ +	unsigned long map_len;					\ +	err = map_private_extent_buffer(eb, offset,		\ +			sizeof(((type *)0)->member),		\ +			&kaddr, &map_start, &map_len);		\ +	if (err) {						\ +		__le##bits val2;				\ +		val2 = cpu_to_le##bits(val);			\ +		write_eb_member(eb, s, type, member, &val2);	\ +		return;						\ +	}							\ +	p = (type *)(kaddr + part_offset - map_start);		\ +	p->member = cpu_to_le##bits(val);			\  }  #include "ctree.h" @@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,  		    struct btrfs_disk_key *disk_key, int nr)  {  	unsigned long ptr = btrfs_node_key_ptr_offset(nr); -	if (eb->map_token && ptr >= eb->map_start && -	    ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { -		memcpy(disk_key, eb->kaddr + ptr - eb->map_start, -			sizeof(*disk_key)); -		return; -	} else if (eb->map_token) { -		unmap_extent_buffer(eb, eb->map_token, KM_USER1); -		eb->map_token = NULL; -	}  	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,  		       struct btrfs_key_ptr, key, disk_key);  } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51dcec86757..eb55863bb4a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  {  	struct btrfs_trans_handle *h;  	struct btrfs_transaction *cur_trans; -	int retries = 0; +	u64 num_bytes = 0;  	int ret;  	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -274,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  		h->block_rsv = NULL;  		goto got_it;  	} + +	/* +	 * Do the reservation before we join the transaction so we can do all +	 * the appropriate flushing if need be. 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec86757..eb55863bb4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -260,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
+	u64 num_bytes = 0;
 	int ret;
 
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		h->block_rsv = NULL;
 		goto got_it;
 	}
+
+	/*
+	 * Do the reservation before we join the transaction so we can do all
+	 * the appropriate flushing if need be.
+	 */
+	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		ret = btrfs_block_rsv_add(NULL, root,
+					  &root->fs_info->trans_block_rsv,
+					  num_bytes);
+		if (ret)
+			return ERR_PTR(ret);
+	}
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
@@ -310,24 +323,9 @@ again:
 		goto again;
 	}
 
-	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items);
-		if (ret == -EAGAIN && !retries) {
-			retries++;
-			btrfs_commit_transaction(h, root);
-			goto again;
-		} else if (ret == -EAGAIN) {
-			/*
-			 * We have already retried and got EAGAIN, so really we
-			 * don't have space, so set ret to -ENOSPC.
-			 */
-			ret = -ENOSPC;
-		}
-
-		if (ret < 0) {
-			btrfs_end_transaction(h, root);
-			return ERR_PTR(ret);
-		}
+	if (num_bytes) {
+		h->block_rsv = &root->fs_info->trans_block_rsv;
+		h->bytes_reserved = num_bytes;
 	}
 
 got_it:
@@ -499,10 +497,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-		if (throttle)
+		if (throttle) {
+			/*
+			 * We may race with somebody else here so end up having
+			 * to call end_transaction on ourselves again, so inc
+			 * our use_count.
+			 */
+			trans->use_count++;
 			return btrfs_commit_transaction(trans, root);
-		else
+		} else {
 			wake_up_process(info->transaction_kthread);
+		}
 	}
 
 	WARN_ON(cur_trans != info->running_transaction);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1..ac278dd8317 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_read_buffer(next, ptr_gen);
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				next = path->nodes[*level];
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			next = path->nodes[orig_level];
 
 			btrfs_tree_lock(next);
-			clean_tree_block(trans, log, next);
 			btrfs_set_lock_blocking(next);
+			clean_tree_block(trans, log, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc5363..b89e372c754 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3595,7 +3595,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
-	btrfs_set_buffer_lockdep_class(sb, 0);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 
 	array_size = btrfs_super_sys_array_size(super_copy);
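
Two points on the hunks above.  The transaction.c change moves the metadata
reservation ahead of joining the transaction, so any flushing the reservation
triggers happens while no transaction handle is held open.  The three
tree-log.c hunks are all the same fix for the new locking code:
btrfs_tree_lock() now returns with a spinning lock, and the lock must be
switched to blocking mode before calling anything that can schedule, which
clean_tree_block() presumably can.  The ordering those hunks enforce, as a
fragment lifted from the diff:

	btrfs_tree_lock(next);			/* spinning write lock */
	btrfs_set_lock_blocking(next);		/* may sleep from here on */
	clean_tree_block(trans, root, next);	/* can schedule, so only now */
	btrfs_wait_tree_block_writeback(next);
	btrfs_tree_unlock(next);
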
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe452ab..d733b9cfea3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,43 +102,57 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	/* first lets see if we already have this xattr */
-	di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-				strlen(name), -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
-
-	/* ok we already have this xattr, lets remove it */
-	if (di) {
-		/* if we want create only exit */
-		if (flags & XATTR_CREATE) {
-			ret = -EEXIST;
+	if (flags & XATTR_REPLACE) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
+					name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			ret = -ENODATA;
 			goto out;
 		}
-
 		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 		btrfs_release_path(path);
+	}
 
-		/* if we don't have a value then we are removing the xattr */
-		if (!value)
+again:
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EEXIST) {
+		if (flags & XATTR_CREATE)
 			goto out;
-	} else {
+		/*
+		 * We can't use the path we already have since we won't have the
+		 * proper locking for a delete, so release the path and
+		 * re-lookup to delete the thing.
+		 */
 		btrfs_release_path(path);
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			/* Shouldn't happen but just in case... */
+			btrfs_release_path(path);
+			goto again;
+		}
 
-		if (flags & XATTR_REPLACE) {
-			/* we couldn't find the attr to replace */
-			ret = -ENODATA;
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
 			goto out;
+
+		/*
+		 * We have a value to set, so go back and try to insert it now.
+		 */
+		if (value) {
+			btrfs_release_path(path);
+			goto again;
 		}
 	}
-
-	/* ok we have to create a completely new xattr */
-	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-				      name, name_len, value, size);
-	BUG_ON(ret);
 out:
 	btrfs_free_path(path);
 	return ret;
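
The xattr.c hunk restructures do_setxattr() from lookup-then-insert to
insert-first: btrfs_insert_xattr_item() is attempted straight away, and
-EEXIST is handled by re-looking up the old item under proper locking,
deleting it, and retrying, with real error returns replacing the old
BUG_ON() calls.  A user-space analogue of that control flow, where
kv_insert() and kv_delete() are hypothetical stand-ins for
btrfs_insert_xattr_item() and btrfs_delete_one_dir_name():

	/* Sketch only; kv_insert() fails with -EEXIST on duplicate names. */
	static int set_key(struct kv_store *store, const char *name,
			   const void *value, size_t size, int flags)
	{
		int ret;

		if (flags & XATTR_REPLACE) {
			ret = kv_delete(store, name); /* -ENODATA if absent */
			if (ret)
				return ret;
		}
	again:
		ret = kv_insert(store, name, value, size);
		if (ret == -EEXIST) {
			if (flags & XATTR_CREATE)
				return -EEXIST;	      /* create-only caller */
			ret = kv_delete(store, name); /* drop the old value */
			if (ret)
				return ret;
			goto again;		      /* retry the insert */
		}
		return ret;
	}
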