summaryrefslogtreecommitdiff
path: root/fs/btrfs/ordered-data.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-06-01 08:37:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-06-01 08:37:31 -0700
commit51eab603f5c86dd1eae4c525df3e7f7eeab401d6 (patch)
treee7a8c6214b072db126cca62d39008b3620134798 /fs/btrfs/ordered-data.c
parent419f4319495043a9507ac3e616be9ca60af09744 (diff)
parent1e20932a23578bb1ec59107843574e259b96193f (diff)
downloadolio-linux-3.10-51eab603f5c86dd1eae4c525df3e7f7eeab401d6.tar.xz
olio-linux-3.10-51eab603f5c86dd1eae4c525df3e7f7eeab401d6.zip
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason: "This includes a fairly large change from Josef around data writeback completion. Before, the writeback wasn't completed until the metadata insertions for the extent were done, and this made for fairly large latency spikes on the last page of each ordered extent. We already had a separate mechanism for tracking pending metadata insertions, so Josef just needed to tweak things a little to end writeback earlier on the page. Overall it makes us much friendly to memory reclaim and lowers latencies quite a lot for synchronous IO. Jan Schmidt has finished some background work required to track btree blocks as they go through changes in ownership. It's the missing piece he needed for both btrfs send/receive and subvolume quotas. Neither of those are ready yet, but the new tracking code is included here. Most of the time, the new code is off. It is only used by scrub and other backref walkers. Stefan Behrens has added io failure tracking. This includes counters for which drives are causing the most trouble so the admin (or an automated tool) can choose to kick them out. We're tracking IO errors, crc errors, and generation checks we do on each metadata block. RAID5/6 did miss the cut this time because I'm having trouble with corruptions. I'll nail it down next week and post as a beta testing before 3.6" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (58 commits) Btrfs: fix tree mod log rewinded level and rewinding of moved keys Btrfs: fix tree mod log del_ptr Btrfs: add tree_mod_dont_log helper Btrfs: add missing spin_lock for insertion into tree mod log Btrfs: add inodes before dropping the extent lock in find_all_leafs Btrfs: use delayed ref sequence numbers for all fs-tree updates Btrfs: fix false positive in check-integrity on unmount Btrfs: fix runtime warning in check-integrity check data mode Btrfs: set ioprio of scrub readahead to idle Btrfs: fix return code in drop_objectid_items Btrfs: check to see if the inode is in the log before fsyncing Btrfs: return value of btrfs_read_buffer is checked correctly Btrfs: read device stats on mount, write modified ones during commit Btrfs: add ioctl to get and reset the device stats Btrfs: add device counters for detected IO and checksum errors btrfs: Drop unused function btrfs_abort_devices() Btrfs: fix the same inode id problem when doing auto defragment Btrfs: fall back to non-inline if we don't have enough space Btrfs: fix how we deal with the orphan block rsv Btrfs: convert the inode bit field to use the actual bit operations ...
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r--fs/btrfs/ordered-data.c165
1 files changed, 77 insertions, 88 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aeb..9e138cdc36c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
- entry->inode = inode;
+ entry->inode = igrab(inode);
entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
trace_btrfs_ordered_extent_add(inode, entry);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
ordered_data_tree_panic(inode, -EEXIST, file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
}
/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size)
+ u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
+ unsigned long flags;
u64 dec_end;
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
(unsigned long long)to_dec);
}
entry->bytes_left -= to_dec;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -332,7 +336,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -347,15 +351,21 @@ out:
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size)
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
(unsigned long long)io_size);
}
entry->bytes_left -= io_size;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -383,7 +397,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
*/
-static void __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree. No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
-{
- struct btrfs_ordered_inode_tree *tree;
-
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
- __btrfs_remove_ordered_extent(inode, entry);
- spin_unlock(&tree->lock);
wake_up(&entry->wait);
}
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
-again:
+
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- /* The compression code will leave pages locked but return from
- * writepage without setting the page writeback. Starting again
- * with WB_SYNC_ALL will end up waiting for the IO to actually start.
- */
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
end = orig_end;
found = 0;
@@ -657,11 +651,6 @@ again:
break;
end--;
}
- if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_DELALLOC, 0, NULL)) {
- schedule_timeout(1);
- goto again;
- }
}
/*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (entry)
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
out:
if (entry)
atomic_inc(&entry->refs);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 disk_i_size;
u64 new_i_size;
u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
/*
- * we can't update the disk_isize if there are delalloc bytes
- * between disk_i_size and this ordered extent
- */
- if (test_range_bit(io_tree, disk_i_size, offset - 1,
- EXTENT_DELALLOC, 0, NULL)) {
- goto out;
- }
- /*
* walk backward from this ordered extent to disk_i_size.
* if we find an ordered extent then we can't update disk i_size
* yet
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
node = prev;
}
- while (node) {
+ for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesnt exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
if (test->file_offset + test->len <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
if (test->file_offset >= disk_i_size)
goto out;
- node = rb_prev(node);
}
new_i_size = min_t(u64, offset, i_size);
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
node = rb_first(&tree->tree);
}
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
+
+ /*
+ * We are looking for an area between our current extent and the next
+ * ordered extent to update the i_size to. There are 3 cases here
+ *
+ * 1) We don't actually have anything and we can update to i_size.
+ * 2) We have stuff but they already did their i_size update so again we
+ * can just update to i_size.
+ * 3) We have an outstanding ordered extent so the most we can update
+ * our disk_i_size to is the start of the next offset.
+ */
+ i_size_test = i_size;
+ for (; node; node = rb_next(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > offset)
+
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
+ if (test->file_offset > offset) {
i_size_test = test->file_offset;
- } else {
- i_size_test = i_size;
+ break;
+ }
}
/*
* i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents. As long as there
- * are no delalloc bytes in this area, it is safe to update
- * disk_i_size to the end of the region.
+ * extent where there are no ordered extents, we can safely set
+ * disk_i_size to this.
*/
- if (i_size_test > offset &&
- !test_range_bit(io_tree, offset, i_size_test - 1,
- EXTENT_DELALLOC, 0, NULL)) {
+ if (i_size_test > offset)
new_i_size = min_t(u64, i_size_test, i_size);
- }
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
/*
- * we need to remove the ordered extent with the tree lock held
- * so that other people calling this function don't find our fully
- * processed ordered entry and skip updating the i_size
+ * We need to do this because we can't remove ordered extents until
+ * after the i_disk_size has been updated and then the inode has been
+ * updated to reflect the change, so we need to tell anybody who finds
+ * this ordered extent that we've already done all the real work, we
+ * just haven't completed all the other work.
*/
if (ordered)
- __btrfs_remove_ordered_extent(inode, ordered);
- spin_unlock(&tree->lock);
- if (ordered)
- wake_up(&ordered->wait);
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
return ret;
}
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
if (!ordered)
return 1;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
}
}
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
return ret;
}