From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption or the
'other' field are currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 454 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 454 insertions(+)
 create mode 100644 fs/btrfs/compression.c

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	struct bio *bio;
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_size = 0;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_byte >> 9;
+	}
+	return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, lets start
+	 * the decompression.
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					cb->start,
+					cb->orig_bio->bi_io_vec,
+					cb->orig_bio->bi_vcnt,
+					cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL, 1);
+
+	end_compressed_writeback(inode, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while(bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+				 PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	BUG_ON(ret);
+
+	bio_put(comp_bio);
+	return 0;
+}
-- 
cgit v1.2.3-70-g09d2


From cfbc246eaae2a1089911016094b74b3055e8a906 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Oct 2008 13:22:14 -0400
Subject: Btrfs: walk compressed pages based on the nr_pages count instead of
 bytes

The byte walk counting was awkward and error prone.  This uses the
number of pages sent the higher layer to build bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 7 +++++--
 fs/btrfs/inode.c       | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c5470367ca5..9adaa79adad 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -296,7 +296,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	/* create and submit bios for the compressed pages */
 	bytes_left = compressed_len;
-	while(bytes_left > 0) {
+	for (page_index = 0; page_index < cb->nr_pages; page_index++) {
 		page = compressed_pages[page_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
@@ -324,7 +324,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
 		}
-		page_index++;
+		if (bytes_left < PAGE_CACHE_SIZE) {
+			printk("bytes left %lu compress len %lu nr %lu\n",
+			       bytes_left, cb->compressed_len, cb->nr_pages);
+		}
 		bytes_left -= PAGE_CACHE_SIZE;
 		first_byte += PAGE_CACHE_SIZE;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9797592dc86..6739424c0fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -338,7 +338,7 @@ again:
 	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
-		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
 		/* we want to make sure the amount of IO required to satisfy
 		 * a random read is reasonably small, so we limit the size
-- 
cgit v1.2.3-70-g09d2


From 70b99e6959a4c28ae1b314985eca731f3db72f1d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 31 Oct 2008 12:46:39 -0400
Subject: Btrfs: Compression corner fixes

Make sure we keep page->mapping NULL on the pages we're getting
via alloc_page.  It gets set so a few of the callbacks can do the right
thing, but in general these pages don't have a mapping.

Don't try to truncate compressed inline items in btrfs_drop_extents.
The whole compressed item must be preserved.

Don't try to create multipage inline compressed items.  When we try to
overwrite just the first page of the file, we would have to read in and recow
all the pages after it in the same compressed inline items.  For now, only
create single page inline items.

Make sure we lock pages in the correct order during delalloc.  The
search into the state tree for delalloc bytes can return bytes before
the page we already have locked.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c |  4 ++++
 fs/btrfs/extent_io.c   | 11 ++++++++++-
 fs/btrfs/file.c        | 21 +++++++++++++++------
 fs/btrfs/inode.c       |  8 ++++++--
 4 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9adaa79adad..354913177ba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -220,10 +220,12 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 	 */
 	inode = cb->inode;
 	tree = &BTRFS_I(inode)->io_tree;
+	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
 	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
 					 cb->start,
 					 cb->start + cb->len - 1,
 					 NULL, 1);
+	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb->start, cb->len);
 	/* note, our inode could be gone now */
@@ -306,6 +308,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		else
 			ret = 0;
 
+		page->mapping = NULL;
 		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
 		    PAGE_CACHE_SIZE) {
 			bio_get(bio);
@@ -423,6 +426,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		else
 			ret = 0;
 
+		page->mapping = NULL;
 		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb3c12e7bea..9b37ce6e516 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1242,12 +1242,21 @@ again:
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
 				    max_bytes);
-	if (!found) {
+	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
 		return found;
 	}
 
+	/*
+	 * start comes from the offset of locked_page.  We have to lock
+	 * pages in order, so we can't process delalloc bytes before
+	 * locked_page
+	 */
+	if (delalloc_start < *start) {
+		delalloc_start = *start;
+	}
+
 	/*
 	 * make sure to limit the number of pages we try to lock down
 	 * if we're looping.
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 238a8e215eb..0c8cc35a8b9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,8 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
-	u8 compression = 0;
-	u8 encryption = 0;
+	u8 compression;
+	u8 encryption;
 	u16 other_encoding = 0;
 	u64 root_gen;
 	u64 root_owner;
@@ -415,6 +415,8 @@ next_slot:
 		leaf_start = 0;
 		root_gen = 0;
 		root_owner = 0;
+		compression = 0;
+		encryption = 0;
 		extent = NULL;
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -546,8 +548,12 @@ next_slot:
 						   inline_limit - key.offset);
 				inode_sub_bytes(inode, extent_end -
 						inline_limit);
-				btrfs_truncate_item(trans, root, path,
-						    new_size, 1);
+				btrfs_set_file_extent_ram_bytes(leaf, extent,
+							new_size);
+				if (!compression && !encryption) {
+					btrfs_truncate_item(trans, root, path,
+							    new_size, 1);
+				}
 			}
 		}
 		/* delete the entire extent */
@@ -567,8 +573,11 @@ next_slot:
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
 			inode_sub_bytes(inode, end - key.offset);
-			ret = btrfs_truncate_item(trans, root, path,
-						  new_size, 0);
+			btrfs_set_file_extent_ram_bytes(leaf, extent,
+							new_size);
+			if (!compression && !encryption)
+				ret = btrfs_truncate_item(trans, root, path,
+							  new_size, 0);
 			BUG_ON(ret);
 		}
 		/* create bookend, splitting the extent in two */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 789c376157f..806708dd7e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -239,6 +239,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		data_len = compressed_size;
 
 	if (start > 0 ||
+	    actual_end >= PAGE_CACHE_SIZE ||
 	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 	    (!compressed_size &&
 	    (actual_end & (root->sectorsize - 1)) == 0) ||
@@ -248,7 +249,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, &hint_byte);
+				 aligned_end, start, &hint_byte);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -423,6 +424,7 @@ again:
 		 * free any pages it allocated and our page pointer array
 		 */
 		for (i = 0; i < nr_pages_ret; i++) {
+			WARN_ON(pages[i]->mapping);
 			page_cache_release(pages[i]);
 		}
 		kfree(pages);
@@ -572,8 +574,10 @@ free_pages_out_fail:
 	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				     start, end, locked_page, 0, 0, 0);
 free_pages_out:
-	for (i = 0; i < nr_pages_ret; i++)
+	for (i = 0; i < nr_pages_ret; i++) {
+		WARN_ON(pages[i]->mapping);
 		page_cache_release(pages[i]);
+	}
 	if (pages)
 		kfree(pages);
 
-- 
cgit v1.2.3-70-g09d2


From 771ed689d2cd53439e28e095bc38fbe40a71429e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 6 Nov 2008 22:02:51 -0500
Subject: Btrfs: Optimize compressed writeback and reads

When reading compressed extents, try to put pages into the page cache
for any pages covered by the compressed extent that readpages didn't already
preload.

Add an async work queue to handle transformations at delayed allocation processing
time.  Right now this is just compression.  The workflow is:

1) Find offsets in the file marked for delayed allocation
2) Lock the pages
3) Lock the state bits
4) Call the async delalloc code

The async delalloc code clears the state lock bits and delalloc bits.  It is
important this happens before the range goes into the work queue because
otherwise it might deadlock with other work queue items that try to lock
those extent bits.

The file pages are compressed, and if the compression doesn't work the
pages are written back directly.

An ordered work queue is used to make sure the inodes are written in the same
order that pdflush or writepages sent them down.

This changes extent_write_cache_pages to let the writepage function
update the wbc nr_written count.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  | 150 ++++++++++-
 fs/btrfs/ctree.h        |   4 +-
 fs/btrfs/disk-io.c      |  27 +-
 fs/btrfs/extent-tree.c  |   6 +-
 fs/btrfs/extent_io.c    | 140 +++++++++--
 fs/btrfs/extent_io.h    |  13 +-
 fs/btrfs/file.c         |  53 ++--
 fs/btrfs/inode.c        | 643 +++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ordered-data.c |  13 +-
 fs/btrfs/super.c        |   4 +
 fs/btrfs/zlib.c         |   3 +-
 11 files changed, 849 insertions(+), 207 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 354913177ba..284f21025bc 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,7 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
+#include <linux/pagevec.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -145,9 +146,9 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	}
 
 	/* do io completion on the original bio */
-	if (cb->errors)
+	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
-	else
+	} else
 		bio_endio(cb->orig_bio, 0);
 
 	/* finally free the cb struct */
@@ -333,6 +334,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		}
 		bytes_left -= PAGE_CACHE_SIZE;
 		first_byte += PAGE_CACHE_SIZE;
+		cond_resched();
 	}
 	bio_get(bio);
 
@@ -346,6 +348,130 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	return 0;
 }
 
+static noinline int add_ra_bio_pages(struct inode *inode,
+				     u64 compressed_end,
+				     struct compressed_bio *cb)
+{
+	unsigned long end_index;
+	unsigned long page_index;
+	u64 last_offset;
+	u64 isize = i_size_read(inode);
+	int ret;
+	struct page *page;
+	unsigned long nr_pages = 0;
+	struct extent_map *em;
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	struct extent_map_tree *em_tree;
+	struct extent_io_tree *tree;
+	u64 end;
+	int misses = 0;
+
+	page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+	last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	tree = &BTRFS_I(inode)->io_tree;
+
+	if (isize == 0)
+		return 0;
+
+	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+	while(last_offset < compressed_end) {
+		page_index = last_offset >> PAGE_CACHE_SHIFT;
+
+		if (page_index > end_index)
+			break;
+
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, page_index);
+		rcu_read_unlock();
+		if (page) {
+			misses++;
+			if (misses > 4)
+				break;
+			goto next;
+		}
+
+		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		if (!page)
+			break;
+
+		page->index = page_index;
+		/*
+		 * what we want to do here is call add_to_page_cache_lru,
+		 * but that isn't exported, so we reproduce it here
+		 */
+		if (add_to_page_cache(page, mapping,
+				      page->index, GFP_NOFS)) {
+			page_cache_release(page);
+			goto next;
+		}
+
+		/* open coding of lru_cache_add, also not exported */
+		page_cache_get(page);
+		if (!pagevec_add(&pvec, page))
+			__pagevec_lru_add(&pvec);
+
+		end = last_offset + PAGE_CACHE_SIZE - 1;
+		/*
+		 * at this point, we have a locked page in the page cache
+		 * for these bytes in the file.  But, we have to make
+		 * sure they map to this compressed extent on disk.
+		 */
+		set_page_extent_mapped(page);
+		lock_extent(tree, last_offset, end, GFP_NOFS);
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, last_offset,
+					   PAGE_CACHE_SIZE);
+		spin_unlock(&em_tree->lock);
+
+		if (!em || last_offset < em->start ||
+		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+		    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+			free_extent_map(em);
+			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+		free_extent_map(em);
+
+		if (page->index == end_index) {
+			char *userpage;
+			size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+			if (zero_offset) {
+				int zeros;
+				zeros = PAGE_CACHE_SIZE - zero_offset;
+				userpage = kmap_atomic(page, KM_USER0);
+				memset(userpage + zero_offset, 0, zeros);
+				flush_dcache_page(page);
+				kunmap_atomic(userpage, KM_USER0);
+			}
+		}
+
+		ret = bio_add_page(cb->orig_bio, page,
+				   PAGE_CACHE_SIZE, 0);
+
+		if (ret == PAGE_CACHE_SIZE) {
+			nr_pages++;
+			page_cache_release(page);
+		} else {
+			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+next:
+		last_offset += PAGE_CACHE_SIZE;
+	}
+	if (pagevec_count(&pvec))
+		__pagevec_lru_add(&pvec);
+	return 0;
+}
+
 /*
  * for a compressed read, the bio we get passed has all the inode pages
  * in it.  We don't actually do IO on those pages but allocate new ones
@@ -373,6 +499,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	u64 em_len;
 	struct extent_map *em;
 	int ret;
 
@@ -393,6 +520,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->start = em->start;
 	compressed_len = em->block_len;
+	em_len = em->len;
 	free_extent_map(em);
 
 	cb->len = uncompressed_len;
@@ -411,6 +539,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
+	add_ra_bio_pages(inode, cb->start + em_len, cb);
+
+	if (!btrfs_test_opt(root, NODATASUM) &&
+	    !btrfs_test_flag(inode, NODATASUM)) {
+		btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
+	}
+
+	/* include any pages we added in add_ra-bio_pages */
+	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	cb->len = uncompressed_len;
+
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
@@ -442,9 +581,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
 			atomic_inc(&cb->pending_bios);
-			bio->bi_private = cb;
-			bio->bi_end_io = end_compressed_bio_write;
-			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
 		}
 		cur_disk_byte += PAGE_CACHE_SIZE;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 689df070c8e..c83cc5b2ded 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -625,8 +625,8 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
-	wait_queue_head_t async_submit_wait;
 
+	wait_queue_head_t async_submit_wait;
 	wait_queue_head_t tree_log_wait;
 
 	struct btrfs_super_block super_copy;
@@ -653,6 +653,7 @@ struct btrfs_fs_info {
 	atomic_t nr_async_submits;
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
+	atomic_t async_delalloc_pages;
 	atomic_t tree_log_writers;
 	atomic_t tree_log_commit;
 	unsigned long tree_log_batch;
@@ -677,6 +678,7 @@ struct btrfs_fs_info {
 	 * two
 	 */
 	struct btrfs_workers workers;
+	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0a28f705a6..8efc123d222 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -539,6 +539,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   (atomic_read(&fs_info->nr_async_bios) < limit),
 			   HZ/10);
 	}
+
+	while(atomic_read(&fs_info->async_submit_draining) &&
+	      atomic_read(&fs_info->nr_async_submits)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0));
+	}
+
 	return 0;
 }
 
@@ -1437,6 +1444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
@@ -1550,6 +1558,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
 
+	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1560,15 +1571,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	fs_info->submit_workers.idle_thresh = 64;
 
-	/* fs_info->workers is responsible for checksumming file data
-	 * blocks and metadata.  Using a larger idle thresh allows each
-	 * worker thread to operate on things in roughly the order they
-	 * were sent by the writeback daemons, improving overall locality
-	 * of the IO going down the pipe.
-	 */
-	fs_info->workers.idle_thresh = 8;
+	fs_info->workers.idle_thresh = 16;
 	fs_info->workers.ordered = 1;
 
+	fs_info->delalloc_workers.idle_thresh = 2;
+	fs_info->delalloc_workers.ordered = 1;
+
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
 			   fs_info->thread_pool_size);
@@ -1584,6 +1592,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
+	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
@@ -1732,6 +1741,7 @@ fail_tree_root:
 fail_sys_array:
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1988,6 +1998,7 @@ int close_ctree(struct btrfs_root *root)
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
 	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -2062,7 +2073,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 96 * 1024 * 1024;
+	unsigned long thresh = 32 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8af39521eb7..ebd8275a193 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -768,7 +768,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	l = path->nodes[0];
 
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	BUG_ON(key.objectid != bytenr);
+	if (key.objectid != bytenr) {
+		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
+		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
+		BUG();
+	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9b37ce6e516..bbe3bcfcf4a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -47,6 +47,11 @@ struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+
+	/* tells writepage not to lock the state bits for this range
+	 * it still does the unlocking
+	 */
+	int extent_locked;
 };
 
 int __init extent_io_init(void)
@@ -1198,11 +1203,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 			 * the caller is taking responsibility for
 			 * locked_page
 			 */
-			if (pages[i] != locked_page)
+			if (pages[i] != locked_page) {
 				lock_page(pages[i]);
+				if (pages[i]->mapping != inode->i_mapping) {
+					ret = -EAGAIN;
+					unlock_page(pages[i]);
+					page_cache_release(pages[i]);
+					goto done;
+				}
+			}
 			page_cache_release(pages[i]);
+			pages_locked++;
 		}
-		pages_locked += ret;
 		nrpages -= ret;
 		index += ret;
 		cond_resched();
@@ -1262,8 +1274,7 @@ again:
 	 * if we're looping.
 	 */
 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
-		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
-			~((u64)PAGE_CACHE_SIZE - 1);
+		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
 	}
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
@@ -1306,7 +1317,10 @@ out_failed:
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int clear_dirty, int set_writeback,
+				int unlock_pages,
+				int clear_unlock,
+				int clear_delalloc, int clear_dirty,
+				int set_writeback,
 				int end_writeback)
 {
 	int ret;
@@ -1315,12 +1329,19 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
-	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+	int clear_bits = 0;
 
+	if (clear_unlock)
+		clear_bits |= EXTENT_LOCKED;
 	if (clear_dirty)
 		clear_bits |= EXTENT_DIRTY;
 
+	if (clear_delalloc)
+		clear_bits |= EXTENT_DELALLOC;
+
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+		return 0;
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1336,7 +1357,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				set_page_writeback(pages[i]);
 			if (end_writeback)
 				end_page_writeback(pages[i]);
-			unlock_page(pages[i]);
+			if (unlock_pages)
+				unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
 		nr_pages -= ret;
@@ -1741,9 +1763,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 		}
 
-		if (uptodate)
+		if (uptodate) {
 			set_extent_uptodate(tree, start, end,
 					    GFP_ATOMIC);
+		}
 		unlock_extent(tree, start, end, GFP_ATOMIC);
 
 		if (whole_page) {
@@ -1925,6 +1948,7 @@ void set_page_extent_mapped(struct page *page)
 		set_page_private(page, EXTENT_PAGE_PRIVATE);
 	}
 }
+EXPORT_SYMBOL(set_page_extent_mapped);
 
 void set_page_extent_head(struct page *page, unsigned long len)
 {
@@ -2143,12 +2167,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	unsigned long nr_written = 0;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		if (epd->extent_locked) {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		}
 		unlock_page(page);
 		return 0;
 	}
@@ -2169,27 +2198,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_start = start;
 	delalloc_end = 0;
 	page_started = 0;
-	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(inode, tree,
+	if (!epd->extent_locked) {
+		while(delalloc_end < page_end) {
+			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
 						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
-		if (nr_delalloc == 0) {
+			if (nr_delalloc == 0) {
+				delalloc_start = delalloc_end + 1;
+				continue;
+			}
+			tree->ops->fill_delalloc(inode, page, delalloc_start,
+						 delalloc_end, &page_started,
+						 &nr_written);
 			delalloc_start = delalloc_end + 1;
-			continue;
 		}
-		tree->ops->fill_delalloc(inode, page, delalloc_start,
-					 delalloc_end, &page_started);
-		delalloc_start = delalloc_end + 1;
-	}
 
-	/* did the fill delalloc function already unlock and start the IO? */
-	if (page_started) {
-		return 0;
+		/* did the fill delalloc function already unlock and start
+		 * the IO?
+		 */
+		if (page_started) {
+			ret = 0;
+			goto update_nr_written;
+		}
 	}
-
 	lock_extent(tree, start, page_end, GFP_NOFS);
+
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
@@ -2199,10 +2234,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
-			return 0;
+			ret = 0;
+			goto update_nr_written;
 		}
 	}
 
+	nr_written++;
+
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
 		printk("found delalloc bits after lock_extent\n");
@@ -2333,6 +2371,12 @@ done:
 	if (unlock_start <= page_end)
 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
+
+update_nr_written:
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
 	return 0;
 }
 
@@ -2431,7 +2475,7 @@ retry:
 				unlock_page(page);
 				ret = 0;
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
+			if (ret || wbc->nr_to_write <= 0)
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
@@ -2452,6 +2496,8 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
@@ -2469,6 +2515,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.bio = NULL,
 		.tree = tree,
 		.get_extent = get_extent,
+		.extent_locked = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= wbc->bdi,
@@ -2491,6 +2538,52 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 }
 EXPORT_SYMBOL(extent_write_full_page);
 
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode)
+{
+	int ret = 0;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+		.extent_locked = 1,
+	};
+	struct writeback_control wbc_writepages = {
+		.bdi		= inode->i_mapping->backing_dev_info,
+		.sync_mode	= mode,
+		.older_than_this = NULL,
+		.nr_to_write	= nr_pages * 2,
+		.range_start	= start,
+		.range_end	= end + 1,
+	};
+
+	while(start <= end) {
+		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		if (clear_page_dirty_for_io(page))
+			ret = __extent_writepage(page, &wbc_writepages, &epd);
+		else {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, start,
+						 start + PAGE_CACHE_SIZE - 1,
+						 NULL, 1);
+			unlock_page(page);
+		}
+		page_cache_release(page);
+		start += PAGE_CACHE_SIZE;
+	}
+
+	if (epd.bio)
+		submit_one_bio(WRITE, epd.bio, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL(extent_write_locked_range);
+
 
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
@@ -2502,6 +2595,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.bio = NULL,
 		.tree = tree,
 		.get_extent = get_extent,
+		.extent_locked = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 283110ec4ee..2d5f67065b6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,7 +35,8 @@ typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       unsigned long bio_flags);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
-			     u64 start, u64 end, int *page_started);
+			     u64 start, u64 end, int *page_started,
+			     unsigned long *nr_written);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
@@ -172,6 +173,9 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode);
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
@@ -256,6 +260,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int clear_dirty, int set_writeback,
-				int clear_writeback);
+				int unlock_page,
+				int clear_unlock,
+				int clear_delalloc, int clear_dirty,
+				int set_writeback,
+				int end_writeback);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0c8cc35a8b9..337221ecca2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,6 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
+	u64 orig_parent = 0;
+	u64 disk_bytenr = 0;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
@@ -500,17 +502,31 @@ next_slot:
 				keep = 1;
 		}
 
-		if (bookend && found_extent && locked_end < extent_end) {
-			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
-			if (!ret) {
-				btrfs_release_path(root, path);
-				lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
+		if (bookend && found_extent) {
+			if (locked_end < extent_end) {
+				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+						locked_end, extent_end - 1,
+						GFP_NOFS);
+				if (!ret) {
+					btrfs_release_path(root, path);
+					lock_extent(&BTRFS_I(inode)->io_tree,
+						locked_end, extent_end - 1,
+						GFP_NOFS);
+					locked_end = extent_end;
+					continue;
+				}
 				locked_end = extent_end;
-				continue;
 			}
-			locked_end = extent_end;
+			orig_parent = path->nodes[0]->start;
+			disk_bytenr = le64_to_cpu(old.disk_bytenr);
+			if (disk_bytenr != 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+					   disk_bytenr,
+					   le64_to_cpu(old.disk_num_bytes),
+					   orig_parent, root->root_key.objectid,
+					   trans->transid, inode->i_ino);
+				BUG_ON(ret);
+			}
 		}
 
 		if (found_inline) {
@@ -537,8 +553,12 @@ next_slot:
 					inode_sub_bytes(inode, old_num -
 							new_num);
 				}
-				btrfs_set_file_extent_num_bytes(leaf, extent,
-								new_num);
+				if (!compression && !encryption) {
+					btrfs_set_file_extent_ram_bytes(leaf,
+							extent, new_num);
+				}
+				btrfs_set_file_extent_num_bytes(leaf,
+							extent, new_num);
 				btrfs_mark_buffer_dirty(leaf);
 			} else if (key.offset < inline_limit &&
 				   (end > extent_end) &&
@@ -582,11 +602,11 @@ next_slot:
 		}
 		/* create bookend, splitting the extent in two */
 		if (bookend && found_extent) {
-			u64 disk_bytenr;
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
@@ -623,14 +643,13 @@ next_slot:
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 
-			disk_bytenr = le64_to_cpu(old.disk_bytenr);
 			if (disk_bytenr != 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						disk_bytenr,
-						le64_to_cpu(old.disk_num_bytes),
-						leaf->start,
+				ret = btrfs_update_extent_ref(trans, root,
+						disk_bytenr, orig_parent,
+					        leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
+
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df0ffad976..e01c0d0310a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,6 +86,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static void btrfs_truncate(struct inode *inode);
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written, int unlock);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -262,35 +266,72 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+struct async_extent {
+	u64 start;
+	u64 ram_size;
+	u64 compressed_size;
+	struct page **pages;
+	unsigned long nr_pages;
+	struct list_head list;
+};
+
+struct async_cow {
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct page *locked_page;
+	u64 start;
+	u64 end;
+	struct list_head extents;
+	struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+				     u64 start, u64 ram_size,
+				     u64 compressed_size,
+				     struct page **pages,
+				     unsigned long nr_pages)
+{
+	struct async_extent *async_extent;
+
+	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+	async_extent->start = start;
+	async_extent->ram_size = ram_size;
+	async_extent->compressed_size = compressed_size;
+	async_extent->pages = pages;
+	async_extent->nr_pages = nr_pages;
+	list_add_tail(&async_extent->list, &cow->extents);
+	return 0;
+}
+
 /*
- * when extent_io.c finds a delayed allocation range in the file,
- * the call backs end up in this code.  The basic idea is to
- * allocate extents on disk for the range, and create ordered data structs
- * in ram to track those extents.
+ * we create compressed extents in two phases.  The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
  *
- * locked_page is the page that writepage had locked already.  We use
- * it to make sure we don't do extra locks or unlocks.
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus.  The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
  *
- * *page_started is set to one if we unlock locked_page and do everything
- * required to start IO on it.  It may be clean and already done with
- * IO when we return.
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes.  This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that pdflush sent them down.
  */
-static int cow_file_range(struct inode *inode, struct page *locked_page,
-			  u64 start, u64 end, int *page_started)
+static noinline int compress_file_range(struct inode *inode,
+					struct page *locked_page,
+					u64 start, u64 end,
+					struct async_cow *async_cow,
+					int *num_added)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	u64 alloc_hint = 0;
 	u64 num_bytes;
-	unsigned long ram_size;
 	u64 orig_start;
 	u64 disk_num_bytes;
-	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
-	struct btrfs_key ins;
-	struct extent_map *em;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 	struct page **pages = NULL;
 	unsigned long nr_pages;
@@ -298,22 +339,12 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
 	unsigned long total_compressed = 0;
 	unsigned long total_in = 0;
 	unsigned long max_compressed = 128 * 1024;
-	unsigned long max_uncompressed = 256 * 1024;
+	unsigned long max_uncompressed = 128 * 1024;
 	int i;
-	int ordered_type;
 	int will_compress;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
-	btrfs_set_trans_block_group(trans, inode);
 	orig_start = start;
 
-	/*
-	 * compression made this loop a bit ugly, but the basic idea is to
-	 * compress some pages but keep the total size of the compressed
-	 * extent relatively small.  If compression is off, this goto target
-	 * is never used.
-	 */
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
@@ -324,7 +355,13 @@ again:
 
 	/* we want to make sure that amount of ram required to uncompress
 	 * an extent is reasonable, so we limit the total size in ram
-	 * of a compressed extent to 256k
+	 * of a compressed extent to 128k.  This is a crucial number
+	 * because it also controls how easily we can spread reads across
+	 * cpus for decompression.
+	 *
+	 * We also want to make sure the amount of IO required to do
+	 * a random read is reasonably small, so we limit the size of
+	 * a compressed extent to 128k.
 	 */
 	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -333,18 +370,16 @@ again:
 	total_in = 0;
 	ret = 0;
 
-	/* we do compression for mount -o compress and when the
-	 * inode has not been flagged as nocompress
+	/*
+	 * we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress.  This flag can
+	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
-		/* we want to make sure the amount of IO required to satisfy
-		 * a random read is reasonably small, so we limit the size
-		 * of a compressed extent to 128k
-		 */
 		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
 						total_compressed, pages,
 						nr_pages, &nr_pages_ret,
@@ -371,26 +406,34 @@ again:
 		}
 	}
 	if (start == 0) {
+		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(!trans);
+		btrfs_set_trans_block_group(trans, inode);
+
 		/* lets try to make an inline extent */
-		if (ret || total_in < (end - start + 1)) {
+		if (ret || total_in < (actual_end - start)) {
 			/* we didn't compress the entire range, try
-			 * to make an uncompressed inline extent.  This
-			 * is almost sure to fail, but maybe inline sizes
-			 * will get bigger later
+			 * to make an uncompressed inline extent.
 			 */
 			ret = cow_file_range_inline(trans, root, inode,
 						    start, end, 0, NULL);
 		} else {
+			/* try making a compressed inline extent */
 			ret = cow_file_range_inline(trans, root, inode,
 						    start, end,
 						    total_compressed, pages);
 		}
+		btrfs_end_transaction(trans, root);
 		if (ret == 0) {
+			/*
+			 * inline extent creation worked, we don't need
+			 * to create any more async work items.  Unlock
+			 * and free up our temp pages.
+			 */
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
-						     start, end, NULL,
-						     1, 1, 1);
-			*page_started = 1;
+						     start, end, NULL, 1, 0,
+						     0, 1, 1, 1);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -435,53 +478,280 @@ again:
 		/* flag the file so we don't compress in the future */
 		btrfs_set_flag(inode, NOCOMPRESS);
 	}
+	if (will_compress) {
+		*num_added += 1;
 
-	BUG_ON(disk_num_bytes >
-	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+		/* the async work queues will take care of doing actual
+		 * allocation on disk for these compressed pages,
+		 * and will submit them to the elevator.
+		 */
+		add_async_extent(async_cow, start, num_bytes,
+				 total_compressed, pages, nr_pages_ret);
 
-	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+		if (start + num_bytes < end) {
+			start += num_bytes;
+			pages = NULL;
+			cond_resched();
+			goto again;
+		}
+	} else {
+		/*
+		 * No compression, but we still need to write the pages in
+		 * the file we've been given so far.  redirty the locked
+		 * page if it corresponds to our extent and set things up
+		 * for the async work queue to run cow_file_range to do
+		 * the normal delalloc dance
+		 */
+		if (page_offset(locked_page) >= start &&
+		    page_offset(locked_page) <= end) {
+			__set_page_dirty_nobuffers(locked_page);
+			/* unlocked later on in the async handlers */
+		}
+		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+		*num_added += 1;
+	}
 
-	while(disk_num_bytes > 0) {
-		unsigned long min_bytes;
+out:
+	return 0;
+
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++) {
+		WARN_ON(pages[i]->mapping);
+		page_cache_release(pages[i]);
+	}
+	if (pages)
+		kfree(pages);
+
+	goto out;
+}
+
+/*
+ * phase two of compressed writeback.  This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued.  We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline int submit_compressed_extents(struct inode *inode,
+					      struct async_cow *async_cow)
+{
+	struct async_extent *async_extent;
+	u64 alloc_hint = 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree;
+	int ret;
+
+	if (list_empty(&async_cow->extents))
+		return 0;
+
+	trans = btrfs_join_transaction(root, 1);
+
+	while(!list_empty(&async_cow->extents)) {
+		async_extent = list_entry(async_cow->extents.next,
+					  struct async_extent, list);
+		list_del(&async_extent->list);
 
+		io_tree = &BTRFS_I(inode)->io_tree;
+
+		/* did the compression code fall back to uncompressed IO? */
+		if (!async_extent->pages) {
+			int page_started = 0;
+			unsigned long nr_written = 0;
+
+			lock_extent(io_tree, async_extent->start,
+				    async_extent->start + async_extent->ram_size - 1,
+				    GFP_NOFS);
+
+			/* allocate blocks */
+			cow_file_range(inode, async_cow->locked_page,
+				       async_extent->start,
+				       async_extent->start +
+				       async_extent->ram_size - 1,
+				       &page_started, &nr_written, 0);
+
+			/*
+			 * if page_started, cow_file_range inserted an
+			 * inline extent and took care of all the unlocking
+			 * and IO for us.  Otherwise, we need to submit
+			 * all those pages down to the drive.
+			 */
+			if (!page_started)
+				extent_write_locked_range(io_tree,
+						  inode, async_extent->start,
+					          async_extent->start +
+						  async_extent->ram_size - 1,
+						  btrfs_get_extent,
+						  WB_SYNC_ALL);
+			kfree(async_extent);
+			cond_resched();
+			continue;
+		}
+
+		lock_extent(io_tree, async_extent->start,
+			    async_extent->start + async_extent->ram_size - 1,
+			    GFP_NOFS);
 		/*
-		 * the max size of a compressed extent is pretty small,
-		 * make the code a little less complex by forcing
-		 * the allocator to find a whole compressed extent at once
+		 * here we're doing allocation and writeback of the
+		 * compressed pages
 		 */
-		if (will_compress)
-			min_bytes = disk_num_bytes;
-		else
-			min_bytes = root->sectorsize;
+		btrfs_drop_extent_cache(inode, async_extent->start,
+					async_extent->start +
+					async_extent->ram_size - 1, 0);
+
+		ret = btrfs_reserve_extent(trans, root,
+					   async_extent->compressed_size,
+					   async_extent->compressed_size,
+					   0, alloc_hint,
+					   (u64)-1, &ins, 1);
+		BUG_ON(ret);
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = async_extent->start;
+		em->len = async_extent->ram_size;
+
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
+		}
+
+		ret = btrfs_add_ordered_extent(inode, async_extent->start,
+					       ins.objectid,
+					       async_extent->ram_size,
+					       ins.offset,
+					       BTRFS_ORDERED_COMPRESSED);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, root);
+
+		/*
+		 * clear dirty, set writeback and unlock the pages.
+		 */
+		extent_clear_unlock_delalloc(inode,
+					     &BTRFS_I(inode)->io_tree,
+					     async_extent->start,
+					     async_extent->start +
+					     async_extent->ram_size - 1,
+					     NULL, 1, 1, 0, 1, 1, 0);
+
+		ret = btrfs_submit_compressed_write(inode,
+				         async_extent->start,
+					 async_extent->ram_size,
+					 ins.objectid,
+					 ins.offset, async_extent->pages,
+					 async_extent->nr_pages);
+
+		BUG_ON(ret);
+		trans = btrfs_join_transaction(root, 1);
+		alloc_hint = ins.objectid + ins.offset;
+		kfree(async_extent);
+		cond_resched();
+	}
+
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 alloc_hint = 0;
+	u64 num_bytes;
+	unsigned long ram_size;
+	u64 disk_num_bytes;
+	u64 cur_alloc_size;
+	u64 blocksize = root->sectorsize;
+	u64 actual_end;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
 
+	actual_end = min_t(u64, i_size_read(inode), end + 1);
+
+	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = max(blocksize,  num_bytes);
+	disk_num_bytes = num_bytes;
+	ret = 0;
+
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		ret = cow_file_range_inline(trans, root, inode,
+					    start, end, 0, NULL);
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start, end, NULL, 1, 1,
+						     1, 1, 1, 1);
+			*nr_written = *nr_written +
+			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+			*page_started = 1;
+			ret = 0;
+			goto out;
+		}
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+	while(disk_num_bytes > 0) {
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   min_bytes, 0, alloc_hint,
+					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
-			WARN_ON(1);
-			goto free_pages_out_fail;
+			BUG();
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 
-		if (will_compress) {
-			ram_size = num_bytes;
-			em->len = num_bytes;
-		} else {
-			/* ramsize == disk size */
-			ram_size = ins.offset;
-			em->len = ins.offset;
-		}
+		ram_size = ins.offset;
+		em->len = ins.offset;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-		if (will_compress)
-			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -495,10 +765,8 @@ again:
 		}
 
 		cur_alloc_size = ins.offset;
-		ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ram_size, cur_alloc_size,
-					       ordered_type);
+					       ram_size, cur_alloc_size, 0);
 		BUG_ON(ret);
 
 		if (disk_num_bytes < cur_alloc_size) {
@@ -506,82 +774,145 @@ again:
 			       cur_alloc_size);
 			break;
 		}
-
-		if (will_compress) {
-			/*
-			 * we're doing compression, we and we need to
-			 * submit the compressed extents down to the device.
-			 *
-			 * We lock down all the file pages, clearing their
-			 * dirty bits and setting them writeback.  Everyone
-			 * that wants to modify the page will wait on the
-			 * ordered extent above.
-			 *
-			 * The writeback bits on the file pages are
-			 * cleared when the compressed pages are on disk
-			 */
-			btrfs_end_transaction(trans, root);
-
-			if (start <= page_offset(locked_page) &&
-			    page_offset(locked_page) < start + ram_size) {
-				*page_started = 1;
-			}
-
-			extent_clear_unlock_delalloc(inode,
-						     &BTRFS_I(inode)->io_tree,
-						     start,
-						     start + ram_size - 1,
-						     NULL, 1, 1, 0);
-
-			ret = btrfs_submit_compressed_write(inode, start,
-						 ram_size, ins.objectid,
-						 cur_alloc_size, pages,
-						 nr_pages_ret);
-
-			BUG_ON(ret);
-			trans = btrfs_join_transaction(root, 1);
-			if (start + ram_size < end) {
-				start += ram_size;
-				alloc_hint = ins.objectid + ins.offset;
-				/* pages will be freed at end_bio time */
-				pages = NULL;
-				goto again;
-			} else {
-				/* we've written everything, time to go */
-				break;
-			}
-		}
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
 		 */
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
-					     locked_page, 0, 0, 0);
+					     locked_page, unlock, 1,
+					     1, 0, 0, 0);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-
-	ret = 0;
 out:
+	ret = 0;
 	btrfs_end_transaction(trans, root);
 
 	return ret;
+}
 
-free_pages_out_fail:
-	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				     start, end, locked_page, 0, 0, 0);
-free_pages_out:
-	for (i = 0; i < nr_pages_ret; i++) {
-		WARN_ON(pages[i]->mapping);
-		page_cache_release(pages[i]);
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	int num_added = 0;
+	async_cow = container_of(work, struct async_cow, work);
+
+	compress_file_range(async_cow->inode, async_cow->locked_page,
+			    async_cow->start, async_cow->end, async_cow,
+			    &num_added);
+	if (num_added == 0)
+		async_cow->inode = NULL;
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root;
+	unsigned long nr_pages;
+
+	async_cow = container_of(work, struct async_cow, work);
+
+	root = async_cow->root;
+	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+
+	if (atomic_read(&root->fs_info->async_delalloc_pages) <
+	    5 * 1042 * 1024 &&
+	    waitqueue_active(&root->fs_info->async_submit_wait))
+		wake_up(&root->fs_info->async_submit_wait);
+
+	if (async_cow->inode) {
+		submit_compressed_extents(async_cow->inode, async_cow);
 	}
-	if (pages)
-		kfree(pages);
+}
 
-	goto out;
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	async_cow = container_of(work, struct async_cow, work);
+	kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+				u64 start, u64 end, int *page_started,
+				unsigned long *nr_written)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long nr_pages;
+	u64 cur_end;
+	int limit = 10 * 1024 * 1042;
+
+	if (!btrfs_test_opt(root, COMPRESS)) {
+		return cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
+	}
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
+			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+	while(start < end) {
+		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+		async_cow->inode = inode;
+		async_cow->root = root;
+		async_cow->locked_page = locked_page;
+		async_cow->start = start;
+
+		if (btrfs_test_flag(inode, NOCOMPRESS))
+			cur_end = end;
+		else
+			cur_end = min(end, start + 512 * 1024 - 1);
+
+		async_cow->end = cur_end;
+		INIT_LIST_HEAD(&async_cow->extents);
+
+		async_cow->work.func = async_cow_start;
+		async_cow->work.ordered_func = async_cow_submit;
+		async_cow->work.ordered_free = async_cow_free;
+		async_cow->work.flags = 0;
+
+		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			     (atomic_read(&root->fs_info->async_delalloc_pages)
+			      == 0));
+		}
+
+		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+			PAGE_CACHE_SHIFT;
+		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+		btrfs_queue_worker(&root->fs_info->delalloc_workers,
+				   &async_cow->work);
+
+		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->async_delalloc_pages) <
+			    limit));
+		}
+
+		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
+			   0));
+		}
+
+		*nr_written += nr_pages;
+		start = cur_end + 1;
+	}
+	*page_started = 1;
+	return 0;
 }
 
 /*
@@ -592,7 +923,8 @@ free_pages_out:
  * blocks on disk
  */
 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started, int force)
+			      u64 start, u64 end, int *page_started, int force,
+			      unsigned long *nr_written)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -711,7 +1043,8 @@ out_check:
 		btrfs_release_path(root, path);
 		if (cow_start != (u64)-1) {
 			ret = cow_file_range(inode, locked_page, cow_start,
-					found_key.offset - 1, page_started);
+					found_key.offset - 1, page_started,
+					nr_written, 1);
 			BUG_ON(ret);
 			cow_start = (u64)-1;
 		}
@@ -748,9 +1081,10 @@ out_check:
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 0, 0, 0);
+					locked_page, 1, 1, 1, 0, 0, 0);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -761,7 +1095,7 @@ out_check:
 		cow_start = cur_offset;
 	if (cow_start != (u64)-1) {
 		ret = cow_file_range(inode, locked_page, cow_start, end,
-				     page_started);
+				     page_started, nr_written, 1);
 		BUG_ON(ret);
 	}
 
@@ -775,7 +1109,8 @@ out_check:
  * extent_io.c call back to do delayed allocation processing
  */
 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started)
+			      u64 start, u64 end, int *page_started,
+			      unsigned long *nr_written)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -783,13 +1118,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 0);
+					 page_started, 0, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 1);
+					 page_started, 1, nr_written);
 	else
-		ret = cow_file_range(inode, locked_page, start, end,
-				     page_started);
+		ret = cow_file_range_async(inode, locked_page, start, end,
+				     page_started, nr_written);
 
 	return ret;
 }
@@ -861,6 +1196,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 map_length;
 	int ret;
 
+	if (bio_flags & EXTENT_BIO_COMPRESSED)
+		return 0;
+
 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
@@ -925,12 +1263,12 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-		if (!skip_sum)
-			btrfs_lookup_bio_sums(root, inode, bio);
 
 		if (bio_flags & EXTENT_BIO_COMPRESSED)
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
+		else if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio);
 		goto mapit;
 	} else if (!skip_sum) {
 		/* we're doing a write, do the async checksumming */
@@ -966,6 +1304,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 {
+	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
+		WARN_ON(1);
+	}
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   GFP_NOFS);
 }
@@ -2105,6 +2446,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
+	int encoding;
 	u64 mask = root->sectorsize - 1;
 
 	if (root->ref_cows)
@@ -2144,6 +2486,7 @@ search_again:
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
+		encoding = 0;
 
 		if (found_key.objectid != inode->i_ino)
 			break;
@@ -2156,6 +2499,10 @@ search_again:
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_type = btrfs_file_extent_type(leaf, fi);
+			encoding = btrfs_file_extent_compression(leaf, fi);
+			encoding |= btrfs_file_extent_encryption(leaf, fi);
+			encoding |= btrfs_file_extent_other_encoding(leaf, fi);
+
 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
@@ -2200,7 +2547,7 @@ search_again:
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-			if (!del_item) {
+			if (!del_item && !encoding) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
 				extent_num_bytes = new_size -
@@ -2436,7 +2783,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
+						 cur_offset + hole_size,
+						 cur_offset, &hint_byte);
+			if (err)
+				break;
 			err = btrfs_insert_file_extent(trans, root,
 					inode->i_ino, cur_offset, 0,
 					0, hole_size, 0, hole_size,
@@ -3785,6 +4139,7 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 }
@@ -4285,9 +4640,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	 * ordered extents get created before we return
 	 */
 	atomic_inc(&root->fs_info->async_submit_draining);
-	while(atomic_read(&root->fs_info->nr_async_submits)) {
+	while(atomic_read(&root->fs_info->nr_async_submits) ||
+	      atomic_read(&root->fs_info->async_delalloc_pages)) {
 		wait_event(root->fs_info->async_submit_wait,
-		   (atomic_read(&root->fs_info->nr_async_submits) == 0));
+		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
 	return 0;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 370bb428559..027ad6b3839 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -390,7 +390,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
 	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -421,6 +421,12 @@ again:
 	 */
 	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
 
+	/* The compression code will leave pages locked but return from
+	 * writepage without setting the page writeback.  Starting again
+	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+	 */
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+
 	btrfs_wait_on_page_writeback_range(inode->i_mapping,
 					   start >> PAGE_CACHE_SHIFT,
 					   orig_end >> PAGE_CACHE_SHIFT);
@@ -448,10 +454,7 @@ again:
 	}
 	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
 			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
-		printk("inode %lu still ordered or delalloc after wait "
-		       "%llu %llu\n", inode->i_ino,
-		       (unsigned long long)start,
-		       (unsigned long long)orig_end);
+		schedule_timeout(1);
 		goto again;
 	}
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 431fdf144b5..ab9d5e89ed1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -375,6 +375,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
+
+	btrfs_start_delalloc_inodes(root);
+	btrfs_wait_ordered_extents(root, 0);
+
 	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index e99309180a1..ba2527d0873 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -423,8 +423,9 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 			/* we didn't make progress in this inflate
 			 * call, we're done
 			 */
-			if (ret != Z_STREAM_END)
+			if (ret != Z_STREAM_END) {
 				ret = -1;
+			}
 			break;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From af09abfece59aa50bfbf16f6f1f85822554e061f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 12:35:44 -0500
Subject: Btrfs: make sure compressed bios don't complete too soon

When writing a compressed extent, a number of bios are created that
point to a single struct compressed_bio.  At end_io time an atomic counter in
the compressed_bio struct makes sure that all of the bios have finished
before final end_io processing is done.

But when multiple bios are needed to write a compressed extent, the
counter was being incremented after the first bio was sent to submit_bio.
It is possible the bio will complete before the counter is incremented,
making the end_io handler free the compressed_bio struct before
processing is finished.

The fix is to increment the atomic counter before bio submission,
both for compressed reads and writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 284f21025bc..7397c622fb6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -314,6 +314,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		    PAGE_CACHE_SIZE) {
 			bio_get(bio);
 
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
@@ -323,7 +330,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			bio_put(bio);
 
 			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-			atomic_inc(&cb->pending_bios);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -573,6 +579,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 			BUG_ON(ret);
 
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
+
 			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
 			BUG_ON(ret);
 
@@ -580,7 +594,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
-			atomic_inc(&cb->pending_bios);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
-- 
cgit v1.2.3-70-g09d2


From ff5b7ee33d82414bf4baf299c21fb703bcc89629 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 10 Nov 2008 07:34:43 -0500
Subject: Btrfs: Fix csum error for compressed data

The decompress code doesn't take the logical offset in extent
pointer into account. If the logical offset isn't zero, data
will be decompressed into wrong pages.

The solution used here is to record the starting offset of the extent
in the file separately from the logical start of the extent_map struct.
This allows us to avoid problems inserting overlapping extents.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/compression.c | 6 ++----
 fs/btrfs/extent_map.h  | 1 +
 fs/btrfs/file.c        | 2 ++
 fs/btrfs/inode.c       | 3 +++
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7397c622fb6..8e7a78acf81 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -505,7 +505,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
-	u64 em_len;
 	struct extent_map *em;
 	int ret;
 
@@ -524,9 +523,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->errors = 0;
 	cb->inode = inode;
 
-	cb->start = em->start;
+	cb->start = em->orig_start;
 	compressed_len = em->block_len;
-	em_len = em->len;
 	free_extent_map(em);
 
 	cb->len = uncompressed_len;
@@ -545,7 +543,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, cb->start + em_len, cb);
+	add_ra_bio_pages(inode, em->start + em->len, cb);
 
 	if (!btrfs_test_opt(root, NODATASUM) &&
 	    !btrfs_test_flag(inode, NODATASUM)) {
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index accfedaeb51..fb6eeef06bb 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -20,6 +20,7 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
 	unsigned long flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 337221ecca2..85841c53880 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -222,6 +222,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		    em->start < start) {
 			split->start = em->start;
 			split->len = start - em->start;
+			split->orig_start = em->orig_start;
 			split->block_start = em->block_start;
 
 			if (compressed)
@@ -243,6 +244,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			split->start = start + len;
 			split->len = em->start + em->len - (start + len);
+			split->orig_start = em->orig_start;
 			split->bdev = em->bdev;
 			split->flags = flags;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e01c0d0310a..59660293d29 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3949,6 +3949,8 @@ again:
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
+		em->orig_start = extent_start -
+				 btrfs_file_extent_offset(leaf, item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -3988,6 +3990,7 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
+		em->orig_start = EXTENT_MAP_INLINE;
 		if (compressed)
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
-- 
cgit v1.2.3-70-g09d2


From e04ca626baee684bea9d6239e4e1119b696101b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 11:44:58 -0500
Subject: Btrfs: Fix use after free during compressed reads

Yan's fix to use the correct file offset during compressed reads used the
extent_map struct pointer after it had been freed.  This saves the
fields we want for later use instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8e7a78acf81..b582c6fd80f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -505,6 +505,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	u64 em_len;
+	u64 em_start;
 	struct extent_map *em;
 	int ret;
 
@@ -525,7 +527,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->start = em->orig_start;
 	compressed_len = em->block_len;
+	em_len = em->len;
+	em_start = em->start;
 	free_extent_map(em);
+	em = NULL;
 
 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
@@ -543,7 +548,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, em->start + em->len, cb);
+	add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	if (!btrfs_test_opt(root, NODATASUM) &&
 	    !btrfs_test_flag(inode, NODATASUM)) {
-- 
cgit v1.2.3-70-g09d2


From 5b050f04c8ce911c5b6831305a24d70eab95e732 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Nov 2008 09:34:41 -0500
Subject: Btrfs: Fix compile warnings on 32 bit machines

Simple casting here and there to fix things up.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/extent_io.c   | 11 +++++++----
 fs/btrfs/inode.c       |  4 ++--
 fs/btrfs/zlib.c        |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/compression.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b582c6fd80f..bfd1512cce0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -174,7 +174,8 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
 		if (ret == 0) {
 			nr_pages -= 1;
 			index += 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5cc0082379c..54d013c3bb8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1157,7 +1157,8 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 			if (pages[i] != locked_page)
 				unlock_page(pages[i]);
@@ -1192,7 +1193,8 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 	nrpages = end_index - index + 1;
 	while(nrpages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nrpages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nrpages, ARRAY_SIZE(pages)), pages);
 		if (ret == 0) {
 			ret = -EAGAIN;
 			goto done;
@@ -1346,7 +1348,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
@@ -1896,7 +1899,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	int contig = 0;
 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
-	size_t page_size = min(size, PAGE_CACHE_SIZE);
+	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5ca9c067237..2ed2deacde9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -184,7 +184,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 		int i = 0;
 		while(compressed_size > 0) {
 			cpage = compressed_pages[i];
-			cur_size = min(compressed_size,
+			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
 			kaddr = kmap(cpage);
@@ -3812,7 +3812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 	read_extent_buffer(leaf, tmp, ptr, inline_size);
 
-	max_size = min(PAGE_CACHE_SIZE, max_size);
+	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
 	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
 				    inline_size, max_size);
 	if (ret) {
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ba2527d0873..5b9f7002513 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -370,7 +370,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 
 	data_in = kmap(pages_in[page_in_index]);
 	workspace->inf_strm.next_in = data_in;
-	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+	workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
 	workspace->inf_strm.total_in = 0;
 
 	workspace->inf_strm.total_out = 0;
-- 
cgit v1.2.3-70-g09d2