Diffstat (limited to 'fs/btrfs')
 fs/btrfs/Makefile       |   3
 fs/btrfs/compression.c  | 454
 fs/btrfs/compression.h  |  47
 fs/btrfs/ctree.h        |  99
 fs/btrfs/disk-io.c      |  18
 fs/btrfs/disk-io.h      |   1
 fs/btrfs/extent-tree.c  |  27
 fs/btrfs/extent_io.c    | 411
 fs/btrfs/extent_io.h    |  17
 fs/btrfs/extent_map.c   |   9
 fs/btrfs/extent_map.h   |   6
 fs/btrfs/file-item.c    |  75
 fs/btrfs/file.c         | 263
 fs/btrfs/inode.c        | 584
 fs/btrfs/ordered-data.c |   9
 fs/btrfs/ordered-data.h |  10
 fs/btrfs/print-tree.c   |   7
 fs/btrfs/super.c        |  10
 fs/btrfs/tree-log.c     |   3
 fs/btrfs/volumes.c      |   2
 fs/btrfs/zlib.c         | 637
21 files changed, 2313 insertions(+), 379 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142..d2cf5a54a4b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	struct bio *bio;
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_size = 0;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_byte >> 9;
+	}
+	return bio;
+}
+
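The pending_bios counter in struct compressed_bio is the heart of the completion handling that follows: every in-flight bio holds one count, and only the completion that drops the count to zero runs the decompression or cleanup work. A minimal userspace sketch of the same "last one out frees" pattern, written with C11 stdatomic rather than the kernel's atomic_t (the names here are illustrative, not from the patch):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct control_block {
	atomic_int pending;	/* plays the role of cb->pending_bios */
	int errors;
};

static void complete_one(struct control_block *cb, int err)
{
	if (err)
		cb->errors = 1;
	/* atomic_fetch_sub returns the old value; 1 means we were last */
	if (atomic_fetch_sub(&cb->pending, 1) != 1)
		return;
	printf("last completion: errors=%d, freeing cb\n", cb->errors);
	free(cb);
}

int main(void)
{
	struct control_block *cb = malloc(sizeof(*cb));
	atomic_init(&cb->pending, 3);	/* three in-flight IOs */
	cb->errors = 0;
	complete_one(cb, 0);
	complete_one(cb, 1);
	complete_one(cb, 0);	/* observes the error flag and frees */
	return 0;
}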
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, lets start
+	 * the decompression.
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					cb->start,
+					cb->orig_bio->bi_io_vec,
+					cb->orig_bio->bi_vcnt,
+					cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
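end_compressed_writeback() walks the whole file range in gangs of up to 16 pages via find_get_pages_contig(), skipping one slot forward whenever the lookup comes up empty so a hole in the range cannot stall the loop. The batching shape, reduced to a userspace sketch (get_batch() and finish_item() are hypothetical stand-ins, not kernel calls):

#include <stddef.h>

#define BATCH 16

/* hypothetical: fills items[] with up to max handles, returns the count */
size_t get_batch(size_t index, size_t max, int *items);
void finish_item(int item);

/* process [index, index + nr), tolerating sparse ranges */
void process_range(size_t index, size_t nr)
{
	int items[BATCH];
	size_t got, i;

	while (nr > 0) {
		got = get_batch(index, nr < BATCH ? nr : BATCH, items);
		if (got == 0) {
			/* hole in the range: skip one slot and retry */
			nr -= 1;
			index += 1;
			continue;
		}
		for (i = 0; i < got; i++)
			finish_item(items[i]);
		nr -= got;
		index += got;
	}
}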
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL, 1);
+
+	end_compressed_writeback(inode, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while(bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
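The submit loop above is a fill-and-flush pattern: keep appending pages to the current bio, and when the device (or the merge hook) refuses the next page, submit what has accumulated and start a fresh bio at the current disk byte. The control flow, as a userspace sketch with hypothetical buffer_*() stand-ins for bio_add_page()/submit:

struct buffer;

/* hypothetical stand-ins for bio_add_page()/submit/compressed_bio_alloc */
int buffer_add(struct buffer *b, int page);	/* returns 0 when full */
void buffer_submit(struct buffer *b);
struct buffer *buffer_new(long long disk_byte);

void submit_all(int *pages, int nr, long long disk_byte)
{
	struct buffer *b = buffer_new(disk_byte);
	int i;

	for (i = 0; i < nr; i++) {
		if (!buffer_add(b, pages[i])) {
			/* current buffer is full: flush, retry on a new one */
			buffer_submit(b);
			b = buffer_new(disk_byte);
			buffer_add(b, pages[i]);
		}
		disk_byte += 4096;
	}
	buffer_submit(b);	/* flush the tail */
}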
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+				 PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	BUG_ON(ret);
+
+	bio_put(comp_bio);
+	return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
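A hedged sketch of how a caller might drive btrfs_zlib_compress_pages(), loosely modeled on the compressed write path this series adds to inode.c; the helper name, the max_out heuristic, and the error handling here are illustrative assumptions, only the prototype comes from the header above:

/*
 * Illustrative only: compress one chunk and report failure when zlib
 * does not actually shrink the data, so the caller can write it raw.
 * Page setup and cleanup are elided.
 */
static int try_compress_chunk(struct address_space *mapping, u64 start,
			      unsigned long len, struct page **pages,
			      unsigned long nr_dest_pages)
{
	unsigned long out_pages = 0;
	unsigned long total_in = 0;
	unsigned long total_out = 0;
	int ret;

	ret = btrfs_zlib_compress_pages(mapping, start, len, pages,
					nr_dest_pages, &out_pages,
					&total_in, &total_out,
					/* assumption: demand at least one
					 * page of savings via max_out */
					len - PAGE_CACHE_SIZE);
	if (ret || total_out >= total_in)
		return -E2BIG;	/* incompressible */
	return 0;
}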
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47..793d8fdda24 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be.  So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption.  If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included).  This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		  offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
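With the BTRFS_SETGET_FUNCS accessors generated above, the effective compression ratio of a regular extent can be read straight out of the leaf: ram_bytes is the uncompressed size, disk_num_bytes what the extent occupies on disk. A hedged illustration (the helper itself is hypothetical, only the accessor names come from the patch):

/*
 * Illustrative fragment: <100 means compression helped.
 */
static u64 extent_ratio_percent(struct extent_buffer *leaf,
				struct btrfs_file_extent_item *fi)
{
	u64 ram = btrfs_file_extent_ram_bytes(leaf, fi);
	u64 disk = btrfs_file_extent_disk_num_bytes(leaf, fi);

	if (ram == 0)
		return 100;
	return disk * 100 / ram;
}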
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers.  If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 disk_offset,
-			       u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 				  int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb619..dc95f636a11 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbb..4eb1f1408d2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6..bbf04e80a1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
@@ -3846,6 +3856,8 @@ next:
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[0].ram_bytes);
 			ext_offset += new_extents[0].offset;
 			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 						new_extents[i].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[i].ram_bytes);
+
+				btrfs_set_file_extent_compression(leaf, fi,
+						new_extents[i].compression);
+				btrfs_set_file_extent_encryption(leaf, fi,
+						new_extents[i].encryption);
+				btrfs_set_file_extent_other_encoding(leaf, fi,
+						new_extents[i].other_encoding);
+
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_len);
 				ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f..314041fdfa4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 				*end = state->end;
 			goto out;
 		}
-		if (!found && !(state->state & EXTENT_BOUNDARY)) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if ((prev_state->end + 1 != state->start) ||
-				    !(prev_state->state & EXTENT_DELALLOC))
-					break;
-				if ((cur_start - prev_state->start) * 2 >
-				     max_bytes)
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&tree->lock);
-			schedule();
-			spin_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		set_state_cb(tree, state, EXTENT_LOCKED);
-		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+static noinline int __unlock_for_delalloc(struct inode *inode,
+					  struct page *locked_page,
+					  u64 start, u64 end)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret;
+	int i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* skip the page at the start index */
+	nrpages = end_index - index + 1;
+	while(nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* now we have an array of pages, lock them all */
+		for (i = 0; i < ret; i++) {
+			/*
+			 * the caller is taking responsibility for
+			 * locked_page
+			 */
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
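lock_delalloc_pages() is careful to undo a partial acquisition: when the gang lookup comes up short it returns -EAGAIN, and __unlock_for_delalloc() releases everything taken so far before the caller retries with a smaller range. The acquire-all-or-roll-back shape, reduced to a runnable-style userspace sketch (try_lock_one() and unlock_one() are hypothetical):

int try_lock_one(int idx);	/* hypothetical: nonzero on success */
void unlock_one(int idx);

/* lock [first, first + n); on failure, release what we took */
int lock_range(int first, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!try_lock_one(first + i))
			goto rollback;
	}
	return 0;
rollback:
	while (--i >= 0)
		unlock_one(first + i);
	return -1;	/* caller shrinks the range and retries */
}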
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+			~((u64)PAGE_CACHE_SIZE - 1);
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int end_writeback)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] == locked_page) {
+				page_cache_release(pages[i]);
+				continue;
+			}
+			if (clear_dirty)
+				clear_page_dirty_for_io(pages[i]);
+			if (set_writeback)
+				set_page_writeback(pages[i]);
+			if (end_writeback)
+				end_page_writeback(pages[i]);
+			unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
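find_lock_delalloc_range() replaces the old wait-inside-the-tree logic with an optimistic three-step protocol: find a candidate range, lock its pages, lock the state bits, then verify the range is still entirely delalloc; if a racer changed it, drop everything and retry. The skeleton of that retry loop, as a sketch with hypothetical helpers:

int find_candidate(long *start, long *end);	/* hypothetical */
int lock_candidate(long start, long end);	/* < 0 means -EAGAIN */
int still_valid(long start, long end);
void unlock_candidate(long start, long end);

int find_lock(long *start, long *end)
{
	int shrunk = 0;

	for (;;) {
		if (!find_candidate(start, end))
			return 0;		/* nothing to do */
		if (lock_candidate(*start, *end) < 0) {
			if (shrunk)
				return 0;	/* give up, avoid looping */
			shrunk = 1;		/* retry with a smaller bite */
			continue;
		}
		if (still_valid(*start, *end))
			return 1;		/* locked and still delalloc */
		unlock_candidate(*start, *end);	/* raced: try again */
	}
}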
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+			  unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
-	struct rb_node *node;
-	struct extent_state *state;
 	u64 start;
 	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
 	end = start + bvec->bv_len - 1;
 
-	spin_lock_irq(&tree->lock);
-	node = __etree_search(tree, start, NULL, NULL);
-	BUG_ON(!node);
-	state = rb_entry(node, struct extent_state, rb_node);
-	while(state->end < end) {
-		node = rb_next(node);
-		state = rb_entry(node, struct extent_state, rb_node);
-	}
-	BUG_ON(state->end != end);
-	spin_unlock_irq(&tree->lock);
-
 	bio->bi_private = NULL;
 
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num);
+					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
 			      bio_end_io_t end_io_func,
-			      int mirror_num)
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio *bio;
 	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min(size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		if (old_compressed)
+			contig = bio->bi_sector == sector;
+		else
+			contig = bio->bi_sector + (bio->bi_size >> 9) ==
+				sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
 		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio, mirror_num);
+		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+					       bio_flags)) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
 			bio = NULL;
 		} else {
 			return 0;
 		}
 	}
-	nr = bio_get_nr_vecs(bdev);
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
-
-	bio_add_page(bio, page, size, offset);
+	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
+
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio, mirror_num);
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
 	}
 
 	return ret;
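Note the contiguity test introduced above: every page of a compressed extent maps to the extent's first disk sector, so a compressed bio is "contiguous" when the new page targets the bio's own bi_sector, while a normal bio appends at bi_sector plus its current size. As a standalone, runnable predicate (a sketch; sizes are in bytes, sectors are 512 bytes):

#include <stdbool.h>
#include <stdint.h>

/* sketch of the two contiguity rules used in submit_extent_page() */
bool bio_is_contiguous(uint64_t bio_sector, uint64_t bio_bytes,
		       uint64_t new_sector, bool compressed)
{
	if (compressed)
		/* all pages of the extent point at its first sector */
		return new_sector == bio_sector;
	return new_sector == bio_sector + (bio_bytes >> 9);
}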
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num)
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	int nr = 0;
 	size_t page_offset = 0;
 	size_t iosize;
+	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
 
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+		}
+	}
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		BUG_ON(end < cur);
 
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			this_bio_flag = EXTENT_BIO_COMPRESSED;
+
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
 		bdev = em->bdev;
 		block_start = em->block_start;
 		free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
 			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
+					 sector, disk_io_size, page_offset,
 					 bdev, bio, pnr,
-					 end_bio_extent_readpage, mirror_num);
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
 			nr++;
+			*bio_flags = this_bio_flag;
 		}
 		if (ret)
 			SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent)
 {
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
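__extent_read_full_page() now zeroes the tail of the EOF page before any data lands in it, since a compressed extent always expands to whole pages and would otherwise leave stale bytes past i_size. Computing that tail, as a small runnable sketch:

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* zero the bytes of 'page' that sit past the file size 'isize' */
void zero_eof_tail(unsigned char *page, uint64_t page_index, uint64_t isize)
{
	if (page_index != isize / PAGE_SIZE)
		return;			/* not the EOF page */
	uint32_t off = isize & (PAGE_SIZE - 1);
	if (off)			/* file ends mid-page */
		memset(page + off, 0, PAGE_SIZE - off);
}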
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
 	u64 nr_delalloc;
 	u64 delalloc_end;
+	int page_started;
+	int compressed;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_start = start;
 	delalloc_end = 0;
+	page_started = 0;
 	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+						       page,
+						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
 		if (nr_delalloc == 0) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
+		tree->ops->fill_delalloc(inode, page, delalloc_start,
+					 delalloc_end, &page_started);
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/* did the fill delalloc function already unlock and start the IO? */
+	if (page_started) {
+		return 0;
+	}
+
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == EXTENT_MAP_HOLE ||
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, unlock_start, cur + iosize - 1,
 				      GFP_NOFS);
 
-			if (tree->ops && tree->ops->writepage_end_io_hook)
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
-			cur = cur + iosize;
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent.  this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
 			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
-
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 				   EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
+
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage, 0);
+						 end_bio_extent_writepage,
+						 0, 0, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	struct pagevec pvec;
+	unsigned long bio_flags = 0;
 
 	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0);
+						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite, 0);
+					 end_bio_extent_preparewrite, 0,
+					 0, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED, 0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK |
+					    EXTENT_ORDERED,
+					    0)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num);
+						      mirror_num, &bio_flags);
 			if (err) {
 				ret = err;
 				printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio, mirror_num);
+		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
 	if (ret || !wait) {
 		if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae..86f859b87a6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
 #define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -28,14 +31,17 @@
 struct extent_state;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-				       struct bio *bio, int mirror_num);
+				       struct bio *bio, int mirror_num,
+				       unsigned long bio_flags);
 struct extent_io_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+			     u64 start, u64 end, int *page_started);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
-			      size_t size, struct bio *bio);
+			      size_t size, struct bio *bio,
+			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int clear_writeback);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d..fd3ebfb8c3c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
 		return 0;
 
+	/*
+	 * don't merge compressed extents, we need to know their
+	 * actual size
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
 			em->len += merge->len;
+			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		merge = rb_entry(rb, struct extent_map, rb_node);
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
+		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b26..abbcbeb28c7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
 	u64 start;
 	u64 len;
 	u64 block_start;
+	u64 block_len;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
 
 static inline u64 extent_map_block_end(struct extent_map *em)
 {
-	if (em->block_start + em->len < em->block_start)
+	if (em->block_start + em->block_len < em->block_start)
 		return (u64)-1;
-	return em->block_start + em->len;
+	return em->block_start + em->block_len;
 }
 
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d..f4d3fa71bc4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
 			     u64 disk_offset, u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset)
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
 	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, compression);
+	btrfs_set_file_extent_encryption(leaf, item, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
 	return 0;
 }
 
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	char *data;
+	struct page *page;
+	unsigned long total_bytes = 0;
+	unsigned long this_sum_bytes = 0;
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+
+	sector_sum = sums->sums;
+	sums->file_offset = start;
+	sums->len = len;
+	INIT_LIST_HEAD(&sums->list);
+	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+	BUG_ON(!ordered);
+
+	while(len > 0) {
+		if (start >= ordered->file_offset + ordered->len ||
+		    start < ordered->file_offset) {
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, len),
+				       GFP_NOFS);
+			BUG_ON(!sums);
+			sector_sum = sums->sums;
+			sums->len = len;
+			sums->file_offset = start;
+			ordered = btrfs_lookup_ordered_extent(inode,
+						      sums->file_offset);
+			BUG_ON(!ordered);
+		}
+
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+
+		data = kmap_atomic(page, KM_USER0);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+						  PAGE_CACHE_SIZE);
+		kunmap_atomic(data, KM_USER0);
btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(page);
+		page_cache_release(page);
+
+		sector_sum++;
+		total_bytes += PAGE_CACHE_SIZE;
+		this_sum_bytes += PAGE_CACHE_SIZE;
+		start += PAGE_CACHE_SIZE;
+
+		WARN_ON(len < PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+	}
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add..0aa15436590 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode,
-				u64 offset, size_t size,
-				struct page **pages, size_t page_offset,
-				int num_pages)
-{
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *kaddr;
-	unsigned long ptr;
-	struct btrfs_file_extent_item *ei;
-	struct page *page;
-	u32 datasize;
-	int err = 0;
-	int ret;
-	int i;
-	ssize_t cur_size;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	btrfs_set_trans_block_group(trans, inode);
-
-	key.objectid = inode->i_ino;
-	key.offset = offset;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto fail;
-	}
-	if (ret == 1) {
-		struct btrfs_key found_key;
-
-		if (path->slots[0] == 0)
-			goto insert;
-
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		if (found_key.objectid != inode->i_ino)
-			goto insert;
-
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto insert;
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			goto insert;
-		}
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		ret = 0;
-	}
-	if (ret == 0) {
-		u32 found_size;
-		u64 found_end;
-
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			err = ret;
-			btrfs_print_leaf(root, leaf);
-			printk("found wasn't inline offset %Lu inode %lu\n",
-			       offset, inode->i_ino);
-			goto fail;
-		}
-		found_size = btrfs_file_extent_inline_len(leaf,
-					  btrfs_item_nr(leaf, path->slots[0]));
-		found_end = key.offset + found_size;
-
-		if (found_end < offset + size) {
-			btrfs_release_path(root, path);
-			ret = btrfs_search_slot(trans, root, &key, path,
-						offset + size - found_end, 1);
-			BUG_ON(ret != 0);
-
-			ret = btrfs_extend_item(trans, root, path,
-						offset + size - found_end);
-			if (ret) {
-				err = ret;
-				goto fail;
-			}
-			leaf = path->nodes[0];
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_file_extent_item);
-			inode_add_bytes(inode, offset + size - found_end);
-		}
-		if (found_end < offset) {
-			ptr = btrfs_file_extent_inline_start(ei) + found_size;
-			memset_extent_buffer(leaf, 0, ptr, offset - 
found_end); -		} -	} else { -insert: -		btrfs_release_path(root, path); -		datasize = offset + size - key.offset; -		inode_add_bytes(inode, datasize); -		datasize = btrfs_file_extent_calc_inline_size(datasize); -		ret = btrfs_insert_empty_item(trans, root, path, &key, -					      datasize); -		if (ret) { -			err = ret; -			printk("got bad ret %d\n", ret); -			goto fail; -		} -		leaf = path->nodes[0]; -		ei = btrfs_item_ptr(leaf, path->slots[0], -				    struct btrfs_file_extent_item); -		btrfs_set_file_extent_generation(leaf, ei, trans->transid); -		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); -	} -	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset; - -	cur_size = size; -	i = 0; -	while (size > 0) { -		page = pages[i]; -		kaddr = kmap_atomic(page, KM_USER0); -		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size); -		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size); -		kunmap_atomic(kaddr, KM_USER0); -		page_offset = 0; -		ptr += cur_size; -		size -= cur_size; -		if (i >= num_pages) { -			printk("i %d num_pages %d\n", i, num_pages); -		} -		i++; -	} -	btrfs_mark_buffer_dirty(leaf); -fail: -	btrfs_free_path(path); -	return err; -} -  /*   * after copy_from_user, pages need to be dirtied and we need to make   * sure holes are created between the current EOF and the start of @@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,  	u64 start_pos;  	u64 end_of_last_block;  	u64 end_pos = pos + write_bytes; -	u64 inline_size; -	int did_inline = 0;  	loff_t isize = i_size_read(inode);  	start_pos = pos & ~((u64)root->sectorsize - 1); @@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,  			err = btrfs_insert_file_extent(trans, root,  						       inode->i_ino,  						       last_pos_in_file, -						       0, 0, hole_size, 0); +						       0, 0, hole_size, 0, +						       hole_size, 0, 0, 0);  			btrfs_drop_extent_cache(inode, last_pos_in_file,  					last_pos_in_file + hole_size - 1, 0);  			mutex_unlock(&BTRFS_I(inode)->extent_mutex); @@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,  			goto failed;  	} -	/* -	 * either allocate an extent for the new bytes or setup the key -	 * to show we are doing inline data in the extent +	/* check for reserved extents on each page, we don't want +	 * to reset the delalloc bit on things that already have +	 * extents reserved.  	 */ -	inline_size = end_pos; -	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || -	    inline_size > root->fs_info->max_inline || -	    (inline_size & (root->sectorsize -1)) == 0 || -	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { -		/* check for reserved extents on each page, we don't want -		 * to reset the delalloc bit on things that already have -		 * extents reserved. 
-		 */ -		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); -		for (i = 0; i < num_pages; i++) { -			struct page *p = pages[i]; -			SetPageUptodate(p); -			ClearPageChecked(p); -			set_page_dirty(p); -		} -	} else { -		u64 aligned_end; -		/* step one, delete the existing extents in this range */ -		aligned_end = (pos + write_bytes + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); -		mutex_lock(&BTRFS_I(inode)->extent_mutex); -		err = btrfs_drop_extents(trans, root, inode, start_pos, -					 aligned_end, aligned_end, &hint_byte); -		if (err) -			goto failed; -		if (isize > inline_size) -			inline_size = min_t(u64, isize, aligned_end); -		inline_size -= start_pos; -		err = insert_inline_extent(trans, root, inode, start_pos, -					   inline_size, pages, 0, num_pages); -		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0); -		BUG_ON(err); -		mutex_unlock(&BTRFS_I(inode)->extent_mutex); - -		/* -		 * an ugly way to do all the prop accounting around -		 * the page bits and mapping tags -		 */ -		set_page_writeback(pages[0]); -		end_page_writeback(pages[0]); -		did_inline = 1; +	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); +	for (i = 0; i < num_pages; i++) { +		struct page *p = pages[i]; +		SetPageUptodate(p); +		ClearPageChecked(p); +		set_page_dirty(p);  	}  	if (end_pos > isize) {  		i_size_write(inode, end_pos); -		if (did_inline) -			BTRFS_I(inode)->disk_i_size = end_pos;  		btrfs_update_inode(trans, root, inode);  	}  failed: @@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  	int ret;  	int testend = 1;  	unsigned long flags; +	int compressed = 0;  	WARN_ON(end < start);  	if (end == (u64)-1) { @@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			free_extent_map(em);  			continue;  		} +		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		clear_bit(EXTENT_FLAG_PINNED, &em->flags);  		remove_extent_mapping(em_tree, em); @@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			split->start = em->start;  			split->len = start - em->start;  			split->block_start = em->block_start; + +			if (compressed) +				split->block_len = em->block_len; +			else +				split->block_len = split->len; +  			split->bdev = em->bdev;  			split->flags = flags;  			ret = add_extent_mapping(em_tree, split); @@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			split->bdev = em->bdev;  			split->flags = flags; -			split->block_start = em->block_start + diff; +			if (compressed) { +				split->block_len = em->block_len; +				split->block_start = em->block_start; +			} else { +				split->block_len = split->len; +				split->block_start = em->block_start + diff; +			}  			ret = add_extent_mapping(em_tree, split);  			BUG_ON(ret); @@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)  			struct btrfs_item *item;  			item = btrfs_item_nr(leaf, slot);  			extent_end = found_key.offset + -			     btrfs_file_extent_inline_len(leaf, item); +			     btrfs_file_extent_inline_len(leaf, extent);  			extent_end = (extent_end + root->sectorsize - 1) &  				~((u64)root->sectorsize -1 );  		} @@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,  	u64 extent_end = 0;  	u64 search_start = start;  	u64 leaf_start; +	u64 ram_bytes = 0; +	u8 compression = 0; +	u8 encryption = 0; +	u16 other_encoding = 0;  	u64 root_gen;  	u64 root_owner;  	struct 
extent_buffer *leaf; @@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,  	int recow;  	int ret; +	inline_limit = 0;  	btrfs_drop_extent_cache(inode, start, end - 1, 0);  	path = btrfs_alloc_path(); @@ -637,6 +470,12 @@ next_slot:  			extent = btrfs_item_ptr(leaf, slot,  						struct btrfs_file_extent_item);  			found_type = btrfs_file_extent_type(leaf, extent); +			compression = btrfs_file_extent_compression(leaf, +								    extent); +			encryption = btrfs_file_extent_encryption(leaf, +								  extent); +			other_encoding = btrfs_file_extent_other_encoding(leaf, +								  extent);  			if (found_type == BTRFS_FILE_EXTENT_REG) {  				extent_end =  				     btrfs_file_extent_disk_bytenr(leaf, @@ -646,13 +485,13 @@ next_slot:  				extent_end = key.offset +  				     btrfs_file_extent_num_bytes(leaf, extent); +				ram_bytes = btrfs_file_extent_ram_bytes(leaf, +								extent);  				found_extent = 1;  			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) { -				struct btrfs_item *item; -				item = btrfs_item_nr(leaf, slot);  				found_inline = 1;  				extent_end = key.offset + -				     btrfs_file_extent_inline_len(leaf, item); +				     btrfs_file_extent_inline_len(leaf, extent);  			}  		} else {  			extent_end = search_start; @@ -680,10 +519,9 @@ next_slot:  			search_start = (extent_end + mask) & ~mask;  		} else  			search_start = extent_end; -		if (end <= extent_end && start >= key.offset && found_inline) { + +		if (end <= extent_end && start >= key.offset && found_inline)  			*hint_byte = EXTENT_MAP_INLINE; -			goto out; -		}  		if (found_extent) {  			read_extent_buffer(leaf, &old, (unsigned long)extent, @@ -770,12 +608,27 @@ next_slot:  			write_extent_buffer(leaf, &old,  					    (unsigned long)extent, sizeof(old)); +			btrfs_set_file_extent_compression(leaf, extent, +							  compression); +			btrfs_set_file_extent_encryption(leaf, extent, +							 encryption); +			btrfs_set_file_extent_other_encoding(leaf, extent, +							     other_encoding);  			btrfs_set_file_extent_offset(leaf, extent,  				    le64_to_cpu(old.offset) + end - key.offset);  			WARN_ON(le64_to_cpu(old.num_bytes) <  				(extent_end - end));  			btrfs_set_file_extent_num_bytes(leaf, extent,  							extent_end - end); + +			/* +			 * set the ram bytes to the size of the full extent +			 * before splitting.  This is a worst case flag, +			 * but its the best we can do because we don't know +			 * how splitting affects compression +			 */ +			btrfs_set_file_extent_ram_bytes(leaf, extent, +							ram_bytes);  			btrfs_set_file_extent_type(leaf, extent,  						   BTRFS_FILE_EXTENT_REG); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf4bed6ca4d..9797592dc86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -49,6 +49,7 @@  #include "compat.h"  #include "tree-log.h"  #include "ref-cache.h" +#include "compression.h"  struct btrfs_iget_args {  	u64 ino; @@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {  };  static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);  /*   * a very lame attempt at stopping writes when the FS is 85% full.  There @@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,  }  /* + * this does all the hard work for inserting an inline extent into + * the btree.  
The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
+				u64 start, size_t size, size_t compressed_size,
+				struct page **compressed_pages)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct page *page = NULL;
+	char *kaddr;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	int err = 0;
+	int ret;
+	size_t cur_size = size;
+	size_t datasize;
+	unsigned long offset;
+	int use_compress = 0;
+
+	if (compressed_size && compressed_pages) {
+		use_compress = 1;
+		cur_size = compressed_size;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	key.objectid = inode->i_ino;
+	key.offset = start;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	inode_add_bytes(inode, size);
+	datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+	inode_add_bytes(inode, size);
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		printk("got bad ret %d\n", ret);
+		goto fail;
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	if (use_compress) {
+		struct page *cpage;
+		int i = 0;
+		while(compressed_size > 0) {
+			cpage = compressed_pages[i];
+			cur_size = min(compressed_size,
+				       PAGE_CACHE_SIZE);
+
+			kaddr = kmap(cpage);
+			write_extent_buffer(leaf, kaddr, ptr, cur_size);
+			kunmap(cpage);
+
+			i++;
+			ptr += cur_size;
+			compressed_size -= cur_size;
+		}
+		btrfs_set_file_extent_compression(leaf, ei,
+						  BTRFS_COMPRESS_ZLIB);
+	} else {
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		btrfs_set_file_extent_compression(leaf, ei, 0);
+		kaddr = kmap_atomic(page, KM_USER0);
+		offset = start & (PAGE_CACHE_SIZE - 1);
+		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_cache_release(page);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	btrfs_update_inode(trans, root, inode);
+	return 0;
+fail:
+	btrfs_free_path(path);
+	return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent. 
+ */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, +				 struct inode *inode, u64 start, u64 end, +				 size_t compressed_size, +				 struct page **compressed_pages) +{ +	u64 isize = i_size_read(inode); +	u64 actual_end = min(end + 1, isize); +	u64 inline_len = actual_end - start; +	u64 aligned_end = (end + root->sectorsize - 1) & +			~((u64)root->sectorsize - 1); +	u64 hint_byte; +	u64 data_len = inline_len; +	int ret; + +	if (compressed_size) +		data_len = compressed_size; + +	if (start > 0 || +	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || +	    (!compressed_size && +	    (actual_end & (root->sectorsize - 1)) == 0) || +	    end + 1 < isize || +	    data_len > root->fs_info->max_inline) { +		return 1; +	} + +	mutex_lock(&BTRFS_I(inode)->extent_mutex); +	ret = btrfs_drop_extents(trans, root, inode, start, +				 aligned_end, aligned_end, &hint_byte); +	BUG_ON(ret); + +	if (isize > actual_end) +		inline_len = min_t(u64, isize, actual_end); +	ret = insert_inline_extent(trans, root, inode, start, +				   inline_len, compressed_size, +				   compressed_pages); +	BUG_ON(ret); +	btrfs_drop_extent_cache(inode, start, aligned_end, 0); +	mutex_unlock(&BTRFS_I(inode)->extent_mutex); +	return 0; +} + +/*   * when extent_io.c finds a delayed allocation range in the file,   * the call backs end up in this code.  The basic idea is to   * allocate extents on disk for the range, and create ordered data structs   * in ram to track those extents. + * + * locked_page is the page that writepage had locked already.  We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it.  It may be clean and already done with + * IO when we return.   */ -static int cow_file_range(struct inode *inode, u64 start, u64 end) +static int cow_file_range(struct inode *inode, struct page *locked_page, +			  u64 start, u64 end, int *page_started)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans;  	u64 alloc_hint = 0;  	u64 num_bytes; +	unsigned long ram_size; +	u64 orig_start; +	u64 disk_num_bytes;  	u64 cur_alloc_size;  	u64 blocksize = root->sectorsize; -	u64 orig_num_bytes; +	u64 actual_end;  	struct btrfs_key ins;  	struct extent_map *em;  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	int ret = 0; +	struct page **pages = NULL; +	unsigned long nr_pages; +	unsigned long nr_pages_ret = 0; +	unsigned long total_compressed = 0; +	unsigned long total_in = 0; +	unsigned long max_compressed = 128 * 1024; +	unsigned long max_uncompressed = 256 * 1024; +	int i; +	int will_compress;  	trans = btrfs_join_transaction(root, 1);  	BUG_ON(!trans);  	btrfs_set_trans_block_group(trans, inode); +	orig_start = start; +	/* +	 * compression made this loop a bit ugly, but the basic idea is to +	 * compress some pages but keep the total size of the compressed +	 * extent relatively small.  If compression is off, this goto target +	 * is never used. 
+	 */ +again: +	will_compress = 0; +	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; +	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + +	actual_end = min_t(u64, i_size_read(inode), end + 1); +	total_compressed = actual_end - start; + +	/* we want to make sure that amount of ram required to uncompress +	 * an extent is reasonable, so we limit the total size in ram +	 * of a compressed extent to 256k +	 */ +	total_compressed = min(total_compressed, max_uncompressed);  	num_bytes = (end - start + blocksize) & ~(blocksize - 1);  	num_bytes = max(blocksize,  num_bytes); -	orig_num_bytes = num_bytes; +	disk_num_bytes = num_bytes; +	total_in = 0; +	ret = 0; -	if (alloc_hint == EXTENT_MAP_INLINE) -		goto out; +	/* we do compression for mount -o compress and when the +	 * inode has not been flagged as nocompress +	 */ +	if (!btrfs_test_flag(inode, NOCOMPRESS) && +	    btrfs_test_opt(root, COMPRESS)) { +		WARN_ON(pages); +		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + +		/* we want to make sure the amount of IO required to satisfy +		 * a random read is reasonably small, so we limit the size +		 * of a compressed extent to 128k +		 */ +		ret = btrfs_zlib_compress_pages(inode->i_mapping, start, +						total_compressed, pages, +						nr_pages, &nr_pages_ret, +						&total_in, +						&total_compressed, +						max_compressed); + +		if (!ret) { +			unsigned long offset = total_compressed & +				(PAGE_CACHE_SIZE - 1); +			struct page *page = pages[nr_pages_ret - 1]; +			char *kaddr; + +			/* zero the tail end of the last page, we might be +			 * sending it down to disk +			 */ +			if (offset) { +				kaddr = kmap_atomic(page, KM_USER0); +				memset(kaddr + offset, 0, +				       PAGE_CACHE_SIZE - offset); +				kunmap_atomic(kaddr, KM_USER0); +			} +			will_compress = 1; +		} +	} +	if (start == 0) { +		/* lets try to make an inline extent */ +		if (ret || total_in < (end - start + 1)) { +			/* we didn't compress the entire range, try +			 * to make an uncompressed inline extent.  
This +			 * is almost sure to fail, but maybe inline sizes +			 * will get bigger later +			 */ +			ret = cow_file_range_inline(trans, root, inode, +						    start, end, 0, NULL); +		} else { +			ret = cow_file_range_inline(trans, root, inode, +						    start, end, +						    total_compressed, pages); +		} +		if (ret == 0) { +			extent_clear_unlock_delalloc(inode, +						     &BTRFS_I(inode)->io_tree, +						     start, end, NULL, +						     1, 1, 1); +			*page_started = 1; +			ret = 0; +			goto free_pages_out; +		} +	} + +	if (will_compress) { +		/* +		 * we aren't doing an inline extent round the compressed size +		 * up to a block size boundary so the allocator does sane +		 * things +		 */ +		total_compressed = (total_compressed + blocksize - 1) & +			~(blocksize - 1); + +		/* +		 * one last check to make sure the compression is really a +		 * win, compare the page count read with the blocks on disk +		 */ +		total_in = (total_in + PAGE_CACHE_SIZE - 1) & +			~(PAGE_CACHE_SIZE - 1); +		if (total_compressed >= total_in) { +			will_compress = 0; +		} else { +			disk_num_bytes = total_compressed; +			num_bytes = total_in; +		} +	} +	if (!will_compress && pages) { +		/* +		 * the compression code ran but failed to make things smaller, +		 * free any pages it allocated and our page pointer array +		 */ +		for (i = 0; i < nr_pages_ret; i++) { +			page_cache_release(pages[i]); +		} +		kfree(pages); +		pages = NULL; +		total_compressed = 0; +		nr_pages_ret = 0; + +		/* flag the file so we don't compress in the future */ +		btrfs_set_flag(inode, NOCOMPRESS); +	} + +	BUG_ON(disk_num_bytes > +	       btrfs_super_total_bytes(&root->fs_info->super_copy)); -	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));  	mutex_lock(&BTRFS_I(inode)->extent_mutex);  	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);  	mutex_unlock(&BTRFS_I(inode)->extent_mutex); -	while(num_bytes > 0) { -		cur_alloc_size = min(num_bytes, root->fs_info->max_extent); +	while(disk_num_bytes > 0) { +		unsigned long min_bytes; + +		/* +		 * the max size of a compressed extent is pretty small, +		 * make the code a little less complex by forcing +		 * the allocator to find a whole compressed extent at once +		 */ +		if (will_compress) +			min_bytes = disk_num_bytes; +		else +			min_bytes = root->sectorsize; + +		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);  		ret = btrfs_reserve_extent(trans, root, cur_alloc_size, -					   root->sectorsize, 0, alloc_hint, +					   min_bytes, 0, alloc_hint,  					   (u64)-1, &ins, 1);  		if (ret) {  			WARN_ON(1); -			goto out; +			goto free_pages_out_fail;  		}  		em = alloc_extent_map(GFP_NOFS);  		em->start = start; -		em->len = ins.offset; + +		if (will_compress) { +			ram_size = num_bytes; +			em->len = num_bytes; +		} else { +			/* ramsize == disk size */ +			ram_size = ins.offset; +			em->len = ins.offset; +		} +  		em->block_start = ins.objectid; +		em->block_len = ins.offset;  		em->bdev = root->fs_info->fs_devices->latest_bdev; +  		mutex_lock(&BTRFS_I(inode)->extent_mutex);  		set_bit(EXTENT_FLAG_PINNED, &em->flags); + +		if (will_compress) +			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +  		while(1) {  			spin_lock(&em_tree->lock);  			ret = add_extent_mapping(em_tree, em); @@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)  				break;  			}  			btrfs_drop_extent_cache(inode, start, -						start + ins.offset - 1, 0); +						start + ram_size - 1, 0);  		}  		
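A note on the size test a few lines up: whether compression "is really a win" is decided on rounded sizes, not raw byte counts, because the allocator reserves whole blocks and page IO moves whole pages. A standalone restatement of that check (hypothetical helper name; 4096UL is an assumption standing in for PAGE_CACHE_SIZE):

#include <stdbool.h>

#define SK_PAGE_SIZE 4096UL /* assumption standing in for PAGE_CACHE_SIZE */

static bool compression_wins(unsigned long total_in,
			     unsigned long total_compressed,
			     unsigned long blocksize)
{
	/* the allocator hands out whole blocks, so round the compressed
	 * size up to a block boundary before comparing */
	total_compressed = (total_compressed + blocksize - 1) &
			   ~(blocksize - 1);

	/* IO happens in whole pages, so weigh against the input rounded
	 * up to a page boundary */
	total_in = (total_in + SK_PAGE_SIZE - 1) & ~(SK_PAGE_SIZE - 1);

	return total_compressed < total_in;
}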
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ins.offset, 0);
+					       ram_size, cur_alloc_size, 0,
+					       will_compress);
 		BUG_ON(ret);
-		if (num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+		if (disk_num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
 			       cur_alloc_size);
 			break;
 		}
+
+		if (will_compress) {
+			/*
+			 * we're doing compression, and we need to
+			 * submit the compressed extents down to the device.
+			 *
+			 * We lock down all the file pages, clearing their
+			 * dirty bits and setting them writeback.  Everyone
+			 * that wants to modify the page will wait on the
+			 * ordered extent above.
+			 *
+			 * The writeback bits on the file pages are
+			 * cleared when the compressed pages are on disk
+			 */
+			btrfs_end_transaction(trans, root);
+
+			if (start <= page_offset(locked_page) &&
+			    page_offset(locked_page) < start + ram_size) {
+				*page_started = 1;
+			}
+
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start,
+						     start + ram_size - 1,
+						     NULL, 1, 1, 0);
+
+			ret = btrfs_submit_compressed_write(inode, start,
+						 ram_size, ins.objectid,
+						 cur_alloc_size, pages,
+						 nr_pages_ret);
+
+			BUG_ON(ret);
+			trans = btrfs_join_transaction(root, 1);
+			if (start + ram_size < end) {
+				start += ram_size;
+				alloc_hint = ins.objectid + ins.offset;
+				/* pages will be freed at end_bio time */
+				pages = NULL;
+				goto again;
+			} else {
+				/* we've written everything, time to go */
+				break;
+			}
+		}
+		/* we're not doing compressed IO, don't unlock the first
+		 * page (which the caller expects to stay locked), don't
+		 * clear any dirty bits and don't set any writeback bits
+		 */
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					     start, start + ram_size - 1,
+					     locked_page, 0, 0, 0);
+		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+
+	ret = 0;
 out:
 	btrfs_end_transaction(trans, root);
+
 	return ret;
+
+free_pages_out_fail:
+	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+				     start, end, locked_page, 0, 0, 0);
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++)
+		page_cache_release(pages[i]);
+	if (pages)
+		kfree(pages);
+
+	goto out;
 }
 
 /*
@@ -203,7 +591,8 @@ out:
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	u64 extent_start;
 	u64 extent_end;
@@ -260,6 +649,11 @@ again:
 		extent_end = extent_start + extent_num_bytes;
 		err = 0;
+		if (btrfs_file_extent_compression(leaf, item) ||
+		    btrfs_file_extent_encryption(leaf,item) ||
+		    btrfs_file_extent_other_encoding(leaf, item))
+			goto not_found;
+
 		if (loops && start != extent_start)
 			goto not_found;
@@ -284,7 +678,8 @@ again:
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		extent_num_bytes = min(end + 1, extent_end) - start;
 		ret = btrfs_add_ordered_extent(inode, start, bytenr,
-						extent_num_bytes, 1);
+						extent_num_bytes,
+						extent_num_bytes, 1, 0);
 		if (ret) {
			err = ret;  			goto out; @@ -300,7 +695,8 @@ again:  not_found:  		btrfs_end_transaction(trans, root);  		btrfs_free_path(path); -		return cow_file_range(inode, start, end); +		return cow_file_range(inode, locked_page, start, end, +				      page_started);  	}  out:  	WARN_ON(err); @@ -312,16 +708,19 @@ out:  /*   * extent_io.c call back to do delayed allocation processing   */ -static int run_delalloc_range(struct inode *inode, u64 start, u64 end) +static int run_delalloc_range(struct inode *inode, struct page *locked_page, +			      u64 start, u64 end, int *page_started)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret;  	if (btrfs_test_opt(root, NODATACOW) ||  	    btrfs_test_flag(inode, NODATACOW)) -		ret = run_delalloc_nocow(inode, start, end); +		ret = run_delalloc_nocow(inode, locked_page, start, end, +					 page_started);  	else -		ret = cow_file_range(inode, start, end); +		ret = cow_file_range(inode, locked_page, start, end, +				     page_started);  	return ret;  } @@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,   * we don't create bios that span stripes or chunks   */  int btrfs_merge_bio_hook(struct page *page, unsigned long offset, -			 size_t size, struct bio *bio) +			 size_t size, struct bio *bio, +			 unsigned long bio_flags)  {  	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;  	struct btrfs_mapping_tree *map_tree; @@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,   * are inserted into the btree   */  int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, -			  int mirror_num) +			  int mirror_num, unsigned long bio_flags)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret = 0; @@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,   * or reading the csums from the tree before a read   */  int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, -			  int mirror_num) +			  int mirror_num, unsigned long bio_flags)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret = 0; @@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	if (!(rw & (1 << BIO_RW))) {  		btrfs_lookup_bio_sums(root, inode, bio); + +		if (bio_flags & EXTENT_BIO_COMPRESSED) { +			return btrfs_submit_compressed_read(inode, bio, +						    mirror_num, bio_flags); +		} +  		goto mapit;  	}  	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,  				   inode, rw, bio, mirror_num, -				   __btrfs_submit_bio_hook); +				   bio_flags, __btrfs_submit_bio_hook);  mapit:  	return btrfs_map_bio(root, rw, bio, mirror_num, 0);  } @@ -539,7 +945,7 @@ out_page:   * good idea.  This causes problems because we want to make sure COW   * properly happens and the data=ordered rules are followed.   * - * In our case any range that doesn't have the EXTENT_ORDERED bit set + * In our case any range that doesn't have the ORDERED bit set   * hasn't been properly setup for IO.  We kick off an async process   * to fix it up.  The async helper will wait for ordered extents, set   * the delalloc bit and make it safe to write the page. 
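The hook changes above thread a bio_flags word through every submission path, so a read of a compressed extent is diverted to btrfs_submit_compressed_read() instead of being mapped straight to the device, and the retry in btrfs_io_failed_hook below rebuilds the same flag from the extent map. A reduced sketch of that routing decision (boolean parameters stand in for the kernel's rw bits and extent-map flags; the flag value mirrors EXTENT_BIO_COMPRESSED):

#include <stdbool.h>

#define SK_EXTENT_BIO_COMPRESSED 1UL /* mirrors EXTENT_BIO_COMPRESSED */

/* the flag is derived once from the extent map and then travels with
 * the bio through submit_bio_hook, merge_bio_hook and the retry path */
static unsigned long sk_bio_flags(bool em_compressed)
{
	return em_compressed ? SK_EXTENT_BIO_COMPRESSED : 0;
}

/* reads of compressed extents take the decompress path; writes and
 * ordinary reads are mapped directly */
static bool sk_use_compressed_read_path(bool is_write,
					unsigned long bio_flags)
{
	return !is_write && (bio_flags & SK_EXTENT_BIO_COMPRESSED);
}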
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,  					  ordered_extent->start);  	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, -					     ordered_extent->len); +					     ordered_extent->disk_len);  	btrfs_set_file_extent_offset(leaf, extent_item, 0); + +	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) +		btrfs_set_file_extent_compression(leaf, extent_item, 1); +	else +		btrfs_set_file_extent_compression(leaf, extent_item, 0); +	btrfs_set_file_extent_encryption(leaf, extent_item, 0); +	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0); + +	/* ram bytes = extent_num_bytes for now */  	btrfs_set_file_extent_num_bytes(leaf, extent_item,  					ordered_extent->len); +	btrfs_set_file_extent_ram_bytes(leaf, extent_item, +					ordered_extent->len);  	btrfs_mark_buffer_dirty(leaf);  	btrfs_drop_extent_cache(inode, ordered_extent->file_offset, @@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)  	mutex_unlock(&BTRFS_I(inode)->extent_mutex);  	ins.objectid = ordered_extent->start; -	ins.offset = ordered_extent->len; +	ins.offset = ordered_extent->disk_len;  	ins.type = BTRFS_EXTENT_ITEM_KEY;  	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,  					  root->root_key.objectid, @@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,  	int ret;  	int rw;  	u64 logical; +	unsigned long bio_flags = 0;  	ret = get_state_private(failure_tree, start, &private);  	if (ret) { @@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,  		}  		logical = start - em->start;  		logical = em->block_start + logical; +		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) +			bio_flags = EXTENT_BIO_COMPRESSED;  		failrec->logical = logical;  		free_extent_map(em);  		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | @@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,  		rw = READ;  	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, -						      failrec->last_mirror); +						      failrec->last_mirror, +						      bio_flags);  	return 0;  } @@ -1644,10 +2065,8 @@ search_again:  				item_end +=  				    btrfs_file_extent_num_bytes(leaf, fi);  			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { -				struct btrfs_item *item = btrfs_item_nr(leaf, -							        path->slots[0]);  				item_end += btrfs_file_extent_inline_len(leaf, -									 item); +									 fi);  			}  			item_end--;  		} @@ -1715,7 +2134,14 @@ search_again:  				root_owner = btrfs_header_owner(leaf);  			}  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { -			if (!del_item) { +			/* +			 * we can't truncate inline items that have had +			 * special encodings +			 */ +			if (!del_item && +			    btrfs_file_extent_compression(leaf, fi) == 0 && +			    btrfs_file_extent_encryption(leaf, fi) == 0 && +			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {  				u32 size = new_size - found_key.offset;  				if (root->ref_cows) { @@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)  			err = btrfs_insert_file_extent(trans, root,  						       inode->i_ino,  						       hole_start, 0, 0, -						       hole_size, 0); +						       hole_size, 0, hole_size, +						       0, 0, 0);  			btrfs_drop_extent_cache(inode, hole_start,  						(u64)-1, 0);  			btrfs_check_file(root, inode); @@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree 
*em_tree,  	start_diff = map_start - em->start;  	em->start = map_start;  	em->len = map_len; -	if (em->block_start < EXTENT_MAP_LAST_BYTE) +	if (em->block_start < EXTENT_MAP_LAST_BYTE && +	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {  		em->block_start += start_diff; +		em->block_len -= start_diff; +	}  	return add_extent_mapping(em_tree, em);  } +static noinline int uncompress_inline(struct btrfs_path *path, +				      struct inode *inode, struct page *page, +				      size_t pg_offset, u64 extent_offset, +				      struct btrfs_file_extent_item *item) +{ +	int ret; +	struct extent_buffer *leaf = path->nodes[0]; +	char *tmp; +	size_t max_size; +	unsigned long inline_size; +	unsigned long ptr; + +	WARN_ON(pg_offset != 0); +	max_size = btrfs_file_extent_ram_bytes(leaf, item); +	inline_size = btrfs_file_extent_inline_item_len(leaf, +					btrfs_item_nr(leaf, path->slots[0])); +	tmp = kmalloc(inline_size, GFP_NOFS); +	ptr = btrfs_file_extent_inline_start(item); + +	read_extent_buffer(leaf, tmp, ptr, inline_size); + +	max_size = min(PAGE_CACHE_SIZE, max_size); +	ret = btrfs_zlib_decompress(tmp, page, extent_offset, +				    inline_size, max_size); +	if (ret) { +		char *kaddr = kmap_atomic(page, KM_USER0); +		unsigned long copy_size = min_t(u64, +				  PAGE_CACHE_SIZE - pg_offset, +				  max_size - extent_offset); +		memset(kaddr + pg_offset, 0, copy_size); +		kunmap_atomic(kaddr, KM_USER0); +	} +	kfree(tmp); +	return 0; +} +  /*   * a bit scary, this does extent mapping from logical file offset to the disk.   * the ugly parts come from merging extents from the disk with the @@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_trans_handle *trans = NULL; +	int compressed;  again:  	spin_lock(&em_tree->lock); @@ -2951,6 +3418,7 @@ again:  	em->bdev = root->fs_info->fs_devices->latest_bdev;  	em->start = EXTENT_MAP_HOLE;  	em->len = (u64)-1; +	em->block_len = (u64)-1;  	if (!path) {  		path = btrfs_alloc_path(); @@ -2983,6 +3451,7 @@ again:  	found_type = btrfs_file_extent_type(leaf, item);  	extent_start = found_key.offset; +	compressed = btrfs_file_extent_compression(leaf, item);  	if (found_type == BTRFS_FILE_EXTENT_REG) {  		extent_end = extent_start +  		       btrfs_file_extent_num_bytes(leaf, item); @@ -3005,10 +3474,18 @@ again:  			em->block_start = EXTENT_MAP_HOLE;  			goto insert;  		} -		bytenr += btrfs_file_extent_offset(leaf, item); -		em->block_start = bytenr;  		em->start = extent_start;  		em->len = extent_end - extent_start; +		if (compressed) { +			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +			em->block_start = bytenr; +			em->block_len = btrfs_file_extent_disk_num_bytes(leaf, +									 item); +		} else { +			bytenr += btrfs_file_extent_offset(leaf, item); +			em->block_start = bytenr; +			em->block_len = em->len; +		}  		goto insert;  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		u64 page_start; @@ -3018,8 +3495,7 @@ again:  		size_t extent_offset;  		size_t copy_size; -		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, -						    path->slots[0])); +		size = btrfs_file_extent_inline_len(leaf, item);  		extent_end = (extent_start + size + root->sectorsize - 1) &  			~((u64)root->sectorsize - 1);  		if (start < extent_start || start >= extent_end) { @@ -3035,9 +3511,10 @@ again:  		}  		em->block_start = EXTENT_MAP_INLINE; -		if (!page) { +		if (!page || 
create) {  			em->start = extent_start; -			em->len = size; +			em->len = (size + root->sectorsize - 1) & +			~((u64)root->sectorsize - 1);  			goto out;  		} @@ -3048,11 +3525,22 @@ again:  		em->start = extent_start + extent_offset;  		em->len = (copy_size + root->sectorsize - 1) &  			~((u64)root->sectorsize - 1); -		map = kmap(page); +		if (compressed) +			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		ptr = btrfs_file_extent_inline_start(item) + extent_offset;  		if (create == 0 && !PageUptodate(page)) { -			read_extent_buffer(leaf, map + pg_offset, ptr, -					   copy_size); +			if (btrfs_file_extent_compression(leaf, item) == +			    BTRFS_COMPRESS_ZLIB) { +				ret = uncompress_inline(path, inode, page, +							pg_offset, +							extent_offset, item); +				BUG_ON(ret); +			} else { +				map = kmap(page); +				read_extent_buffer(leaf, map + pg_offset, ptr, +						   copy_size); +				kunmap(page); +			}  			flush_dcache_page(page);  		} else if (create && PageUptodate(page)) {  			if (!trans) { @@ -3063,11 +3551,12 @@ again:  				trans = btrfs_join_transaction(root, 1);  				goto again;  			} +			map = kmap(page);  			write_extent_buffer(leaf, map + pg_offset, ptr,  					    copy_size); +			kunmap(page);  			btrfs_mark_buffer_dirty(leaf);  		} -		kunmap(page);  		set_extent_uptodate(io_tree, em->start,  				    extent_map_end(em) - 1, GFP_NOFS);  		goto insert; @@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);  	btrfs_set_file_extent_type(leaf, ei,  				   BTRFS_FILE_EXTENT_INLINE); +	btrfs_set_file_extent_encryption(leaf, ei, 0); +	btrfs_set_file_extent_compression(leaf, ei, 0); +	btrfs_set_file_extent_other_encoding(leaf, ei, 0); +	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); +  	ptr = btrfs_file_extent_inline_start(ei);  	write_extent_buffer(leaf, symname, ptr, name_len);  	btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2eb6caba57c..b5745bb96d4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,   * inserted.   
*/  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, -			     u64 start, u64 len, int nocow) +			     u64 start, u64 len, u64 disk_len, int nocow, +			     int compressed)  {  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node; @@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	entry->file_offset = file_offset;  	entry->start = start;  	entry->len = len; +	entry->disk_len = disk_len;  	entry->inode = inode;  	if (nocow)  		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); +	if (compressed) +		set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);  	/* one ref for the tree */  	atomic_set(&entry->refs, 1); @@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,  	 * for pdflush to find them  	 */  	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); -	if (wait) +	if (wait) {  		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,  						 &entry->flags)); +	}  }  /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f50f8870a14..1ef464145d2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -66,6 +66,8 @@ struct btrfs_ordered_sum {  #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +  struct btrfs_ordered_extent {  	/* logical offset in the file */  	u64 file_offset; @@ -73,9 +75,12 @@ struct btrfs_ordered_extent {  	/* disk byte number */  	u64 start; -	/* length of the extent in bytes */ +	/* ram length of the extent in bytes */  	u64 len; +	/* extent length on disk */ +	u64 disk_len; +  	/* flags (described above) */  	unsigned long flags; @@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,  int btrfs_dec_test_ordered_pending(struct inode *inode,  				       u64 file_offset, u64 io_size);  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, -			     u64 start, u64 len, int nocow); +			     u64 start, u64 len, u64 disk_len, int nocow, +			     int compressed);  int btrfs_add_ordered_sum(struct inode *inode,  			  struct btrfs_ordered_extent *entry,  			  struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index bd9ab3e9a7f..64725c13aa1 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)  			if (btrfs_file_extent_type(l, fi) ==  			    BTRFS_FILE_EXTENT_INLINE) {  				printk("\t\tinline extent data size %u\n", -			           btrfs_file_extent_inline_len(l, item)); +			           btrfs_file_extent_inline_len(l, fi));  				break;  			}  			printk("\t\textent data disk bytenr %llu nr %llu\n",  			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),  			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); -			printk("\t\textent data offset %llu nr %llu\n", +			printk("\t\textent data offset %llu nr %llu ram %llu\n",  			  (unsigned long long)btrfs_file_extent_offset(l, fi), -			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); +			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi), +			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));  			break;  		case BTRFS_BLOCK_GROUP_ITEM_KEY:  			bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2e6039825b7..431fdf144b5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -47,6 +47,7 @@  #include "volumes.h"  #include "version.h"  #include "export.h" +#include 
"compression.h"  #define BTRFS_SUPER_MAGIC 0x9123683E @@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)  enum {  	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,  	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, -	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err, +	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,  };  static match_table_t tokens = { @@ -83,6 +84,7 @@ static match_table_t tokens = {  	{Opt_max_inline, "max_inline=%s"},  	{Opt_alloc_start, "alloc_start=%s"},  	{Opt_thread_pool, "thread_pool=%d"}, +	{Opt_compress, "compress"},  	{Opt_ssd, "ssd"},  	{Opt_noacl, "noacl"},  	{Opt_err, NULL}, @@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			btrfs_set_opt(info->mount_opt, NODATACOW);  			btrfs_set_opt(info->mount_opt, NODATASUM);  			break; +		case Opt_compress: +			printk(KERN_INFO "btrfs: use compression\n"); +			btrfs_set_opt(info->mount_opt, COMPRESS); +			break;  		case Opt_ssd:  			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");  			btrfs_set_opt(info->mount_opt, SSD); @@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)  	err = btrfs_interface_init();  	if (err)  		goto free_extent_map; +  	err = register_filesystem(&btrfs_fs_type);  	if (err)  		goto unregister_ioctl; @@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)  	unregister_filesystem(&btrfs_fs_type);  	btrfs_exit_sysfs();  	btrfs_cleanup_fs_uuids(); +	btrfs_zlib_exit();  }  module_init(init_btrfs_fs) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cf618cc8b34..e6d579053a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  	if (found_type == BTRFS_FILE_EXTENT_REG)  		extent_end = start + btrfs_file_extent_num_bytes(eb, item);  	else if (found_type == BTRFS_FILE_EXTENT_INLINE) { -		size = btrfs_file_extent_inline_len(eb, -						    btrfs_item_nr(eb, slot)); +		size = btrfs_file_extent_inline_len(eb, item);  		extent_end = (start + size + mask) & ~mask;  	} else {  		ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2eed7f91f51..7db4cfd03a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1816,6 +1816,7 @@ again:  	em->start = key.offset;  	em->len = *num_bytes;  	em->block_start = 0; +	em->block_len = em->len;  	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {  		ret = btrfs_add_system_chunk(trans, chunk_root, &key, @@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	em->start = logical;  	em->len = length;  	em->block_start = 0; +	em->block_len = em->len;  	map->num_stripes = num_stripes;  	map->io_width = btrfs_chunk_io_width(leaf, chunk); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000000..e99309180a1 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,637 @@ +/* + * Copyright (C) 2008 Oracle.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> + +/* Plan: call deflate() with avail_in == *sourcelen, +	avail_out = *dstlen - 12 and flush == Z_FINISH. +	If it doesn't manage to finish,	call it again with +	avail_in == 0 and avail_out set to the remaining 12 +	bytes for it to clean up. +   Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { +	z_stream inf_strm; +	z_stream def_strm; +	char *buf; +	struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ +	struct workspace *workspace; +	int ret; +	int cpus = num_online_cpus(); + +again: +	spin_lock(&workspace_lock); +	if (!list_empty(&idle_workspace)) { +		workspace = list_entry(idle_workspace.next, struct workspace, +				       list); +		list_del(&workspace->list); +		num_workspace--; +		spin_unlock(&workspace_lock); +		return workspace; + +	} +	spin_unlock(&workspace_lock); +	if (atomic_read(&alloc_workspace) > cpus) { +		DEFINE_WAIT(wait); +		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); +		if (atomic_read(&alloc_workspace) > cpus) +			schedule(); +		finish_wait(&workspace_wait, &wait); +		goto again; +	} +	atomic_inc(&alloc_workspace); +	workspace = kzalloc(sizeof(*workspace), GFP_NOFS); +	if (!workspace) { +		ret = -ENOMEM; +		goto fail; +	} + +	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); +	if (!workspace->def_strm.workspace) { +		ret = -ENOMEM; +		goto fail; +	} +	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); +	if (!workspace->inf_strm.workspace) { +		ret = -ENOMEM; +		goto fail_inflate; +	} +	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); +	if (!workspace->buf) { +		ret = -ENOMEM; +		goto fail_kmalloc; +	} +	return workspace; + +fail_kmalloc: +	vfree(workspace->inf_strm.workspace); +fail_inflate: +	vfree(workspace->def_strm.workspace); +fail: +	kfree(workspace); +	atomic_dec(&alloc_workspace); +	wake_up(&workspace_wait); +	return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ +	spin_lock(&workspace_lock); +	if (num_workspace < num_online_cpus()) { +		list_add_tail(&workspace->list, &idle_workspace); +		num_workspace++; +		spin_unlock(&workspace_lock); +		if (waitqueue_active(&workspace_wait)) +			wake_up(&workspace_wait); +		return 0; +	} +	spin_unlock(&workspace_lock); +	vfree(workspace->def_strm.workspace); +	vfree(workspace->inf_strm.workspace); +	kfree(workspace->buf); +	kfree(workspace); + +	atomic_dec(&alloc_workspace); +	if (waitqueue_active(&workspace_wait)) +		
wake_up(&workspace_wait); +	return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ +	struct workspace *workspace; +	while(!list_empty(&idle_workspace)) { +		workspace = list_entry(idle_workspace.next, struct workspace, +				       list); +		list_del(&workspace->list); +		vfree(workspace->def_strm.workspace); +		vfree(workspace->inf_strm.workspace); +		kfree(workspace->buf); +		kfree(workspace); +		atomic_dec(&alloc_workspace); +	} +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated.  There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read.  It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, +			      u64 start, unsigned long len, +			      struct page **pages, +			      unsigned long nr_dest_pages, +			      unsigned long *out_pages, +			      unsigned long *total_in, +			      unsigned long *total_out, +			      unsigned long max_out) +{ +	int ret; +	struct workspace *workspace; +	char *data_in; +	char *cpage_out; +	int nr_pages = 0; +	struct page *in_page = NULL; +	struct page *out_page = NULL; +	int out_written = 0; +	int in_read = 0; +	unsigned long bytes_left; + +	*out_pages = 0; +	*total_out = 0; +	*total_in = 0; + +	workspace = find_zlib_workspace(); +	if (!workspace) +		return -1; + +	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { +		printk(KERN_WARNING "deflateInit failed\n"); +		ret = -1; +		goto out; +	} + +	workspace->def_strm.total_in = 0; +	workspace->def_strm.total_out = 0; + +	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); +	data_in = kmap(in_page); + +	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); +	cpage_out = kmap(out_page); +	pages[0] = out_page; +	nr_pages = 1; + +	workspace->def_strm.next_in = data_in; +	workspace->def_strm.next_out = cpage_out; +	workspace->def_strm.avail_out = PAGE_CACHE_SIZE; +	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + +	out_written = 0; +	in_read = 0; + +	while (workspace->def_strm.total_in < len) { +		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); +		if (ret != Z_OK) { +			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", +			       ret); +			zlib_deflateEnd(&workspace->def_strm); +			ret = -1; +			goto out; +		} + +		/* we're making it bigger, give up */ +		if (workspace->def_strm.total_in > 8192 && +		    workspace->def_strm.total_in < +		    workspace->def_strm.total_out) { +			ret = -1; +			goto out; +		} +		/* we need another page for writing out.  
Test this +		 * before the total_in so we will pull in a new page for +		 * the stream end if required +		 */ +		if (workspace->def_strm.avail_out == 0) { +			kunmap(out_page); +			if (nr_pages == nr_dest_pages) { +				out_page = NULL; +				ret = -1; +				goto out; +			} +			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); +			cpage_out = kmap(out_page); +			pages[nr_pages] = out_page; +			nr_pages++; +			workspace->def_strm.avail_out = PAGE_CACHE_SIZE; +			workspace->def_strm.next_out = cpage_out; +		} +		/* we're all done */ +		if (workspace->def_strm.total_in >= len) +			break; + +		/* we've read in a full page, get a new one */ +		if (workspace->def_strm.avail_in == 0) { +			if (workspace->def_strm.total_out > max_out) +				break; + +			bytes_left = len - workspace->def_strm.total_in; +			kunmap(in_page); +			page_cache_release(in_page); + +			start += PAGE_CACHE_SIZE; +			in_page = find_get_page(mapping, +						start >> PAGE_CACHE_SHIFT); +			data_in = kmap(in_page); +			workspace->def_strm.avail_in = min(bytes_left, +							   PAGE_CACHE_SIZE); +			workspace->def_strm.next_in = data_in; +		} +	} +	workspace->def_strm.avail_in = 0; +	ret = zlib_deflate(&workspace->def_strm, Z_FINISH); +	zlib_deflateEnd(&workspace->def_strm); + +	if (ret != Z_STREAM_END) { +		ret = -1; +		goto out; +	} + +	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { +		ret = -1; +		goto out; +	} + +	ret = 0; +	*total_out = workspace->def_strm.total_out; +	*total_in = workspace->def_strm.total_in; +out: +	*out_pages = nr_pages; +	if (out_page) +		kunmap(out_page); + +	if (in_page) { +		kunmap(in_page); +		page_cache_release(in_page); +	} +	free_workspace(workspace); +	return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous.  They all correspond to the range of bytes covered by + * the compressed extent. 
+ */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, +			      u64 disk_start, +			      struct bio_vec *bvec, +			      int vcnt, +			      size_t srclen) +{ +	int ret = 0; +	int wbits = MAX_WBITS; +	struct workspace *workspace; +	char *data_in; +	size_t total_out = 0; +	unsigned long page_bytes_left; +	unsigned long page_in_index = 0; +	unsigned long page_out_index = 0; +	struct page *page_out; +	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / +					PAGE_CACHE_SIZE; +	unsigned long buf_start; +	unsigned long buf_offset; +	unsigned long bytes; +	unsigned long working_bytes; +	unsigned long pg_offset; +	unsigned long start_byte; +	unsigned long current_buf_start; +	char *kaddr; + +	workspace = find_zlib_workspace(); +	if (!workspace) +		return -ENOMEM; + +	data_in = kmap(pages_in[page_in_index]); +	workspace->inf_strm.next_in = data_in; +	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE); +	workspace->inf_strm.total_in = 0; + +	workspace->inf_strm.total_out = 0; +	workspace->inf_strm.next_out = workspace->buf; +	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; +	page_out = bvec[page_out_index].bv_page; +	page_bytes_left = PAGE_CACHE_SIZE; +	pg_offset = 0; + +	/* If it's deflate, and it's got no preset dictionary, then +	   we can tell zlib to skip the adler32 check. */ +	if (srclen > 2 && !(data_in[1] & PRESET_DICT) && +	    ((data_in[0] & 0x0f) == Z_DEFLATED) && +	    !(((data_in[0]<<8) + data_in[1]) % 31)) { + +		wbits = -((data_in[0] >> 4) + 8); +		workspace->inf_strm.next_in += 2; +		workspace->inf_strm.avail_in -= 2; +	} + +	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { +		printk(KERN_WARNING "inflateInit failed\n"); +		ret = -1; +		goto out; +	} +	while(workspace->inf_strm.total_in < srclen) { +		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); +		if (ret != Z_OK && ret != Z_STREAM_END) { +			break; +		} + +		/* +		 * buf start is the byte offset we're of the start of +		 * our workspace buffer +		 */ +		buf_start = total_out; + +		/* total_out is the last byte of the workspace buffer */ +		total_out = workspace->inf_strm.total_out; + +		working_bytes = total_out - buf_start; + +		/* +		 * start byte is the first byte of the page we're currently +		 * copying into relative to the start of the compressed data. 
+		 */
+		start_byte = page_offset(page_out) - disk_start;
+
+		if (working_bytes == 0) {
+			/* we didn't make progress in this inflate
+			 * call, we're done
+			 */
+			if (ret != Z_STREAM_END)
+				ret = -1;
+			break;
+		}
+
+		/* we haven't yet hit data corresponding to this page */
+		if (total_out <= start_byte)
+			goto next;
+
+		/*
+		 * the start of the data we care about is offset into
+		 * the middle of our working buffer
+		 */
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+			working_bytes -= buf_offset;
+		} else {
+			buf_offset = 0;
+		}
+		current_buf_start = buf_start;
+
+		/* copy bytes from the working buffer into the pages */
+		while (working_bytes > 0) {
+			bytes = min(PAGE_CACHE_SIZE - pg_offset,
+				    PAGE_CACHE_SIZE - buf_offset);
+			bytes = min(bytes, working_bytes);
+			kaddr = kmap_atomic(page_out, KM_USER0);
+			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+			       bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page_out);
+
+			pg_offset += bytes;
+			page_bytes_left -= bytes;
+			buf_offset += bytes;
+			working_bytes -= bytes;
+			current_buf_start += bytes;
+
+			/* check if we need to pick another page */
+			if (page_bytes_left == 0) {
+				page_out_index++;
+				if (page_out_index >= vcnt) {
+					ret = 0;
+					goto done;
+				}
+				page_out = bvec[page_out_index].bv_page;
+				pg_offset = 0;
+				page_bytes_left = PAGE_CACHE_SIZE;
+				start_byte = page_offset(page_out) - disk_start;
+
+				/*
+				 * make sure our new page is covered by this
+				 * working buffer
+				 */
+				if (total_out <= start_byte)
+					goto next;
+
+				/* the next page in the biovec might not
+				 * be adjacent to the last page, but it
+				 * might still be found inside this working
+				 * buffer.  bump our offset pointer
+				 */
+				if (total_out > start_byte &&
+				    current_buf_start < start_byte) {
+					buf_offset = start_byte - buf_start;
+					working_bytes = total_out - start_byte;
+					current_buf_start = buf_start +
+						buf_offset;
+				}
+			}
+		}
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+		if (workspace->inf_strm.avail_in == 0) {
+			unsigned long tmp;
+			kunmap(pages_in[page_in_index]);
+			page_in_index++;
+			if (page_in_index >= total_pages_in) {
+				data_in = NULL;
+				break;
+			}
+			data_in = kmap(pages_in[page_in_index]);
+			workspace->inf_strm.next_in = data_in;
+			tmp = srclen - workspace->inf_strm.total_in;
+			workspace->inf_strm.avail_in = min(tmp,
+							   PAGE_CACHE_SIZE);
+		}
+	}
+	if (ret != Z_STREAM_END)
+		ret = -1;
+	else
+		ret = 0;
+done:
+	zlib_inflateEnd(&workspace->inf_strm);
+	if (data_in)
+		kunmap(pages_in[page_in_index]);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the uncompressed data we're interested in
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	unsigned long bytes_left = destlen;
+	unsigned long total_out = 0;
+	char *kaddr;
+
+	if (destlen > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	workspace = find_zlib_workspace();
+	if (!workspace)
+		return -ENOMEM;
+
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = srclen;
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->inf_strm.total_out = 0;
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	while (bytes_left > 0) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+		unsigned long pg_offset = 0;
+
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END)
+			break;
+
+		buf_start = total_out;
+		total_out = workspace->inf_strm.total_out;
+
+		if (total_out == buf_start) {
+			ret = -1;
+			break;
+		}
+
+		if (total_out <= start_byte)
+			goto next;
+
+		if (total_out > start_byte && buf_start < start_byte)
+			buf_offset = start_byte - buf_start;
+		else
+			buf_offset = 0;
+
+		bytes = min(PAGE_CACHE_SIZE - pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, bytes_left);
+
+		kaddr = kmap_atomic(dest_page, KM_USER0);
+		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		pg_offset += bytes;
+		bytes_left -= bytes;
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	}
+	if (ret != Z_STREAM_END && bytes_left != 0)
+		ret = -1;
+	else
+		ret = 0;
+	zlib_inflateEnd(&workspace->inf_strm);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+	free_workspaces();
+}
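Editor's note on the offset bookkeeping in btrfs_zlib_decompress_biovec(), with hypothetical numbers chosen only to make the arithmetic concrete: assume disk_start is 0 and 4096-byte pages. If one zlib_inflate() call advances total_out from 5000 to 9096, the workspace buffer holds uncompressed bytes [5000, 9096), so buf_start = 5000 and working_bytes = 4096. For an output page at file offset 8192, start_byte = 8192 - 0 = 8192. Because buf_start < start_byte < total_out, the copy begins at buf_offset = 8192 - 5000 = 3192 within the workspace buffer and moves working_bytes = 4096 - 3192 = 904 bytes into the page at pg_offset 0; the remaining 3192 bytes of that page arrive from later inflate calls.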
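The two-byte header test that both decompression routines perform is plain RFC 1950 arithmetic, not anything kernel-specific. The sketch below shows the same check against ordinary userspace zlib; it is illustrative only: the helper name init_raw_inflate() is invented, inflateInit2() and Z_DEFLATED come from the userspace <zlib.h> rather than the kernel's zlib_* wrappers, and 0x20 is the FDICT bit that the kernel code spells PRESET_DICT.

/*
 * Sketch: detect a zlib (RFC 1950) header and, when there is no preset
 * dictionary, strip it and inflate in raw-deflate mode.  A negative
 * windowBits tells inflate() the stream has no zlib wrapper, so there
 * is no adler32 trailer to verify.
 */
#include <string.h>
#include <zlib.h>

static int init_raw_inflate(z_stream *strm, unsigned char *src, size_t srclen)
{
	int wbits = MAX_WBITS;

	memset(strm, 0, sizeof(*strm));	/* zalloc/zfree/opaque become Z_NULL */
	strm->next_in = src;
	strm->avail_in = srclen;

	if (srclen > 2 && !(src[1] & 0x20) &&		/* no FDICT bit */
	    (src[0] & 0x0f) == Z_DEFLATED &&		/* method is deflate */
	    ((src[0] << 8) + src[1]) % 31 == 0) {	/* header check passes */
		/* high nibble of byte 0 is log2(window size) - 8 */
		wbits = -((src[0] >> 4) + 8);
		strm->next_in += 2;	/* skip the two header bytes */
		strm->avail_in -= 2;
	}
	return inflateInit2(strm, wbits);
}

Build with -lz; once this returns Z_OK, the usual inflate()/inflateEnd() calls follow. Skipping the adler32 verification is reasonable here because btrfs separately checksums the compressed bytes it reads from disk.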