Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	384
1 file changed, 128 insertions, 256 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 60a26f3a6f8..f9c642b22ef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -78,16 +78,14 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
  * but there may still be a record of it in the journal, and that record
  * still needs to be revoked.
  *
- * If the handle isn't valid we're not journaling so there's nothing to do.
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 
-	if (!ext4_handle_valid(handle))
-		return 0;
-
 	might_sleep();
 
 	BUFFER_TRACE(bh, "enter");
@@ -1513,14 +1511,14 @@ retry:
 		 * Add inode to orphan list in case we crash before
 		 * truncate finishes
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			ext4_orphan_add(handle, inode);
 
 		ext4_journal_stop(handle);
 		if (pos + len > inode->i_size) {
-			vmtruncate(inode, inode->i_size);
+			ext4_truncate(inode);
 			/*
-			 * If vmtruncate failed early the inode might
+			 * If truncate failed early the inode might
 			 * still be on the orphan list; we need to
 			 * make sure the inode is removed from the
 			 * orphan list in that case.
@@ -1614,7 +1612,7 @@ static int ext4_ordered_write_end(struct file *file,
 		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			/* if we have allocated more blocks and copied
 			 * less. We will have blocks allocated outside
 			 * inode->i_size. So truncate them
@@ -1628,9 +1626,9 @@ static int ext4_ordered_write_end(struct file *file,
 		ret = ret2;
 
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
@@ -1655,7 +1653,7 @@ static int ext4_writeback_write_end(struct file *file,
 	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1670,9 +1668,9 @@ static int ext4_writeback_write_end(struct file *file,
 		ret = ret2;
 
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
@@ -1722,7 +1720,7 @@ static int ext4_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1733,9 +1731,9 @@ static int ext4_journalled_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 	if (pos + len > inode->i_size) {
-		vmtruncate(inode, inode->i_size);
+		ext4_truncate(inode);
 		/*
-		 * If vmtruncate failed early the inode might still be
+		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
@@ -2305,15 +2303,9 @@ flush_it:
 	return;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
-	/*
-	 * unmapped buffer is possible for holes.
-	 * delay buffer is possible with delayed allocation.
-	 * We also need to consider unwritten buffer as unmapped.
-	 */
-	return (!buffer_mapped(bh) || buffer_delay(bh) ||
-				buffer_unwritten(bh)) && buffer_dirty(bh);
+	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 
 /*
@@ -2398,9 +2390,9 @@ static int __mpage_da_writepage(struct page *page,
 			 * We need to try to allocate
 			 * unmapped blocks in the same page.
 			 * Otherwise we won't make progress
-			 * with the page in ext4_da_writepage
+			 * with the page in ext4_writepage
 			 */
-			if (ext4_bh_unmapped_or_delay(NULL, bh)) {
+			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 				mpage_add_bh_to_extent(mpd, logical,
 						       bh->b_size,
 						       bh->b_state);
@@ -2517,7 +2509,6 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 	 * so call get_block_wrap with create = 0
 	 */
 	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-	BUG_ON(create && ret == 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -2525,15 +2516,102 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 	return ret;
 }
 
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+	get_bh(bh);
+	return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+	put_bh(bh);
+	return 0;
+}
+
+static int __ext4_journalled_writepage(struct page *page,
+				       struct writeback_control *wbc,
+				       unsigned int len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	page_bufs = page_buffers(page);
+	BUG_ON(!page_bufs);
+	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				do_journal_get_write_access);
+
+	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				write_end_fn);
+	if (ret == 0)
+		ret = err;
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+
+	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+out:
+	return ret;
+}
+
 /*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via pdflush (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other bufer_heads would be unmapped but dirty(dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as show below.
+ *
+ *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *		ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also have the dirty flag cleared so we don't get recurive page_lock.
 */
-static int ext4_da_writepage(struct page *page,
-				struct writeback_control *wbc)
+static int ext4_writepage(struct page *page,
+			  struct writeback_control *wbc)
 {
 	int ret = 0;
 	loff_t size;
@@ -2541,7 +2619,7 @@ static int ext4_da_writepage(struct page *page,
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
 
-	trace_ext4_da_writepage(inode, page);
+	trace_ext4_writepage(inode, page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2551,7 +2629,7 @@ static int ext4_da_writepage(struct page *page,
 	if (page_has_buffers(page)) {
 		page_bufs = page_buffers(page);
 		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-					ext4_bh_unmapped_or_delay)) {
+					ext4_bh_delay_or_unwritten)) {
 			/*
 			 * We don't want to do  block allocation
 			 * So redirty the page and return
@@ -2578,13 +2656,13 @@ static int ext4_da_writepage(struct page *page,
 		 * all are mapped and non delay. We don't want to
 		 * do block allocation here.
 		 */
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+		ret = block_prepare_write(page, 0, len,
 					  noalloc_get_block_write);
 		if (!ret) {
 			page_bufs = page_buffers(page);
 			/* check whether all are mapped and non delay */
 			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-						ext4_bh_unmapped_or_delay)) {
+						ext4_bh_delay_or_unwritten)) {
 				redirty_page_for_writepage(wbc, page);
 				unlock_page(page);
 				return 0;
@@ -2600,7 +2678,16 @@ static int ext4_da_writepage(struct page *page,
 			return 0;
 		}
 		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, PAGE_CACHE_SIZE);
+		block_commit_write(page, 0, len);
+	}
+
+	if (PageChecked(page) && ext4_should_journal_data(inode)) {
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		ClearPageChecked(page);
+		return __ext4_journalled_writepage(page, wbc, len);
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2907,7 +2994,7 @@ retry:
 		 * i_size_read because we hold i_mutex.
 		 */
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext4_truncate(inode);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3130,222 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, ext4_get_block);
 }
 
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-	get_bh(bh);
-	return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-	put_bh(bh);
-	return 0;
-}
-
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * In all journaling modes block_write_full_page() will start the I/O.
- *
- * Problem:
- *
- *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *		ext4_writepage()
- *
- * Similar for:
- *
- *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_data_sem
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *	    non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- *
- */
-static int __ext4_normal_writepage(struct page *page,
-				   struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-
-	if (test_opt(inode->i_sb, NOBH))
-		return nobh_writepage(page, noalloc_get_block_write, wbc);
-	else
-		return block_write_full_page(page, noalloc_get_block_write,
-					     wbc);
-}
-
-static int ext4_normal_writepage(struct page *page,
-				 struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t size = i_size_read(inode);
-	loff_t len;
-
-	trace_ext4_normal_writepage(inode, page);
-	J_ASSERT(PageLocked(page));
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
-	else
-		len = PAGE_CACHE_SIZE;
-
-	if (page_has_buffers(page)) {
-		/* if page has buffers it should all be mapped
-		 * and allocated. If there are not buffers attached
-		 * to the page we know the page is dirty but it lost
-		 * buffers. That means that at some moment in time
-		 * after write_begin() / write_end() has been called
-		 * all buffers have been clean and thus they must have been
-		 * written at least once. So they are all mapped and we can
-		 * happily proceed with mapping them and writing the page.
-		 */
-		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-					ext4_bh_unmapped_or_delay));
-	}
-
-	if (!ext4_journal_current_handle())
-		return __ext4_normal_writepage(page, wbc);
-
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-}
-
-static int __ext4_journalled_writepage(struct page *page,
-				       struct writeback_control *wbc)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-				  noalloc_get_block_write);
-	if (ret != 0)
-		goto out_unlock;
-
-	page_bufs = page_buffers(page);
-	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-								bget_one);
-	/* As soon as we unlock the page, it can go away, but we have
-	 * references to buffers so we are safe */
-	unlock_page(page);
-
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
-	}
-
-	ret = walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
-	err = walk_page_buffers(handle, page_bufs, 0,
-				PAGE_CACHE_SIZE, NULL, write_end_fn);
-	if (ret == 0)
-		ret = err;
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-
-	walk_page_buffers(handle, page_bufs, 0,
-				PAGE_CACHE_SIZE, NULL, bput_one);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-	goto out;
-
-out_unlock:
-	unlock_page(page);
-out:
-	return ret;
-}
-
-static int ext4_journalled_writepage(struct page *page,
-				     struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t size = i_size_read(inode);
-	loff_t len;
-
-	trace_ext4_journalled_writepage(inode, page);
-	J_ASSERT(PageLocked(page));
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
-	else
-		len = PAGE_CACHE_SIZE;
-
-	if (page_has_buffers(page)) {
-		/* if page has buffers it should all be mapped
-		 * and allocated. If there are not buffers attached
-		 * to the page we know the page is dirty but it lost
-		 * buffers. That means that at some moment in time
-		 * after write_begin() / write_end() has been called
-		 * all buffers have been clean and thus they must have been
-		 * written at least once. So they are all mapped and we can
-		 * happily proceed with mapping them and writing the page.
-		 */
-		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-					ext4_bh_unmapped_or_delay));
-	}
-
-	if (ext4_journal_current_handle())
-		goto no_write;
-
-	if (PageChecked(page)) {
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
-		ClearPageChecked(page);
-		return __ext4_journalled_writepage(page, wbc);
-	} else {
-		/*
-		 * It may be a page full of checkpoint-mode buffers.  We don't
-		 * really know unless we go poke around in the buffer_heads.
-		 * But block_write_full_page will do the right thing.
-		 */
-		return block_write_full_page(page, noalloc_get_block_write,
-					     wbc);
-	}
-no_write:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-}
-
 static int ext4_readpage(struct file *file, struct page *page)
 {
 	return mpage_readpage(page, ext4_get_block);
@@ -3492,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_normal_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
@@ -3507,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_normal_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
@@ -3522,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 static const struct address_space_operations ext4_journalled_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_journalled_writepage,
+	.writepage		= ext4_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
@@ -3536,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 static const struct address_space_operations ext4_da_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
-	.writepage		= ext4_da_writepage,
+	.writepage		= ext4_writepage,
 	.writepages		= ext4_da_writepages,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_da_write_begin,
@@ -3583,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle,
 	struct page *page;
 	int err = 0;
 
-	page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
 	if (!page)
 		return -EINVAL;
 
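Below is a small user-space sketch of the mmap write pattern that the new comment above ext4_writepage() describes (a partially mapped page produced via page_mkwrite). It is not part of the patch: the file name "f", the 4096-byte page size, and the assumption that the file lives on an ext4 filesystem made with 1K blocks are illustrative only.

/* User-space sketch of the partial-page mmap write scenario. */
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Assumes an ext4 filesystem created with 1K blocks. */
	int fd = open("f", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		exit(1);

	/* Only the first 1K block of the page exists at this point. */
	if (ftruncate(fd, 1024) < 0)
		exit(1);

	char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED)
		exit(1);

	/* Write fault: page_mkwrite() maps the first buffer_head. */
	a[0] = 'a';

	/*
	 * Extend the file. The remaining buffer_heads in the page stay
	 * unmapped, but the page is dirty, so writepage() must still write
	 * the first block without doing any block allocation.
	 */
	if (ftruncate(fd, 4096) < 0)
		exit(1);

	munmap(a, 4096);
	close(fd);
	return 0;
}

A later msync() or background writeback then reaches ext4_writepage() with only the first buffer_head of the page mapped, which is the case the unified function handles without allocating blocks.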