Diffstat (limited to 'fs/ext4')

 -rw-r--r--  fs/ext4/ext4.h           |   8
 -rw-r--r--  fs/ext4/extents.c        | 105
 -rw-r--r--  fs/ext4/extents_status.c | 212
 -rw-r--r--  fs/ext4/extents_status.h |   9
 -rw-r--r--  fs/ext4/ialloc.c         |   4
 -rw-r--r--  fs/ext4/inode.c          | 182
 -rw-r--r--  fs/ext4/mballoc.c        |  23
 -rw-r--r--  fs/ext4/move_extent.c    |  43
 -rw-r--r--  fs/ext4/page-io.c        |  12
 -rw-r--r--  fs/ext4/resize.c         |   4
 -rw-r--r--  fs/ext4/super.c          |   4

 11 files changed, 530 insertions(+), 76 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4a01ba31526..3b83cd60479 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -335,9 +335,9 @@ struct ext4_group_desc
  */

 struct flex_groups {
-	atomic_t free_inodes;
-	atomic_t free_clusters;
-	atomic_t used_dirs;
+	atomic64_t	free_clusters;
+	atomic_t	free_inodes;
+	atomic_t	used_dirs;
 };

 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 extern int __init ext4_init_pageio(void);
 extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_wait(struct inode *);
+extern void ext4_ioend_shutdown(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern void ext4_end_io_work(struct work_struct *work);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 28dd8eeea6a..56efcaadf84 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	unsigned short ext1_ee_len, ext2_ee_len, max_len;

 	/*
-	 * Make sure that either both extents are uninitialized, or
-	 * both are _not_.
+	 * Make sure that both extents are initialized. We don't merge
+	 * uninitialized extents so that we can be sure that end_io code has
+	 * the extent that was written properly split out and conversion to
+	 * initialized is trivial.
 	 */
-	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
 		return 0;

 	if (ext4_ext_is_uninitialized(ex1))
@@ -2923,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle,
 {
 	ext4_fsblk_t newblock;
 	ext4_lblk_t ee_block;
-	struct ext4_extent *ex, newex, orig_ex;
+	struct ext4_extent *ex, newex, orig_ex, zero_ex;
 	struct ext4_extent *ex2 = NULL;
 	unsigned int ee_len, depth;
 	int err = 0;
@@ -2943,6 +2945,10 @@ static int ext4_split_extent_at(handle_t *handle,
 	newblock = split - ee_block + ext4_ext_pblock(ex);

 	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+	BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
+			     EXT4_EXT_MARK_UNINIT1 |
+			     EXT4_EXT_MARK_UNINIT2));

 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
@@ -2990,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle,
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
-			if (split_flag & EXT4_EXT_DATA_VALID1)
+			if (split_flag & EXT4_EXT_DATA_VALID1) {
 				err = ext4_ext_zeroout(inode, ex2);
-			else
+				zero_ex.ee_block = ex2->ee_block;
+				zero_ex.ee_len = ext4_ext_get_actual_len(ex2);
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex2));
+			} else {
 				err = ext4_ext_zeroout(inode, ex);
-		} else
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = ext4_ext_get_actual_len(ex);
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+			}
+		} else {
 			err = ext4_ext_zeroout(inode, &orig_ex);
+			zero_ex.ee_block = orig_ex.ee_block;
+			zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex);
+			ext4_ext_store_pblock(&zero_ex,
+					      ext4_ext_pblock(&orig_ex));
+		}

 		if (err)
 			goto fix_extent_len;
@@ -3003,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle,
 		ex->ee_len = cpu_to_le16(ee_len);
 		ext4_ext_try_to_merge(handle, inode, path, ex);
 		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+		if (err)
+			goto fix_extent_len;
+
+		/* update extent status tree */
+		err = ext4_es_zeroout(inode, &zero_ex);
+
 		goto out;
 	} else if (err)
 		goto fix_extent_len;
@@ -3041,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle,
 	int err = 0;
 	int uninitialized;
 	int split_flag1, flags1;
+	int allocated = map->m_len;

 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
@@ -3060,20 +3087,29 @@ static int ext4_split_extent(handle_t *handle,
 				map->m_lblk + map->m_len, split_flag1, flags1);
 		if (err)
 			goto out;
+	} else {
+		allocated = ee_len - (map->m_lblk - ee_block);
 	}
-
+	/*
+	 * Update path is required because previous ext4_split_extent_at() may
+	 * result in split of original leaf or extent zeroout.
+	 */
 	ext4_ext_drop_refs(path);
 	path = ext4_ext_find_extent(inode, map->m_lblk, path);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	uninitialized = ext4_ext_is_uninitialized(ex);
+	split_flag1 = 0;

 	if (map->m_lblk >= ee_block) {
-		split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
-					    EXT4_EXT_DATA_VALID2);
-		if (uninitialized)
+		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		if (uninitialized) {
 			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
-		if (split_flag & EXT4_EXT_MARK_UNINIT2)
-			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+						     EXT4_EXT_MARK_UNINIT2);
+		}
 		err = ext4_split_extent_at(handle, inode, path,
 				map->m_lblk, split_flag1, flags);
 		if (err)
@@ -3082,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle,

 	ext4_ext_show_leaf(inode, path);
out:
-	return err ? err : map->m_len;
+	return err ? err : allocated;
 }

 /*
@@ -3137,6 +3173,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
+	zero_ex.ee_len = 0;

 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3227,13 +3264,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,

 	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
 		max_zeroout = sbi->s_extent_max_zeroout_kb >>
-			inode->i_sb->s_blocksize_bits;
+			(inode->i_sb->s_blocksize_bits - 10);

 	/* If extent is less than s_max_zeroout_kb, zeroout directly */
 	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 			goto out;
+		zero_ex.ee_block = ex->ee_block;
+		zero_ex.ee_len = ext4_ext_get_actual_len(ex);
+		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));

 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -3292,6 +3332,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		err = allocated;

out:
+	/* If we have gotten a failure, don't zero out status tree */
+	if (!err)
+		err = ext4_es_zeroout(inode, &zero_ex);
 	return err ? err : allocated;
 }

@@ -3374,8 +3417,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		  (unsigned long long)ee_block, ee_len);

-	/* If extent is larger than requested then split is required */
+	/* If extent is larger than requested it is a clear sign that we still
+	 * have some extent state machine issues left. So extent_split is still
+	 * required.
+	 * TODO: Once all related issues will be fixed this situation should be
+	 * illegal.
+	 */
 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+		ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+			     " len %u; IO logical block %llu, len %u\n",
+			     inode->i_ino, (unsigned long long)ee_block, ee_len,
+			     (unsigned long long)map->m_lblk, map->m_len);
+#endif
 		err = ext4_split_unwritten_extents(handle, inode, map, path,
 						   EXT4_GET_BLOCKS_CONVERT);
 		if (err < 0)
@@ -3626,6 +3680,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 						 path, map->m_len);
 		} else
 			err = ret;
+		map->m_flags |= EXT4_MAP_MAPPED;
+		if (allocated > map->m_len)
+			allocated = map->m_len;
+		map->m_len = allocated;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3675,6 +3733,7 @@ out:
 					allocated - map->m_len);
 		allocated = map->m_len;
 	}
+	map->m_len = allocated;

 	/*
 	 * If we have done fallocate with the offset that is already
@@ -4106,9 +4165,6 @@ got_allocated_blocks:
 			}
 		} else {
 			BUG_ON(allocated_clusters < reserved_clusters);
-			/* We will claim quota for all newly allocated blocks.*/
-			ext4_da_update_reserve_space(inode, allocated_clusters,
-							1);
 			if (reserved_clusters < allocated_clusters) {
 				struct ext4_inode_info *ei = EXT4_I(inode);
 				int reservation = allocated_clusters -
@@ -4159,6 +4215,15 @@ got_allocated_blocks:
 				ei->i_reserved_data_blocks += reservation;
 				spin_unlock(&ei->i_block_reservation_lock);
 			}
+			/*
+			 * We will claim quota for all newly allocated blocks.
+			 * We're updating the reserved space *after* the
+			 * correction above so we do not accidentally free
+			 * all the metadata reservation because we might
+			 * actually need it later on.
+			 */
+			ext4_da_update_reserve_space(inode, allocated_clusters,
+							1);
 		}
 	}
@@ -4368,8 +4433,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;

-	/* Prevent race condition between unwritten */
-	ext4_flush_unwritten_io(inode);
retry:
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk = map.m_lblk + ret;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 95796a1b752..fe3337a85ed 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 static int ext4_es_can_be_merged(struct extent_status *es1,
 				 struct extent_status *es2)
 {
-	if (es1->es_lblk + es1->es_len != es2->es_lblk)
+	if (ext4_es_status(es1) != ext4_es_status(es2))
 		return 0;

-	if (ext4_es_status(es1) != ext4_es_status(es2))
+	if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
 		return 0;

-	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
-	    (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
 		return 0;

-	return 1;
+	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+		return 1;
+
+	if (ext4_es_is_hole(es1))
+		return 1;
+
+	/* we need to check delayed extent is without unwritten status */
+	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+		return 1;
+
+	return 0;
 }

 static struct extent_status *
@@ -389,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 	return es;
 }

+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+					    struct extent_status *es)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	ext4_fsblk_t ee_start;
+	unsigned short ee_len;
+	int depth, ee_status, es_status;
+
+	path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+	if (IS_ERR(path))
+		return;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+
+	if (ex) {
+
+		ee_block = le32_to_cpu(ex->ee_block);
+		ee_start = ext4_ext_pblock(ex);
+		ee_len = ext4_ext_get_actual_len(ex);
+
+		ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+		/*
+		 * Make sure ex and es are not overlap when we try to insert
+		 * a delayed/hole extent.
+		 */
+		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+			if (in_range(es->es_lblk, ee_block, ee_len)) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu we can find an extent "
+					"at block [%d/%d/%llu/%c], but we "
+					"want to add an delayed/hole extent "
+					"[%d/%d/%llu/%llx]\n",
+					inode->i_ino, ee_block, ee_len,
+					ee_start, ee_status ? 'u' : 'w',
+					es->es_lblk, es->es_len,
+					ext4_es_pblock(es), ext4_es_status(es));
+			}
+			goto out;
+		}
+
+		/*
+		 * We don't check ee_block == es->es_lblk, etc. because es
+		 * might be a part of whole extent, vice versa.
+		 */
+		if (es->es_lblk < ee_block ||
+		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+			goto out;
+		}
+
+		if (ee_status ^ es_status) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+		}
+	} else {
+		/*
+		 * We can't find an extent on disk.  So we need to make sure
+		 * that we don't want to add an written/unwritten extent.
+		 */
+		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"can't find an extent at block %d but we want "
+				"to add an written/unwritten extent "
+				"[%d/%d/%llu/%llx]\n", inode->i_ino,
+				es->es_lblk, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+		}
+	}
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+					    struct extent_status *es)
+{
+	struct ext4_map_blocks map;
+	int retval;
+
+	/*
+	 * Here we call ext4_ind_map_blocks to lookup a block mapping because
+	 * 'Indirect' structure is defined in indirect.c.  So we couldn't
+	 * access direct/indirect tree from outside.  It is too dirty to define
+	 * this function in indirect.c file.
+	 */
+
+	map.m_lblk = es->es_lblk;
+	map.m_len = es->es_len;
+
+	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+	if (retval > 0) {
+		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+			/*
+			 * We want to add a delayed/hole extent but this
+			 * block has been allocated.
+			 */
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"We can find blocks but we want to add a "
+				"delayed/hole extent [%d/%d/%llu/%llx]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		} else if (ext4_es_is_written(es)) {
+			if (retval != es->es_len) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu retval %d != es_len %d\n",
+					inode->i_ino, retval, es->es_len);
+				return;
+			}
+			if (map.m_pblk != ext4_es_pblock(es)) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu m_pblk %llu != "
+					"es_pblk %llu\n",
+					inode->i_ino, map.m_pblk,
+					ext4_es_pblock(es));
+				return;
+			}
+		} else {
+			/*
+			 * We don't need to check unwritten extent because
+			 * indirect-based file doesn't have it.
+			 */
+			BUG_ON(1);
+		}
+	} else if (retval == 0) {
+		if (ext4_es_is_written(es)) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"We can't find the block but we want to add "
+				"an written extent [%d/%d/%llu/%llx]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		}
+	}
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+	/*
+	 * We don't need to worry about the race condition because
+	 * caller takes i_data_sem locking.
+	 */
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_es_insert_extent_ext_check(inode, es);
+	else
+		ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+}
+#endif
+
 static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
 {
 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
@@ -471,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	ext4_es_store_status(&newes, status);
 	trace_ext4_es_insert_extent(inode, &newes);

+	ext4_es_insert_extent_check(inode, &newes);
+
 	write_lock(&EXT4_I(inode)->i_es_lock);
 	err = __es_remove_extent(inode, lblk, end);
 	if (err != 0)
@@ -669,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	return err;
 }

+int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	ext4_lblk_t  ee_block;
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
+
+	ee_block  = le32_to_cpu(ex->ee_block);
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext4_ext_pblock(ex);
+
+	if (ee_len == 0)
+		return 0;
+
+	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+				     EXTENT_STATUS_WRITTEN);
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f190dfe969d..d8e2d4dc311 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -21,6 +21,12 @@
 #endif

 /*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
+ */
+#define ES_AGGRESSIVE_TEST__
+
+/*
  * These flags live in the high bits of extent_status.es_pblk
  */
 #define EXTENT_STATUS_WRITTEN	(1ULL << 63)
@@ -33,6 +39,8 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)

+struct ext4_extent;
+
 struct extent_status {
 	struct rb_node rb_node;
 	ext4_lblk_t es_lblk;	/* first logical block extent covers */
@@ -58,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
 					struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 				 struct extent_status *es);
+extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);

 static inline int ext4_es_is_written(struct extent_status *es)
 {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 32fd2b9075d..6c5bb8d993f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -324,8 +324,8 @@ error_return:
 }

 struct orlov_stats {
+	__u64 free_clusters;
 	__u32 free_inodes;
-	__u32 free_clusters;
 	__u32 used_dirs;
 };

@@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,

 	if (flex_size > 1) {
 		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-		stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
+		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
 		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
 		return;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9ea0cde3fa9..b3a5213bc73 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode)

 	trace_ext4_evict_inode(inode);

-	ext4_ioend_wait(inode);
-
 	if (inode->i_nlink) {
 		/*
 		 * When journalling data dirty buffers are tracked only in the
@@ -207,7 +205,8 @@ void ext4_evict_inode(struct inode *inode)
 		 * don't use page cache.
 		 */
 		if (ext4_should_journal_data(inode) &&
-		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+		    inode->i_ino != EXT4_JOURNAL_INO) {
 			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

@@ -216,6 +215,7 @@ void ext4_evict_inode(struct inode *inode)
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
+		ext4_ioend_shutdown(inode);
 		goto no_delete;
 	}

@@ -225,6 +225,7 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
+	ext4_ioend_shutdown(inode);

 	if (is_bad_inode(inode))
 		goto no_delete;
@@ -482,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 	return num;
 }

+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+				       struct inode *inode,
+				       struct ext4_map_blocks *es_map,
+				       struct ext4_map_blocks *map,
+				       int flags)
+{
+	int retval;
+
+	map->m_flags = 0;
+	/*
+	 * There is a race window that the result is not the same.
+	 * e.g. xfstests #223 when dioread_nolock enables.  The reason
+	 * is that we lookup a block mapping in extent status tree with
+	 * out taking i_data_sem.  So at the time the unwritten extent
+	 * could be converted.
+	 */
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read((&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	} else {
+		retval = ext4_ind_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	}
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
+	/*
+	 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+	 * because it shouldn't be marked in es_map->m_flags.
+	 */
+	map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
+
+	/*
+	 * We don't check m_len because extent will be collpased in status
+	 * tree.  So the m_len might not equal.
+	 */
+	if (es_map->m_lblk != map->m_lblk ||
+	    es_map->m_flags != map->m_flags ||
+	    es_map->m_pblk != map->m_pblk) {
+		printk("ES cache assertation failed for inode: %lu "
+		       "es_cached ex [%d/%d/%llu/%x] != "
+		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+		       inode->i_ino, es_map->m_lblk, es_map->m_len,
+		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
+		       map->m_len, map->m_pblk, map->m_flags,
+		       retval, flags);
+	}
+}
+#endif /* ES_AGGRESSIVE_TEST */
+
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -509,6 +562,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct extent_status es;
 	int retval;
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif

 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
@@ -531,6 +589,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		} else {
 			BUG_ON(1);
 		}
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(handle, inode, map,
+					   &orig_map, flags);
+#endif
 		goto found;
 	}

@@ -551,6 +613,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		int ret;
 		unsigned long long status;

+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (lookup)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
+		}
+#endif
+
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
@@ -643,6 +714,24 @@ found:
 		int ret;
 		unsigned long long status;

+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (allocation)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
+		}
+#endif
+
+		/*
+		 * If the extent has been zeroed out, we don't need to update
+		 * extent status tree.
+		 */
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+			if (ext4_es_is_written(&es))
+				goto has_zeroout;
+		}
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
@@ -655,6 +744,7 @@ found:
 			retval = ret;
 	}

+has_zeroout:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
@@ -1216,6 +1306,55 @@ static int ext4_journalled_write_end(struct file *file,
 }

 /*
+ * Reserve a metadata for a single block located at lblock
+ */
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
+{
+	int retries = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int md_needed;
+	ext4_lblk_t save_last_lblock;
+	int save_len;
+
+	/*
+	 * recalculate the amount of metadata blocks to reserve
+	 * in order to allocate nrblocks
+	 * worse case is one extent per block
+	 */
+repeat:
+	spin_lock(&ei->i_block_reservation_lock);
+	/*
+	 * ext4_calc_metadata_amount() has side effects, which we have
+	 * to be prepared undo if we fail to claim space.
+	 */
+	save_len = ei->i_da_metadata_calc_len;
+	save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+	md_needed = EXT4_NUM_B2C(sbi,
+				 ext4_calc_metadata_amount(inode, lblock));
+	trace_ext4_da_reserve_space(inode, md_needed);
+
+	/*
+	 * We do still charge estimated metadata to the sb though;
+	 * we cannot afford to run out of free blocks.
+	 */
+	if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+		ei->i_da_metadata_calc_len = save_len;
+		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+		spin_unlock(&ei->i_block_reservation_lock);
+		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+			cond_resched();
+			goto repeat;
+		}
+		return -ENOSPC;
+	}
+	ei->i_reserved_meta_blocks += md_needed;
+	spin_unlock(&ei->i_block_reservation_lock);
+
+	return 0;       /* success */
+}
+
+/*
  * Reserve a single cluster located at lblock
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
@@ -1263,7 +1402,7 @@ repeat:
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			yield();
+			cond_resched();
 			goto repeat;
 		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
@@ -1768,6 +1907,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif

 	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
 		invalid_block = ~0;
@@ -1809,6 +1953,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		else
 			BUG_ON(1);

+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
 		return retval;
 	}

@@ -1843,8 +1990,11 @@ add_delayed:
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
 		 */
-		/* If the block was allocated from previously allocated cluster,
-		 * then we dont need to reserve it again. */
+		/*
+		 * If the block was allocated from previously allocated cluster,
+		 * then we don't need to reserve it again. However we still need
+		 * to reserve metadata for every block we're going to write.
+		 */
 		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
 			ret = ext4_da_reserve_space(inode, iblock);
 			if (ret) {
@@ -1852,6 +2002,13 @@ add_delayed:
 				retval = ret;
 				goto out_unlock;
 			}
+		} else {
+			ret = ext4_da_reserve_metadata(inode, iblock);
+			if (ret) {
+				/* not enough space to reserve */
+				retval = ret;
+				goto out_unlock;
+			}
 		}

 		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -1873,6 +2030,15 @@ add_delayed:
 		int ret;
 		unsigned long long status;

+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (lookup)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
+		}
+#endif
+
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -2908,8 +3074,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)

 	trace_ext4_releasepage(page);

-	WARN_ON(PageChecked(page));
-	if (!page_has_buffers(page))
+	/* Page has dirty journalled data -> cannot release */
+	if (PageChecked(page))
 		return 0;
 	if (journal)
 		return jbd2_journal_try_to_free_buffers(journal, page, wait);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7bb713a46fe..ee6614bdb63 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
-		atomic_sub(ac->ac_b_ex.fe_len,
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_sub(ac->ac_b_ex.fe_len,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}

 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3692,11 +3692,7 @@ repeat:
 	if (free < needed && busy) {
 		busy = 0;
 		ext4_unlock_group(sb, group);
-		/*
-		 * Yield the CPU here so that we don't get soft lockup
-		 * in non preempt case.
-		 */
-		yield();
+		cond_resched();
 		goto repeat;
 	}

@@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

 			/* let others to free the space */
-			yield();
+			cond_resched();
 			ar->len = ar->len >> 1;
 		}
 		if (!ar->len) {
@@ -4464,7 +4460,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
 	struct ext4_group_desc *gdp;
-	unsigned long freed = 0;
 	unsigned int overflow;
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
@@ -4666,14 +4661,12 @@ do_more:

 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(count_clusters,
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(count_clusters,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}

 	ext4_mb_unload_buddy(&e4b);

-	freed += count;
-
 	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
 		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
@@ -4811,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,

 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(EXT4_NUM_B2C(sbi, blocks_freed),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}

 	ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4e81d47aa8c..33e1c086858 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,16 +32,18 @@
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **path)
+		struct ext4_ext_path **orig_path)
 {
 	int ret = 0;
+	struct ext4_ext_path *path;

-	*path = ext4_ext_find_extent(inode, lblock, *path);
-	if (IS_ERR(*path)) {
-		ret = PTR_ERR(*path);
-		*path = NULL;
-	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+	path = ext4_ext_find_extent(inode, lblock, *orig_path);
+	if (IS_ERR(path))
+		ret = PTR_ERR(path);
+	else if (path[ext_depth(inode)].p_ext == NULL)
 		ret = -ENODATA;
+	else
+		*orig_path = path;

 	return ret;
 }
@@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent *ext;
+	int ret = 0;
 	ext4_lblk_t last = from + count;
 	while (from < last) {
 		*err = get_ext_path(inode, from, &path);
 		if (*err)
-			return 0;
+			goto out;
 		ext = path[ext_depth(inode)].p_ext;
-		if (!ext) {
-			ext4_ext_drop_refs(path);
-			return 0;
-		}
-		if (uninit != ext4_ext_is_uninitialized(ext)) {
-			ext4_ext_drop_refs(path);
-			return 0;
-		}
+		if (uninit != ext4_ext_is_uninitialized(ext))
+			goto out;
 		from += ext4_ext_get_actual_len(ext);
 		ext4_ext_drop_refs(path);
 	}
-	return 1;
+	ret = 1;
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return ret;
 }

 /**
@@ -666,6 +669,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	int replaced_count = 0;
 	int dext_alen;

+	*err = ext4_es_remove_extent(orig_inode, from, count);
+	if (*err)
+		goto out;
+
+	*err = ext4_es_remove_extent(donor_inode, from, count);
+	if (*err)
+		goto out;
+
 	/* Get the original extent for the block "orig_off" */
 	*err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (*err)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 809b31003ec..047a6de04a0 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -50,11 +50,21 @@ void ext4_exit_pageio(void)
 	kmem_cache_destroy(io_page_cachep);
 }

-void ext4_ioend_wait(struct inode *inode)
+/*
+ * This function is called by ext4_evict_inode() to make sure there is
+ * no more pending I/O completion work left to do.
+ */
+void ext4_ioend_shutdown(struct inode *inode)
 {
 	wait_queue_head_t *wq = ext4_ioend_wq(inode);

 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+	/*
+	 * We need to make sure the work structure is finished being
+	 * used before we let the inode get destroyed.
+	 */
+	if (work_pending(&EXT4_I(inode)->i_unwritten_work))
+		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
 }

 static void put_io_page(struct ext4_io_page *io_page)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b2c8ee56eb9..c169477a62c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb,
 	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, group_data[0].group);
-		atomic_add(EXT4_NUM_B2C(sbi, free_blocks),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
 			   &sbi->s_flex_groups[flex_group].free_inodes);
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b3818b48f41..5d6d5357812 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1927,8 +1927,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_add(ext4_free_inodes_count(sb, gdp),
 			   &sbi->s_flex_groups[flex_group].free_inodes);
-		atomic_add(ext4_free_group_clusters(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(ext4_free_group_clusters(sb, gdp),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 		atomic_add(ext4_used_dirs_count(sb, gdp),
 			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}
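A note on the s_extent_max_zeroout_kb change in ext4_ext_convert_to_initialized() above: the tunable is expressed in KiB, but the old code shifted it by s_blocksize_bits, a bytes-based shift, so the default of 32 KiB became 0 blocks and the zeroout path was effectively disabled. Shifting by (s_blocksize_bits - 10) divides KiB by the block size in KiB. A minimal standalone C sketch of the corrected conversion (the helper name and the standalone setting are illustrative, not the kernel's):

#include <stdio.h>

/* Convert a limit given in KiB into filesystem blocks.  A block of
 * 2^blocksize_bits bytes is 2^(blocksize_bits - 10) KiB, so shifting
 * right by (blocksize_bits - 10) converts KiB to blocks. */
static unsigned int max_zeroout_blocks(unsigned int max_zeroout_kb,
				       unsigned int blocksize_bits)
{
	return max_zeroout_kb >> (blocksize_bits - 10);
}

int main(void)
{
	/* 32 KiB limit on a 4 KiB-block filesystem (blocksize_bits = 12) */
	printf("fixed:  %u blocks\n", max_zeroout_blocks(32, 12)); /* 8 */
	/* the old code shifted by blocksize_bits itself: 32 >> 12 == 0,
	 * which silently disabled zeroout */
	printf("broken: %u blocks\n", 32 >> 12);
	return 0;
}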
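The reworked ext4_es_can_be_merged() is easier to see in isolation: two cached extents merge only if their status matches, the combined length still fits the 32-bit es_len field, they are logically adjacent, and, for written/unwritten extents, physically adjacent as well. A rough userspace model of that predicate follows; the struct layout, the bitmask status encoding, and all names are simplified stand-ins for the kernel's types, not the real ones:

#include <stdint.h>
#include <stdio.h>

struct es {
	uint32_t lblk;   /* first logical block */
	uint32_t len;    /* number of blocks */
	uint64_t pblk;   /* first physical block */
	uint64_t status; /* WRITTEN/UNWRITTEN/DELAYED/HOLE */
};

enum { WRITTEN = 1, UNWRITTEN = 2, DELAYED = 4, HOLE = 8 };

static int can_merge(const struct es *a, const struct es *b)
{
	if (a->status != b->status)
		return 0;
	if ((uint64_t)a->len + b->len > 0xFFFFFFFFULL)
		return 0;	/* merged length must still fit in 32 bits */
	if ((uint64_t)a->lblk + a->len != b->lblk)
		return 0;	/* must be logically contiguous */
	if ((a->status & (WRITTEN | UNWRITTEN)) &&
	    a->pblk + a->len == b->pblk)
		return 1;	/* mapped extents: physically contiguous too */
	if (a->status & HOLE)
		return 1;
	if ((a->status & DELAYED) && !(a->status & UNWRITTEN))
		return 1;
	return 0;
}

int main(void)
{
	struct es a = { 0, 8, 100, WRITTEN };
	struct es b = { 8, 8, 108, WRITTEN };
	printf("%d\n", can_merge(&a, &b));	/* 1: contiguous written */
	b.pblk = 200;
	printf("%d\n", can_merge(&a, &b));	/* 0: physical gap */
	return 0;
}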
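Finally, on the flex_groups conversion threaded through ext4.h, ialloc.c, mballoc.c, resize.c and super.c: a flex group can aggregate enough block groups that its free-cluster count overflows the 32-bit payload of an atomic_t, which is presumably why free_clusters alone becomes an atomic64_t while free_inodes and used_dirs stay 32-bit. A tiny userspace sketch of the truncation being avoided (the 2^42 figure and the plain integer types are illustrative assumptions, not the kernel's atomics):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* a hypothetical very large flex group: ~2^42 free clusters */
	uint64_t free_clusters = (uint64_t)1 << 42;

	/* what a 32-bit counter would retain: the low 32 bits only */
	uint32_t truncated = (uint32_t)free_clusters;

	printf("actual free clusters:  %" PRIu64 "\n", free_clusters);
	printf("32-bit view (wrapped): %" PRIu32 "\n", truncated); /* 0 */
	return 0;
}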