diff options
| author | Sage Weil <sage@inktank.com> | 2013-08-15 11:11:45 -0700 | 
|---|---|---|
| committer | Sage Weil <sage@inktank.com> | 2013-08-15 11:11:45 -0700 | 
| commit | ee3e542fec6e69bc9fb668698889a37d93950ddf (patch) | |
| tree | e74ee766a4764769ef1d3d45d266b4dea64101d3 /fs/ext4 | |
| parent | fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff) | |
| parent | f1d6e17f540af37bb1891480143669ba7636c4cf (diff) | |
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'fs/ext4')
| -rw-r--r-- | fs/ext4/balloc.c | 18 | ||||
| -rw-r--r-- | fs/ext4/dir.c | 158 | ||||
| -rw-r--r-- | fs/ext4/ext4.h | 189 | ||||
| -rw-r--r-- | fs/ext4/ext4_jbd2.c | 58 | ||||
| -rw-r--r-- | fs/ext4/ext4_jbd2.h | 29 | ||||
| -rw-r--r-- | fs/ext4/extents.c | 214 | ||||
| -rw-r--r-- | fs/ext4/extents_status.c | 144 | ||||
| -rw-r--r-- | fs/ext4/extents_status.h | 5 | ||||
| -rw-r--r-- | fs/ext4/file.c | 38 | ||||
| -rw-r--r-- | fs/ext4/fsync.c | 52 | ||||
| -rw-r--r-- | fs/ext4/ialloc.c | 13 | ||||
| -rw-r--r-- | fs/ext4/indirect.c | 40 | ||||
| -rw-r--r-- | fs/ext4/inline.c | 168 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 1787 | ||||
| -rw-r--r-- | fs/ext4/ioctl.c | 6 | ||||
| -rw-r--r-- | fs/ext4/mballoc.c | 32 | ||||
| -rw-r--r-- | fs/ext4/move_extent.c | 3 | ||||
| -rw-r--r-- | fs/ext4/namei.c | 54 | ||||
| -rw-r--r-- | fs/ext4/page-io.c | 336 | ||||
| -rw-r--r-- | fs/ext4/resize.c | 24 | ||||
| -rw-r--r-- | fs/ext4/super.c | 189 | 
21 files changed, 1837 insertions, 1720 deletions
| diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d0f13eada0ed..ddd715e42a5c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,  	ext4_group_t group;  	if (test_opt2(sb, STD_GROUP_SIZE)) -		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + -			 block) >> +		group = (block - +			 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>  			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);  	else  		ext4_get_group_no_and_offset(sb, block, &group, NULL); @@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)  static inline int test_root(ext4_group_t a, int b)  { -	int num = b; - -	while (a > num) -		num *= b; -	return num == a; +	while (1) { +		if (a < b) +			return 0; +		if (a == b) +			return 1; +		if ((a % b) != 0) +			return 0; +		a = a / b; +	}  }  static int ext4_group_sparse(ext4_group_t group) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f8d56e4254e0..3c7d288ae94c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -29,8 +29,7 @@  #include "ext4.h"  #include "xattr.h" -static int ext4_dx_readdir(struct file *filp, -			   void *dirent, filldir_t filldir); +static int ext4_dx_readdir(struct file *, struct dir_context *);  /**   * Check if the given dir-inode refers to an htree-indexed directory @@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,  	return 1;  } -static int ext4_readdir(struct file *filp, -			 void *dirent, filldir_t filldir) +static int ext4_readdir(struct file *file, struct dir_context *ctx)  { -	int error = 0;  	unsigned int offset;  	int i, stored;  	struct ext4_dir_entry_2 *de;  	int err; -	struct inode *inode = file_inode(filp); +	struct inode *inode = file_inode(file);  	struct super_block *sb = inode->i_sb; -	int ret = 0;  	int dir_has_error = 0;  	if (is_dx_dir(inode)) { -		err = ext4_dx_readdir(filp, dirent, filldir); +		err = ext4_dx_readdir(file, ctx);  		if (err != 
ERR_BAD_DX_DIR) { -			ret = err; -			goto out; +			return err;  		}  		/*  		 * We don't set the inode dirty flag since it's not  		 * critical that it get flushed back to the disk.  		 */ -		ext4_clear_inode_flag(file_inode(filp), +		ext4_clear_inode_flag(file_inode(file),  				      EXT4_INODE_INDEX);  	}  	if (ext4_has_inline_data(inode)) {  		int has_inline_data = 1; -		ret = ext4_read_inline_dir(filp, dirent, filldir, +		int ret = ext4_read_inline_dir(file, ctx,  					   &has_inline_data);  		if (has_inline_data)  			return ret;  	}  	stored = 0; -	offset = filp->f_pos & (sb->s_blocksize - 1); +	offset = ctx->pos & (sb->s_blocksize - 1); -	while (!error && !stored && filp->f_pos < inode->i_size) { +	while (ctx->pos < inode->i_size) {  		struct ext4_map_blocks map;  		struct buffer_head *bh = NULL; -		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); +		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);  		map.m_len = 1;  		err = ext4_map_blocks(NULL, inode, &map, 0);  		if (err > 0) {  			pgoff_t index = map.m_pblk >>  					(PAGE_CACHE_SHIFT - inode->i_blkbits); -			if (!ra_has_index(&filp->f_ra, index)) +			if (!ra_has_index(&file->f_ra, index))  				page_cache_sync_readahead(  					sb->s_bdev->bd_inode->i_mapping, -					&filp->f_ra, filp, +					&file->f_ra, file,  					index, 1); -			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; +			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;  			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);  		} @@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,  		 */  		if (!bh) {  			if (!dir_has_error) { -				EXT4_ERROR_FILE(filp, 0, +				EXT4_ERROR_FILE(file, 0,  						"directory contains a "  						"hole at offset %llu", -					   (unsigned long long) filp->f_pos); +					   (unsigned long long) ctx->pos);  				dir_has_error = 1;  			}  			/* corrupt size?  
Maybe no more blocks to read */ -			if (filp->f_pos > inode->i_blocks << 9) +			if (ctx->pos > inode->i_blocks << 9)  				break; -			filp->f_pos += sb->s_blocksize - offset; +			ctx->pos += sb->s_blocksize - offset;  			continue;  		} @@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,  		if (!buffer_verified(bh) &&  		    !ext4_dirent_csum_verify(inode,  				(struct ext4_dir_entry *)bh->b_data)) { -			EXT4_ERROR_FILE(filp, 0, "directory fails checksum " +			EXT4_ERROR_FILE(file, 0, "directory fails checksum "  					"at offset %llu", -					(unsigned long long)filp->f_pos); -			filp->f_pos += sb->s_blocksize - offset; +					(unsigned long long)ctx->pos); +			ctx->pos += sb->s_blocksize - offset;  			brelse(bh);  			continue;  		}  		set_buffer_verified(bh); -revalidate:  		/* If the dir block has changed since the last call to  		 * readdir(2), then we might be pointing to an invalid  		 * dirent right now.  Scan from the start of the block  		 * to make sure. */ -		if (filp->f_version != inode->i_version) { +		if (file->f_version != inode->i_version) {  			for (i = 0; i < sb->s_blocksize && i < offset; ) {  				de = (struct ext4_dir_entry_2 *)  					(bh->b_data + i); @@ -214,57 +208,46 @@ revalidate:  							    sb->s_blocksize);  			}  			offset = i; -			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) +			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))  				| offset; -			filp->f_version = inode->i_version; +			file->f_version = inode->i_version;  		} -		while (!error && filp->f_pos < inode->i_size +		while (ctx->pos < inode->i_size  		       && offset < sb->s_blocksize) {  			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); -			if (ext4_check_dir_entry(inode, filp, de, bh, +			if (ext4_check_dir_entry(inode, file, de, bh,  						 bh->b_data, bh->b_size,  						 offset)) {  				/* -				 * On error, skip the f_pos to the next block +				 * On error, skip to the next block  				 */ -				filp->f_pos = (filp->f_pos | +				ctx->pos = (ctx->pos |  
						(sb->s_blocksize - 1)) + 1; -				brelse(bh); -				ret = stored; -				goto out; +				break;  			}  			offset += ext4_rec_len_from_disk(de->rec_len,  					sb->s_blocksize);  			if (le32_to_cpu(de->inode)) { -				/* We might block in the next section -				 * if the data destination is -				 * currently swapped out.  So, use a -				 * version stamp to detect whether or -				 * not the directory has been modified -				 * during the copy operation. -				 */ -				u64 version = filp->f_version; - -				error = filldir(dirent, de->name, +				if (!dir_emit(ctx, de->name,  						de->name_len, -						filp->f_pos,  						le32_to_cpu(de->inode), -						get_dtype(sb, de->file_type)); -				if (error) -					break; -				if (version != filp->f_version) -					goto revalidate; -				stored++; +						get_dtype(sb, de->file_type))) { +					brelse(bh); +					return 0; +				}  			} -			filp->f_pos += ext4_rec_len_from_disk(de->rec_len, +			ctx->pos += ext4_rec_len_from_disk(de->rec_len,  						sb->s_blocksize);  		}  		offset = 0;  		brelse(bh); +		if (ctx->pos < inode->i_size) { +			if (!dir_relax(inode)) +				return 0; +		}  	} -out: -	return ret; +	return 0;  }  static inline int is_32bit_api(void) @@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,   * for all entres on the fname linked list.  (Normally there is only   * one entry on the linked list, unless there are 62 bit hash collisions.)   
*/ -static int call_filldir(struct file *filp, void *dirent, -			filldir_t filldir, struct fname *fname) +static int call_filldir(struct file *file, struct dir_context *ctx, +			struct fname *fname)  { -	struct dir_private_info *info = filp->private_data; -	loff_t	curr_pos; -	struct inode *inode = file_inode(filp); -	struct super_block *sb; -	int error; - -	sb = inode->i_sb; +	struct dir_private_info *info = file->private_data; +	struct inode *inode = file_inode(file); +	struct super_block *sb = inode->i_sb;  	if (!fname) {  		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " @@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,  			 inode->i_ino, current->comm);  		return 0;  	} -	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); +	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);  	while (fname) { -		error = filldir(dirent, fname->name, -				fname->name_len, curr_pos, +		if (!dir_emit(ctx, fname->name, +				fname->name_len,  				fname->inode, -				get_dtype(sb, fname->file_type)); -		if (error) { -			filp->f_pos = curr_pos; +				get_dtype(sb, fname->file_type))) {  			info->extra_fname = fname; -			return error; +			return 1;  		}  		fname = fname->next;  	}  	return 0;  } -static int ext4_dx_readdir(struct file *filp, -			 void *dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)  { -	struct dir_private_info *info = filp->private_data; -	struct inode *inode = file_inode(filp); +	struct dir_private_info *info = file->private_data; +	struct inode *inode = file_inode(file);  	struct fname *fname;  	int	ret;  	if (!info) { -		info = ext4_htree_create_dir_info(filp, filp->f_pos); +		info = ext4_htree_create_dir_info(file, ctx->pos);  		if (!info)  			return -ENOMEM; -		filp->private_data = info; +		file->private_data = info;  	} -	if (filp->f_pos == ext4_get_htree_eof(filp)) +	if (ctx->pos == ext4_get_htree_eof(file))  		return 0;	/* EOF */  	/* Some one has messed with 
f_pos; reset the world */ -	if (info->last_pos != filp->f_pos) { +	if (info->last_pos != ctx->pos) {  		free_rb_tree_fname(&info->root);  		info->curr_node = NULL;  		info->extra_fname = NULL; -		info->curr_hash = pos2maj_hash(filp, filp->f_pos); -		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); +		info->curr_hash = pos2maj_hash(file, ctx->pos); +		info->curr_minor_hash = pos2min_hash(file, ctx->pos);  	}  	/* @@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,  	 * chain, return them first.  	 */  	if (info->extra_fname) { -		if (call_filldir(filp, dirent, filldir, info->extra_fname)) +		if (call_filldir(file, ctx, info->extra_fname))  			goto finished;  		info->extra_fname = NULL;  		goto next_node; @@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,  		 * cached entries.  		 */  		if ((!info->curr_node) || -		    (filp->f_version != inode->i_version)) { +		    (file->f_version != inode->i_version)) {  			info->curr_node = NULL;  			free_rb_tree_fname(&info->root); -			filp->f_version = inode->i_version; -			ret = ext4_htree_fill_tree(filp, info->curr_hash, +			file->f_version = inode->i_version; +			ret = ext4_htree_fill_tree(file, info->curr_hash,  						   info->curr_minor_hash,  						   &info->next_hash);  			if (ret < 0)  				return ret;  			if (ret == 0) { -				filp->f_pos = ext4_get_htree_eof(filp); +				ctx->pos = ext4_get_htree_eof(file);  				break;  			}  			info->curr_node = rb_first(&info->root); @@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,  		fname = rb_entry(info->curr_node, struct fname, rb_hash);  		info->curr_hash = fname->hash;  		info->curr_minor_hash = fname->minor_hash; -		if (call_filldir(filp, dirent, filldir, fname)) +		if (call_filldir(file, ctx, fname))  			break;  	next_node:  		info->curr_node = rb_next(info->curr_node); @@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,  			info->curr_minor_hash = fname->minor_hash;  		} else {  			if (info->next_hash == ~0) 
{ -				filp->f_pos = ext4_get_htree_eof(filp); +				ctx->pos = ext4_get_htree_eof(file);  				break;  			}  			info->curr_hash = info->next_hash; @@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,  		}  	}  finished: -	info->last_pos = filp->f_pos; +	info->last_pos = ctx->pos;  	return 0;  } @@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)  const struct file_operations ext4_dir_operations = {  	.llseek		= ext4_dir_llseek,  	.read		= generic_read_dir, -	.readdir	= ext4_readdir, +	.iterate	= ext4_readdir,  	.unlocked_ioctl = ext4_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= ext4_compat_ioctl, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5aae3d12d400..b577e45425b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,38 +177,28 @@ struct ext4_map_blocks {  };  /* - * For delayed allocation tracking - */ -struct mpage_da_data { -	struct inode *inode; -	sector_t b_blocknr;		/* start block number of extent */ -	size_t b_size;			/* size of extent */ -	unsigned long b_state;		/* state of the extent */ -	unsigned long first_page, next_page;	/* extent of pages */ -	struct writeback_control *wbc; -	int io_done; -	int pages_written; -	int retval; -}; - -/*   * Flags for ext4_io_end->flags   */  #define	EXT4_IO_END_UNWRITTEN	0x0001 -#define EXT4_IO_END_ERROR	0x0002 -#define EXT4_IO_END_DIRECT	0x0004 +#define EXT4_IO_END_DIRECT	0x0002  /* - * For converting uninitialized extents on a work queue. + * For converting uninitialized extents on a work queue. 'handle' is used for + * buffered writeback.   
*/  typedef struct ext4_io_end {  	struct list_head	list;		/* per-file finished IO list */ +	handle_t		*handle;	/* handle reserved for extent +						 * conversion */  	struct inode		*inode;		/* file being written to */ +	struct bio		*bio;		/* Linked list of completed +						 * bios covering the extent */  	unsigned int		flag;		/* unwritten or not */  	loff_t			offset;		/* offset in the file */  	ssize_t			size;		/* size of the extent */  	struct kiocb		*iocb;		/* iocb struct for AIO */  	int			result;		/* error value for AIO */ +	atomic_t		count;		/* reference counter */  } ext4_io_end_t;  struct ext4_io_submit { @@ -581,11 +571,6 @@ enum {  #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020  /* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001 - -/*   * ioctl commands   */  #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS @@ -879,6 +864,7 @@ struct ext4_inode_info {  	rwlock_t i_es_lock;  	struct list_head i_es_lru;  	unsigned int i_es_lru_nr;	/* protected by i_es_lock */ +	unsigned long i_touch_when;	/* jiffies of last accessing */  	/* ialloc */  	ext4_group_t	i_last_alloc_group; @@ -903,12 +889,22 @@ struct ext4_inode_info {  	qsize_t i_reserved_quota;  #endif -	/* completed IOs that might need unwritten extents handling */ -	struct list_head i_completed_io_list; +	/* Lock protecting lists below */  	spinlock_t i_completed_io_lock; +	/* +	 * Completed IOs that need unwritten extents handling and have +	 * transaction reserved +	 */ +	struct list_head i_rsv_conversion_list; +	/* +	 * Completed IOs that need unwritten extents handling and don't have +	 * transaction reserved +	 */ +	struct list_head i_unrsv_conversion_list;  	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */  	atomic_t i_unwritten; /* Nr. 
of inflight conversions pending */ -	struct work_struct i_unwritten_work;	/* deferred extent conversion */ +	struct work_struct i_rsv_conversion_work; +	struct work_struct i_unrsv_conversion_work;  	spinlock_t i_block_reservation_lock; @@ -1245,7 +1241,6 @@ struct ext4_sb_info {  	unsigned int s_mb_stats;  	unsigned int s_mb_order2_reqs;  	unsigned int s_mb_group_prealloc; -	unsigned int s_max_writeback_mb_bump;  	unsigned int s_max_dir_size_kb;  	/* where last allocation was done - for stream allocation */  	unsigned long s_mb_last_group; @@ -1281,8 +1276,10 @@ struct ext4_sb_info {  	struct flex_groups *s_flex_groups;  	ext4_group_t s_flex_groups_allocated; -	/* workqueue for dio unwritten */ -	struct workqueue_struct *dio_unwritten_wq; +	/* workqueue for unreserved extent convertions (dio) */ +	struct workqueue_struct *unrsv_conversion_wq; +	/* workqueue for reserved extent conversions (buffered io) */ +	struct workqueue_struct *rsv_conversion_wq;  	/* timer for periodic error stats printing */  	struct timer_list s_err_report; @@ -1307,6 +1304,7 @@ struct ext4_sb_info {  	/* Reclaim extents from extent status tree */  	struct shrinker s_es_shrinker;  	struct list_head s_es_lru; +	unsigned long s_es_last_sorted;  	struct percpu_counter s_extent_cache_cnt;  	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;  }; @@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,  					      struct ext4_io_end *io_end)  {  	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { +		/* Writeback has to have coversion transaction reserved */ +		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && +			!(io_end->flag & EXT4_IO_END_DIRECT));  		io_end->flag |= EXT4_IO_END_UNWRITTEN;  		atomic_inc(&EXT4_I(inode)->i_unwritten);  	} @@ -1999,7 +2000,6 @@ static inline  unsigned char get_dtype(struct super_block *sb, int filetype)  /* fsync.c */  extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int 
ext4_flush_unwritten_io(struct inode *);  /* hash.c */  extern int ext4fs_dirhash(const char *name, int len, struct @@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);  extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);  extern int ext4_can_truncate(struct inode *inode);  extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);  extern void ext4_set_inode_flags(struct inode *);  extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);  extern void ext4_set_aops(struct inode *inode);  extern int ext4_writepage_trans_blocks(struct inode *);  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, -		struct address_space *mapping, loff_t from, -		loff_t length, int flags); +extern int ext4_block_truncate_page(handle_t *handle, +		struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, +		struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, +			     loff_t lstart, loff_t lend);  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);  extern qsize_t *ext4_get_reserved_space(struct inode *inode);  extern void ext4_da_update_reserve_space(struct inode *inode, @@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  				const struct iovec *iov, loff_t offset,  				unsigned long nr_segs);  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern int 
ext4_ind_trans_blocks(struct inode *inode, int nrblocks);  extern void ext4_ind_truncate(handle_t *, struct inode *inode);  extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,  				 ext4_lblk_t first, ext4_lblk_t stop); @@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,  				    ext4_group_t ngroup);  extern const char *ext4_decode_error(struct super_block *sb, int errno,  				     char nbuf[16]); +  extern __printf(4, 5)  void __ext4_error(struct super_block *, const char *, unsigned int,  		  const char *, ...); -#define ext4_error(sb, message...)	__ext4_error(sb, __func__,	\ -						     __LINE__, ## message)  extern __printf(5, 6) -void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,  		      const char *, ...);  extern __printf(5, 6) -void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,  		     const char *, ...);  extern void __ext4_std_error(struct super_block *, const char *,  			     unsigned int, int);  extern __printf(4, 5)  void __ext4_abort(struct super_block *, const char *, unsigned int,  		  const char *, ...); -#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \ -						       __LINE__, ## message)  extern __printf(4, 5)  void __ext4_warning(struct super_block *, const char *, unsigned int,  		    const char *, ...); -#define ext4_warning(sb, message...)	
__ext4_warning(sb, __func__, \ -						       __LINE__, ## message)  extern __printf(3, 4) -void ext4_msg(struct super_block *, const char *, const char *, ...); +void __ext4_msg(struct super_block *, const char *, const char *, ...);  extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,  			   const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \ -						       __LINE__, msg)  extern __printf(7, 8)  void __ext4_grp_locked_error(const char *, unsigned int,  			     struct super_block *, ext4_group_t,  			     unsigned long, ext4_fsblk_t,  			     const char *, ...); -#define ext4_grp_locked_error(sb, grp, message...) \ -	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...)		\ +	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...)		\ +	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...)					\ +	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...)					\ +	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...)					\ +	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...)				\ +	__ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg)					\ +	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\ +	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ +				fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...)		\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_error_inode(inode, "", 0, block, " ");			\ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...)		
\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_error_file(file, "", 0, block, " ");			\ +} while (0) +#define ext4_error(sb, fmt, ...)					\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_error(sb, "", 0, " ");					\ +} while (0) +#define ext4_abort(sb, fmt, ...)					\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_abort(sb, "", 0, " ");					\ +} while (0) +#define ext4_warning(sb, fmt, ...)					\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_warning(sb, "", 0, " ");					\ +} while (0) +#define ext4_msg(sb, level, fmt, ...)					\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);					\ +	__ext4_msg(sb, "", " ");					\ +} while (0) +#define dump_mmp_msg(sb, mmp, msg)					\ +	__dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\ +do {									\ +	no_printk(fmt, ##__VA_ARGS__);				\ +	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\ +} while (0) + +#endif +  extern void ext4_update_dynamic_rev(struct super_block *sb);  extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,  					__u32 compat); @@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,  {  	 struct ext4_group_info ***grp_info;  	 long indexv, indexh; +	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);  	 grp_info = EXT4_SB(sb)->s_group_info;  	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));  	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); @@ -2515,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,  				      struct inode *parent,  				      struct inode *inode);  extern int ext4_read_inline_dir(struct file *filp, -				void *dirent, filldir_t filldir, +				struct dir_context *ctx,  				int *has_inline_data);  extern int htree_inlinedir_to_tree(struct file *dir_file,  				   struct inode *dir, ext4_lblk_t block, @@ -2598,8 +2656,7 @@ struct ext4_extent;  extern int ext4_ext_tree_init(handle_t *handle, 
struct inode *);  extern int ext4_ext_writepage_trans_blocks(struct inode *, int); -extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, -				       int chunk); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);  extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  			       struct ext4_map_blocks *map, int flags);  extern void ext4_ext_truncate(handle_t *, struct inode *); @@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);  extern void ext4_ext_release(struct super_block *);  extern long ext4_fallocate(struct file *file, int mode, loff_t offset,  			  loff_t len); -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, -			  ssize_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, +					  loff_t offset, ssize_t len);  extern int ext4_map_blocks(handle_t *handle, struct inode *inode,  			   struct ext4_map_blocks *map, int flags);  extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,  /* page-io.c */  extern int __init ext4_init_pageio(void); -extern void ext4_add_complete_io(ext4_io_end_t *io_end);  extern void ext4_exit_pageio(void); -extern void ext4_ioend_shutdown(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io);  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern void ext4_end_io_work(struct work_struct *work); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, +				struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_end_io_unrsv_work(struct work_struct *work);  extern void ext4_io_submit(struct ext4_io_submit *io);  extern int 
ext4_bio_write_page(struct ext4_io_submit *io,  			       struct page *page, @@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);  extern int ext4_mmp_csum_verify(struct super_block *sb,  				struct mmp_struct *mmp); -/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +/* + * Note that these flags will never ever appear in a buffer_head's state flag. + * See EXT4_MAP_... to see where this is used. + */  enum ext4_state_bits {  	BH_Uninit	/* blocks are allocated but uninitialized on disk */ -	  = BH_JBDPrivateStart, +	 = BH_JBDPrivateStart,  	BH_AllocFromCluster,	/* allocated blocks were part of already -				 * allocated cluster. Note that this flag will -				 * never, ever appear in a buffer_head's state -				 * flag. See EXT4_MAP_FROM_CLUSTER to see where -				 * this is used. */ +				 * allocated cluster. */  }; -BUFFER_FNS(Uninit, uninit) -TAS_BUFFER_FNS(Uninit, uninit) -  /*   * Add new method to test whether block and inode bitmaps are properly   * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 451eb4045330..72a3600aedbd 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)  /*   * Wrappers for jbd2_journal_start/end.   */ -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, -				  int type, int nblocks) +static int ext4_journal_check_start(struct super_block *sb)  {  	journal_t *journal;  	might_sleep(); - -	trace_ext4_journal_start(sb, nblocks, _RET_IP_);  	if (sb->s_flags & MS_RDONLY) -		return ERR_PTR(-EROFS); - +		return -EROFS;  	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);  	journal = EXT4_SB(sb)->s_journal; -	if (!journal) -		return ext4_get_nojournal();  	/*  	 * Special case here: if the journal has aborted behind our  	 * backs (eg. 
EIO in the commit thread), then we still need to  	 * take the FS itself readonly cleanly.  	 */ -	if (is_journal_aborted(journal)) { +	if (journal && is_journal_aborted(journal)) {  		ext4_abort(sb, "Detected aborted journal"); -		return ERR_PTR(-EROFS); +		return -EROFS;  	} -	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); +	return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +				  int type, int blocks, int rsv_blocks) +{ +	journal_t *journal; +	int err; + +	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); +	err = ext4_journal_check_start(sb); +	if (err < 0) +		return ERR_PTR(err); + +	journal = EXT4_SB(sb)->s_journal; +	if (!journal) +		return ext4_get_nojournal(); +	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, +				   type, line);  }  int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)  	return err;  } +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, +					int type) +{ +	struct super_block *sb; +	int err; + +	if (!ext4_handle_valid(handle)) +		return ext4_get_nojournal(); + +	sb = handle->h_journal->j_private; +	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, +					  _RET_IP_); +	err = ext4_journal_check_start(sb); +	if (err < 0) { +		jbd2_journal_free_reserved(handle); +		return ERR_PTR(err); +	} + +	err = jbd2_journal_start_reserved(handle, type, line); +	if (err < 0) +		return ERR_PTR(err); +	return handle; +} +  void ext4_journal_abort_handle(const char *caller, unsigned int line,  			       const char *err_fn, struct buffer_head *bh,  			       handle_t *handle, int err) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index c8c6885406db..2877258d9497 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode 
*inode)  #define EXT4_HT_MIGRATE          8  #define EXT4_HT_MOVE_EXTENTS     9  #define EXT4_HT_XATTR           10 -#define EXT4_HT_MAX             11 +#define EXT4_HT_EXT_CONVERT     11 +#define EXT4_HT_MAX             12  /**   *   struct ext4_journal_cb_entry - Base structure for callback information. @@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,  	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))  handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, -				  int type, int nblocks); +				  int type, int blocks, int rsv_blocks);  int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);  #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)  }  #define ext4_journal_start_sb(sb, type, nblocks)			\ -	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) +	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)  #define ext4_journal_start(inode, type, nblocks)			\ -	__ext4_journal_start((inode), __LINE__, (type), (nblocks)) +	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ +	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))  static inline handle_t *__ext4_journal_start(struct inode *inode,  					     unsigned int line, int type, -					     int nblocks) +					     int blocks, int rsv_blocks)  { -	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); +	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, +				       rsv_blocks);  }  #define ext4_journal_stop(handle) \  	__ext4_journal_stop(__func__, __LINE__, (handle)) +#define ext4_journal_start_reserved(handle, type) \ +	__ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, +					int 
type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ +	if (ext4_handle_valid(handle)) +		jbd2_journal_free_reserved(handle); +} +  static inline handle_t *ext4_journal_current_handle(void)  {  	return journal_current_handle(); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc0f1910b9cf..72ba4705d4fa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,  		next_del = ext4_find_delayed_extent(inode, &es);  		if (!exists && next_del) {  			exists = 1; -			flags |= FIEMAP_EXTENT_DELALLOC; +			flags |= (FIEMAP_EXTENT_DELALLOC | +				  FIEMAP_EXTENT_UNKNOWN);  		}  		up_read(&EXT4_I(inode)->i_data_sem); @@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,  }  /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents?   * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split.   * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare.   
*/ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents)  {  	int index;  	int depth; @@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)  	depth = ext_depth(inode); -	if (chunk) +	if (extents <= 1)  		index = depth * 2;  	else  		index = depth * 3; @@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)  	return index;  } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) +		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; +	else if (ext4_should_journal_data(inode)) +		return EXT4_FREE_BLOCKS_FORGET; +	return 0; +} +  static int ext4_remove_blocks(handle_t *handle, struct inode *inode,  			      struct ext4_extent *ex, -			      ext4_fsblk_t *partial_cluster, +			      long long *partial_cluster,  			      ext4_lblk_t from, ext4_lblk_t to)  {  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	unsigned short ee_len =  ext4_ext_get_actual_len(ex);  	ext4_fsblk_t pblk; -	int flags = 0; - -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) -		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; -	else if (ext4_should_journal_data(inode)) -		flags |= EXT4_FREE_BLOCKS_FORGET; +	int flags = get_default_free_blocks_flags(inode);  	/*  	 * For bigalloc file systems, we never free a partial cluster @@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,  	 * partial cluster here.  	 
*/  	pblk = ext4_ext_pblock(ex) + ee_len - 1; -	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { +	if ((*partial_cluster > 0) && +	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {  		ext4_free_blocks(handle, inode, NULL,  				 EXT4_C2B(sbi, *partial_cluster),  				 sbi->s_cluster_ratio, flags); @@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,  	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {  		/* tail removal */  		ext4_lblk_t num; +		unsigned int unaligned;  		num = le32_to_cpu(ex->ee_block) + ee_len - from;  		pblk = ext4_ext_pblock(ex) + ee_len - num; -		ext_debug("free last %u blocks starting %llu\n", num, pblk); +		/* +		 * Usually we want to free partial cluster at the end of the +		 * extent, except for the situation when the cluster is still +		 * used by any other extent (partial_cluster is negative). +		 */ +		if (*partial_cluster < 0 && +		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) +			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + +		ext_debug("free last %u blocks starting %llu partial %lld\n", +			  num, pblk, *partial_cluster);  		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);  		/*  		 * If the block range to be freed didn't start at the  		 * beginning of a cluster, and we removed the entire -		 * extent, save the partial cluster here, since we -		 * might need to delete if we determine that the -		 * truncate operation has removed all of the blocks in -		 * the cluster. +		 * extent and the cluster is not used by any other extent, +		 * save the partial cluster here, since we might need to +		 * delete if we determine that the truncate operation has +		 * removed all of the blocks in the cluster. +		 * +		 * On the other hand, if we did not manage to free the whole +		 * extent, we have to mark the cluster as used (store negative +		 * cluster number in partial_cluster).  		 
*/ -		if (pblk & (sbi->s_cluster_ratio - 1) && -		    (ee_len == num)) +		unaligned = pblk & (sbi->s_cluster_ratio - 1); +		if (unaligned && (ee_len == num) && +		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))  			*partial_cluster = EXT4_B2C(sbi, pblk); -		else +		else if (unaligned) +			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); +		else if (*partial_cluster > 0)  			*partial_cluster = 0; -	} else if (from == le32_to_cpu(ex->ee_block) -		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { -		/* head removal */ -		ext4_lblk_t num; -		ext4_fsblk_t start; - -		num = to - from; -		start = ext4_ext_pblock(ex); - -		ext_debug("free first %u blocks starting %llu\n", num, start); -		ext4_free_blocks(handle, inode, NULL, start, num, flags); - -	} else { -		printk(KERN_INFO "strange request: removal(2) " -				"%u-%u from %u:%u\n", -				from, to, le32_to_cpu(ex->ee_block), ee_len); -	} +	} else +		ext4_error(sbi->s_sb, "strange request: removal(2) " +			   "%u-%u from %u:%u\n", +			   from, to, le32_to_cpu(ex->ee_block), ee_len);  	return 0;  } @@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,   * @handle: The journal handle   * @inode:  The files inode   * @path:   The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + *                   has been released from it. It gets negative in case + *                   that the cluster is still used.   
* @start:  The first block to remove   * @end:   The last block to remove   */  static int  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, -		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, +		 struct ext4_ext_path *path, +		 long long *partial_cluster,  		 ext4_lblk_t start, ext4_lblk_t end)  {  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	unsigned short ex_ee_len;  	unsigned uninitialized = 0;  	struct ext4_extent *ex; +	ext4_fsblk_t pblk;  	/* the header must be checked already in ext4_ext_remove_space() */  	ext_debug("truncate since %u in leaf to %u\n", start, end); @@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		return -EIO;  	}  	/* find where to start removing */ -	ex = EXT_LAST_EXTENT(eh); +	ex = path[depth].p_ext; +	if (!ex) +		ex = EXT_LAST_EXTENT(eh);  	ex_ee_block = le32_to_cpu(ex->ee_block);  	ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		/* If this extent is beyond the end of the hole, skip it */  		if (end < ex_ee_block) { +			/* +			 * We're going to skip this extent and move to another, +			 * so if this extent is not cluster aligned we have +			 * to mark the current cluster as used to avoid +			 * accidentally freeing it later on +			 */ +			pblk = ext4_ext_pblock(ex); +			if (pblk & (sbi->s_cluster_ratio - 1)) +				*partial_cluster = +					-((long long)EXT4_B2C(sbi, pblk));  			ex--;  			ex_ee_block = le32_to_cpu(ex->ee_block);  			ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  					sizeof(struct ext4_extent));  			}  			le16_add_cpu(&eh->eh_entries, -1); -		} else +		} else if (*partial_cluster > 0)  			*partial_cluster = 0;  		err = ext4_ext_dirty(handle, inode, path + depth); @@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		
err = ext4_ext_correct_indexes(handle, inode, path);  	/* -	 * If there is still a entry in the leaf node, check to see if -	 * it references the partial cluster.  This is the only place -	 * where it could; if it doesn't, we can free the cluster. +	 * Free the partial cluster only if the current extent does not +	 * reference it. Otherwise we might free used cluster.  	 */ -	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && +	if (*partial_cluster > 0 &&  	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=  	     *partial_cluster)) { -		int flags = EXT4_FREE_BLOCKS_FORGET; - -		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) -			flags |= EXT4_FREE_BLOCKS_METADATA; +		int flags = get_default_free_blocks_flags(inode);  		ext4_free_blocks(handle, inode, NULL,  				 EXT4_C2B(sbi, *partial_cluster), @@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,  	struct super_block *sb = inode->i_sb;  	int depth = ext_depth(inode);  	struct ext4_ext_path *path = NULL; -	ext4_fsblk_t partial_cluster = 0; +	long long partial_cluster = 0;  	handle_t *handle;  	int i = 0, err = 0; @@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,  		return PTR_ERR(handle);  again: -	trace_ext4_ext_remove_space(inode, start, depth); +	trace_ext4_ext_remove_space(inode, start, end, depth);  	/*  	 * Check if we are removing extents inside the extent tree. If that @@ -2813,6 +2835,9 @@ again:  				err = -EIO;  				break;  			} +			/* Yield here to deal with large extent trees. +			 * Should be a no-op if we did IO above. 
*/ +			cond_resched();  			if (WARN_ON(i + 1 > depth)) {  				err = -EIO;  				break; @@ -2844,17 +2869,14 @@ again:  		}  	} -	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, -			path->p_hdr->eh_entries); +	trace_ext4_ext_remove_space_done(inode, start, end, depth, +			partial_cluster, path->p_hdr->eh_entries);  	/* If we still have something in the partial cluster and we have removed  	 * even the first extent, then we should free the blocks in the partial  	 * cluster as well. */ -	if (partial_cluster && path->p_hdr->eh_entries == 0) { -		int flags = EXT4_FREE_BLOCKS_FORGET; - -		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) -			flags |= EXT4_FREE_BLOCKS_METADATA; +	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { +		int flags = get_default_free_blocks_flags(inode);  		ext4_free_blocks(handle, inode, NULL,  				 EXT4_C2B(EXT4_SB(sb), partial_cluster), @@ -4242,8 +4264,8 @@ got_allocated_blocks:  		/* not a good idea to call discard here directly,  		 * but otherwise we'd need to call it every free() */  		ext4_discard_preallocations(inode); -		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), -				 ext4_ext_get_actual_len(&newex), fb_flags); +		ext4_free_blocks(handle, inode, NULL, newblock, +				 EXT4_C2B(sbi, allocated_clusters), fb_flags);  		goto out2;  	} @@ -4363,8 +4385,9 @@ out2:  	}  out3: -	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); - +	trace_ext4_ext_map_blocks_exit(inode, flags, map, +				       err ? err : allocated); +	ext4_es_lru_add(inode);  	return err ? 
err : allocated;  } @@ -4386,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)  	last_block = (inode->i_size + sb->s_blocksize - 1)  			>> EXT4_BLOCK_SIZE_BITS(sb); +retry:  	err = ext4_es_remove_extent(inode, last_block,  				    EXT_MAX_BLOCKS - last_block); +	if (err == -ENOMEM) { +		cond_resched(); +		congestion_wait(BLK_RW_ASYNC, HZ/50); +		goto retry; +	} +	if (err) { +		ext4_std_error(inode->i_sb, err); +		return; +	}  	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); +	ext4_std_error(inode->i_sb, err);  }  static void ext4_falloc_update_inode(struct inode *inode, @@ -4446,7 +4480,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  		return -EOPNOTSUPP;  	if (mode & FALLOC_FL_PUNCH_HOLE) -		return ext4_punch_hole(file, offset, len); +		return ext4_punch_hole(inode, offset, len);  	ret = ext4_convert_inline_data(inode);  	if (ret) @@ -4548,10 +4582,9 @@ retry:   * function, to convert the fallocated extents after IO is completed.   * Returns 0 on success.   */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, -				    ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, +				   loff_t offset, ssize_t len)  { -	handle_t *handle;  	unsigned int max_blocks;  	int ret = 0;  	int ret2 = 0; @@ -4566,16 +4599,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,  	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -  		      map.m_lblk);  	/* -	 * credits to insert 1 extent into extent tree +	 * This is somewhat ugly but the idea is clear: When transaction is +	 * reserved, everything goes into it. Otherwise we rather start several +	 * smaller transactions for conversion of each extent separately.  	 
*/ -	credits = ext4_chunk_trans_blocks(inode, max_blocks); +	if (handle) { +		handle = ext4_journal_start_reserved(handle, +						     EXT4_HT_EXT_CONVERT); +		if (IS_ERR(handle)) +			return PTR_ERR(handle); +		credits = 0; +	} else { +		/* +		 * credits to insert 1 extent into extent tree +		 */ +		credits = ext4_chunk_trans_blocks(inode, max_blocks); +	}  	while (ret >= 0 && ret < max_blocks) {  		map.m_lblk += ret;  		map.m_len = (max_blocks -= ret); -		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); -		if (IS_ERR(handle)) { -			ret = PTR_ERR(handle); -			break; +		if (credits) { +			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, +						    credits); +			if (IS_ERR(handle)) { +				ret = PTR_ERR(handle); +				break; +			}  		}  		ret = ext4_map_blocks(handle, inode, &map,  				      EXT4_GET_BLOCKS_IO_CONVERT_EXT); @@ -4586,10 +4635,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,  				     inode->i_ino, map.m_lblk,  				     map.m_len, ret);  		ext4_mark_inode_dirty(handle, inode); -		ret2 = ext4_journal_stop(handle); -		if (ret <= 0 || ret2 ) +		if (credits) +			ret2 = ext4_journal_stop(handle); +		if (ret <= 0 || ret2)  			break;  	} +	if (!credits) +		ret2 = ext4_journal_stop(handle);  	return ret > 0 ? 
ret2 : ret;  } @@ -4659,7 +4711,7 @@ static int ext4_xattr_fiemap(struct inode *inode,  		error = ext4_get_inode_loc(inode, &iloc);  		if (error)  			return error; -		physical = iloc.bh->b_blocknr << blockbits; +		physical = (__u64)iloc.bh->b_blocknr << blockbits;  		offset = EXT4_GOOD_OLD_INODE_SIZE +  				EXT4_I(inode)->i_extra_isize;  		physical += offset; @@ -4667,7 +4719,7 @@ static int ext4_xattr_fiemap(struct inode *inode,  		flags |= FIEMAP_EXTENT_DATA_INLINE;  		brelse(iloc.bh);  	} else { /* external block */ -		physical = EXT4_I(inode)->i_file_acl << blockbits; +		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;  		length = inode->i_sb->s_blocksize;  	} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e6941e622d31..91cb110da1b4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -10,6 +10,7 @@   * Ext4 extents status tree core functions.   */  #include <linux/rbtree.h> +#include <linux/list_sort.h>  #include "ext4.h"  #include "extents_status.h"  #include "ext4_extents.h" @@ -147,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,  			      ext4_lblk_t end);  static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,  				       int nr_to_scan); +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +			    struct ext4_inode_info *locked_ei);  int __init ext4_init_es(void)  { @@ -291,7 +294,6 @@ out:  	read_unlock(&EXT4_I(inode)->i_es_lock); -	ext4_es_lru_add(inode);  	trace_ext4_es_find_delayed_extent_range_exit(inode, es);  } @@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  		 */  		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {  			if (in_range(es->es_lblk, ee_block, ee_len)) { -				pr_warn("ES insert assertation failed for " +				pr_warn("ES insert assertion failed for "  					"inode: %lu we can find an extent "  					"at block [%d/%d/%llu/%c], but we "  					"want to add an delayed/hole extent " @@ 
-458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  		 */  		if (es->es_lblk < ee_block ||  		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { -			pr_warn("ES insert assertation failed for inode: %lu " +			pr_warn("ES insert assertion failed for inode: %lu "  				"ex_status [%d/%d/%llu/%c] != "  				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,  				ee_block, ee_len, ee_start, @@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  		}  		if (ee_status ^ es_status) { -			pr_warn("ES insert assertation failed for inode: %lu " +			pr_warn("ES insert assertion failed for inode: %lu "  				"ex_status [%d/%d/%llu/%c] != "  				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,  				ee_block, ee_len, ee_start, @@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  		 * that we don't want to add an written/unwritten extent.  		 */  		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { -			pr_warn("ES insert assertation failed for inode: %lu " +			pr_warn("ES insert assertion failed for inode: %lu "  				"can't find an extent at block %d but we want "  				"to add an written/unwritten extent "  				"[%d/%d/%llu/%llx]\n", inode->i_ino, @@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,  			 * We want to add a delayed/hole extent but this  			 * block has been allocated.  			 
*/ -			pr_warn("ES insert assertation failed for inode: %lu " +			pr_warn("ES insert assertion failed for inode: %lu "  				"We can find blocks but we want to add a "  				"delayed/hole extent [%d/%d/%llu/%llx]\n",  				inode->i_ino, es->es_lblk, es->es_len, @@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,  			return;  		} else if (ext4_es_is_written(es)) {  			if (retval != es->es_len) { -				pr_warn("ES insert assertation failed for " +				pr_warn("ES insert assertion failed for "  					"inode: %lu retval %d != es_len %d\n",  					inode->i_ino, retval, es->es_len);  				return;  			}  			if (map.m_pblk != ext4_es_pblock(es)) { -				pr_warn("ES insert assertation failed for " +				pr_warn("ES insert assertion failed for "  					"inode: %lu m_pblk %llu != "  					"es_pblk %llu\n",  					inode->i_ino, map.m_pblk, @@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,  		}  	} else if (retval == 0) {  		if (ext4_es_is_written(es)) { -			pr_warn("ES insert assertation failed for inode: %lu " +			pr_warn("ES insert assertion failed for inode: %lu "  				"We can't find the block but we want to add "  				"an written extent [%d/%d/%llu/%llx]\n",  				inode->i_ino, es->es_lblk, es->es_len, @@ -632,10 +634,8 @@ out:  }  /* - * ext4_es_insert_extent() adds a space to a extent status tree. - * - * ext4_es_insert_extent is called by ext4_da_write_begin and - * ext4_es_remove_extent. + * ext4_es_insert_extent() adds information to an inode's extent + * status tree.   *   * Return 0 on success, error code on failure.   
*/ @@ -667,12 +667,17 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,  	err = __es_remove_extent(inode, lblk, end);  	if (err != 0)  		goto error; +retry:  	err = __es_insert_extent(inode, &newes); +	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +					       EXT4_I(inode))) +		goto retry; +	if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) +		err = 0;  error:  	write_unlock(&EXT4_I(inode)->i_es_lock); -	ext4_es_lru_add(inode);  	ext4_es_print_tree(inode);  	return err; @@ -734,7 +739,6 @@ out:  	read_unlock(&EXT4_I(inode)->i_es_lock); -	ext4_es_lru_add(inode);  	trace_ext4_es_lookup_extent_exit(inode, es, found);  	return found;  } @@ -748,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,  	struct extent_status orig_es;  	ext4_lblk_t len1, len2;  	ext4_fsblk_t block; -	int err = 0; +	int err; +retry: +	err = 0;  	es = __es_tree_search(&tree->root, lblk);  	if (!es)  		goto out; @@ -784,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,  			if (err) {  				es->es_lblk = orig_es.es_lblk;  				es->es_len = orig_es.es_len; +				if ((err == -ENOMEM) && +				    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +						     EXT4_I(inode))) +					goto retry;  				goto out;  			}  		} else { @@ -878,38 +888,64 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)  				     EXTENT_STATUS_WRITTEN);  } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, +				     struct list_head *b) +{ +	struct ext4_inode_info *eia, *eib; +	eia = list_entry(a, struct ext4_inode_info, i_es_lru); +	eib = list_entry(b, struct ext4_inode_info, i_es_lru); + +	if (eia->i_touch_when == eib->i_touch_when) +		return 0; +	if (time_after(eia->i_touch_when, eib->i_touch_when)) +		return 1; +	else +		return -1; +} + +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +			    struct 
ext4_inode_info *locked_ei)  { -	struct ext4_sb_info *sbi = container_of(shrink, -					struct ext4_sb_info, s_es_shrinker);  	struct ext4_inode_info *ei; -	struct list_head *cur, *tmp, scanned; -	int nr_to_scan = sc->nr_to_scan; +	struct list_head *cur, *tmp; +	LIST_HEAD(skiped);  	int ret, nr_shrunk = 0; -	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); -	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); - -	if (!nr_to_scan) -		return ret; +	spin_lock(&sbi->s_es_lru_lock); -	INIT_LIST_HEAD(&scanned); +	/* +	 * If the inode that is at the head of LRU list is newer than +	 * last_sorted time, that means that we need to sort this list. +	 */ +	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); +	if (sbi->s_es_last_sorted < ei->i_touch_when) { +		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); +		sbi->s_es_last_sorted = jiffies; +	} -	spin_lock(&sbi->s_es_lru_lock);  	list_for_each_safe(cur, tmp, &sbi->s_es_lru) { -		list_move_tail(cur, &scanned); +		/* +		 * If we have already reclaimed all extents from extent +		 * status tree, just stop the loop immediately. 
+		 */ +		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) +			break;  		ei = list_entry(cur, struct ext4_inode_info, i_es_lru); -		read_lock(&ei->i_es_lock); -		if (ei->i_es_lru_nr == 0) { -			read_unlock(&ei->i_es_lock); +		/* Skip the inode that is newer than the last_sorted time */ +		if (sbi->s_es_last_sorted < ei->i_touch_when) { +			list_move_tail(cur, &skiped);  			continue;  		} -		read_unlock(&ei->i_es_lock); + +		if (ei->i_es_lru_nr == 0 || ei == locked_ei) +			continue;  		write_lock(&ei->i_es_lock);  		ret = __es_try_to_reclaim_extents(ei, nr_to_scan); +		if (ei->i_es_lru_nr == 0) +			list_del_init(&ei->i_es_lru);  		write_unlock(&ei->i_es_lock);  		nr_shrunk += ret; @@ -917,29 +953,50 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)  		if (nr_to_scan == 0)  			break;  	} -	list_splice_tail(&scanned, &sbi->s_es_lru); + +	/* Move the newer inodes into the tail of the LRU list. */ +	list_splice_tail(&skiped, &sbi->s_es_lru);  	spin_unlock(&sbi->s_es_lru_lock); +	if (locked_ei && nr_shrunk == 0) +		nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + +	return nr_shrunk; +} + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ +	struct ext4_sb_info *sbi = container_of(shrink, +					struct ext4_sb_info, s_es_shrinker); +	int nr_to_scan = sc->nr_to_scan; +	int ret, nr_shrunk; + +	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); +	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + +	if (!nr_to_scan) +		return ret; + +	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); +  	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);  	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);  	return ret;  } -void ext4_es_register_shrinker(struct super_block *sb) +void ext4_es_register_shrinker(struct ext4_sb_info *sbi)  { -	struct ext4_sb_info *sbi; - -	sbi = EXT4_SB(sb);  	INIT_LIST_HEAD(&sbi->s_es_lru);  	spin_lock_init(&sbi->s_es_lru_lock); +	
sbi->s_es_last_sorted = 0;  	sbi->s_es_shrinker.shrink = ext4_es_shrink;  	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;  	register_shrinker(&sbi->s_es_shrinker);  } -void ext4_es_unregister_shrinker(struct super_block *sb) +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)  { -	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); +	unregister_shrinker(&sbi->s_es_shrinker);  }  void ext4_es_lru_add(struct inode *inode) @@ -947,11 +1004,14 @@ void ext4_es_lru_add(struct inode *inode)  	struct ext4_inode_info *ei = EXT4_I(inode);  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	ei->i_touch_when = jiffies; + +	if (!list_empty(&ei->i_es_lru)) +		return; +  	spin_lock(&sbi->s_es_lru_lock);  	if (list_empty(&ei->i_es_lru))  		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); -	else -		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);  	spin_unlock(&sbi->s_es_lru_lock);  } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f740eb03b707..e936730cc5b0 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,7 @@  				 EXTENT_STATUS_DELAYED | \  				 EXTENT_STATUS_HOLE) +struct ext4_sb_info;  struct ext4_extent;  struct extent_status { @@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,  	es->es_pblk = block;  } -extern void ext4_es_register_shrinker(struct super_block *sb); -extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);  extern void ext4_es_lru_add(struct inode *inode);  extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1b4d51b5d86..6f4cc567c382 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,  	blkbits = inode->i_sb->s_blocksize_bits;  	startoff = *offset;  	lastoff = startoff; -	endoff = (map->m_lblk + map->m_len) << 
blkbits; +	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;  	index = startoff >> PAGE_CACHE_SHIFT;  	end = endoff >> PAGE_CACHE_SHIFT; @@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  		ret = ext4_map_blocks(NULL, inode, &map, 0);  		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {  			if (last != start) -				dataoff = last << blkbits; +				dataoff = (loff_t)last << blkbits;  			break;  		} @@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  		ext4_es_find_delayed_extent_range(inode, last, last, &es);  		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {  			if (last != start) -				dataoff = last << blkbits; +				dataoff = (loff_t)last << blkbits;  			break;  		} @@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  		}  		last++; -		dataoff = last << blkbits; +		dataoff = (loff_t)last << blkbits;  	} while (last <= end);  	mutex_unlock(&inode->i_mutex); @@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  	if (dataoff > isize)  		return -ENXIO; -	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) -		return -EINVAL; -	if (dataoff > maxsize) -		return -EINVAL; - -	if (dataoff != file->f_pos) { -		file->f_pos = dataoff; -		file->f_version = 0; -	} - -	return dataoff; +	return vfs_setpos(file, dataoff, maxsize);  }  /* @@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)  		ret = ext4_map_blocks(NULL, inode, &map, 0);  		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {  			last += ret; -			holeoff = last << blkbits; +			holeoff = (loff_t)last << blkbits;  			continue;  		} @@ -551,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)  		ext4_es_find_delayed_extent_range(inode, last, last, &es);  		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { 
 			last = es.es_lblk + es.es_len; -			holeoff = last << blkbits; +			holeoff = (loff_t)last << blkbits;  			continue;  		} @@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)  							      &map, &holeoff);  			if (!unwritten) {  				last += ret; -				holeoff = last << blkbits; +				holeoff = (loff_t)last << blkbits;  				continue;  			}  		} @@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)  	if (holeoff > isize)  		holeoff = isize; -	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) -		return -EINVAL; -	if (holeoff > maxsize) -		return -EINVAL; - -	if (holeoff != file->f_pos) { -		file->f_pos = holeoff; -		file->f_version = 0; -	} - -	return holeoff; +	return vfs_setpos(file, holeoff, maxsize);  }  /* diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e0ba8a408def..a8bc47f75fa0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)  	return ret;  } -/** - * __sync_file - generic_file_fsync without the locking and filemap_write - * @inode:	inode to sync - * @datasync:	only sync essential metadata if true - * - * This is just generic_file_fsync without the locking.  This is needed for - * nojournal mode to make sure this inodes data/metadata makes it to disk - * properly.  The i_mutex should be held already. - */ -static int __sync_inode(struct inode *inode, int datasync) -{ -	int err; -	int ret; - -	ret = sync_mapping_buffers(inode->i_mapping); -	if (!(inode->i_state & I_DIRTY)) -		return ret; -	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) -		return ret; - -	err = sync_inode_metadata(inode, 1); -	if (ret == 0) -		ret = err; -	return ret; -} -  /*   * akpm: A new design for ext4_sync_file().   
* @@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	struct inode *inode = file->f_mapping->host;  	struct ext4_inode_info *ei = EXT4_I(inode);  	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; -	int ret, err; +	int ret = 0, err;  	tid_t commit_tid;  	bool needs_barrier = false; @@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	trace_ext4_sync_file_enter(file, datasync); -	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); -	if (ret) -		return ret; -	mutex_lock(&inode->i_mutex); - -	if (inode->i_sb->s_flags & MS_RDONLY) -		goto out; - -	ret = ext4_flush_unwritten_io(inode); -	if (ret < 0) +	if (inode->i_sb->s_flags & MS_RDONLY) { +		/* Make sure that we read updated s_mount_flags value */ +		smp_rmb(); +		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) +			ret = -EROFS;  		goto out; +	}  	if (!journal) { -		ret = __sync_inode(inode, datasync); +		ret = generic_file_fsync(file, start, end, datasync);  		if (!ret && !hlist_empty(&inode->i_dentry))  			ret = ext4_sync_parent(inode);  		goto out;  	} +	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +	if (ret) +		return ret;  	/*  	 * data=writeback,ordered:  	 *  The caller's filemap_fdatawrite()/wait will sync the data. 
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  		if (!ret)  			ret = err;  	} - out: -	mutex_unlock(&inode->i_mutex); +out:  	trace_ext4_sync_file_exit(inode, ret);  	return ret;  } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00a818d67b54..8bf5999875ee 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -734,11 +734,8 @@ repeat_in_this_group:  		ino = ext4_find_next_zero_bit((unsigned long *)  					      inode_bitmap_bh->b_data,  					      EXT4_INODES_PER_GROUP(sb), ino); -		if (ino >= EXT4_INODES_PER_GROUP(sb)) { -			if (++group == ngroups) -				group = 0; -			continue; -		} +		if (ino >= EXT4_INODES_PER_GROUP(sb)) +			goto next_group;  		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {  			ext4_error(sb, "reserved inode found cleared - "  				   "inode=%lu", ino + 1); @@ -747,7 +744,8 @@ repeat_in_this_group:  		if (!handle) {  			BUG_ON(nblocks <= 0);  			handle = __ext4_journal_start_sb(dir->i_sb, line_no, -							 handle_type, nblocks); +							 handle_type, nblocks, +							 0);  			if (IS_ERR(handle)) {  				err = PTR_ERR(handle);  				ext4_std_error(sb, err); @@ -768,6 +766,9 @@ repeat_in_this_group:  			goto got; /* we grabbed the inode! 
*/  		if (ino < EXT4_INODES_PER_GROUP(sb))  			goto repeat_in_this_group; +next_group: +		if (++group == ngroups) +			group = 0;  	}  	err = -ENOSPC;  	goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b8d5d351e24f..87b30cd357e7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -624,7 +624,7 @@ cleanup:  		partial--;  	}  out: -	trace_ext4_ind_map_blocks_exit(inode, map, err); +	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);  	return err;  } @@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  retry:  	if (rw == READ && ext4_should_dioread_nolock(inode)) { -		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { -			mutex_lock(&inode->i_mutex); -			ext4_flush_unwritten_io(inode); -			mutex_unlock(&inode->i_mutex); -		}  		/*  		 * Nolock dioread optimization may be dynamically disabled  		 * via ext4_inode_block_unlocked_dio(). Check inode's state @@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)  	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;  } -int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)  { -	int indirects; - -	/* if nrblocks are contiguous */ -	if (chunk) { -		/* -		 * With N contiguous data blocks, we need at most -		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, -		 * 2 dindirect blocks, and 1 tindirect block -		 */ -		return DIV_ROUND_UP(nrblocks, -				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; -	}  	/* -	 * if nrblocks are not contiguous, worse case, each block touch -	 * a indirect block, and each indirect block touch a double indirect -	 * block, plus a triple indirect block +	 * With N contiguous data blocks, we need at most +	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, +	 * 2 dindirect blocks, and 1 
tindirect block  	 */ -	indirects = nrblocks * 2 + 1; -	return indirects; +	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;  }  /* @@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,  			     __le32 *last)  {  	__le32 *p; -	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; +	int	flags = EXT4_FREE_BLOCKS_VALIDATED;  	int	err;  	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) -		flags |= EXT4_FREE_BLOCKS_METADATA; +		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; +	else if (ext4_should_journal_data(inode)) +		flags |= EXT4_FREE_BLOCKS_FORGET;  	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,  				   count)) { diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3e2bf873e8a8..d9ecbf1113a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,  		entry = (struct ext4_xattr_entry *)  			((void *)raw_inode + EXT4_I(inode)->i_inline_off); -		free += le32_to_cpu(entry->e_value_size); +		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));  		goto out;  	} @@ -1404,16 +1404,15 @@ out:   * offset as if '.' and '..' really take place.   
*   */ -int ext4_read_inline_dir(struct file *filp, -			 void *dirent, filldir_t filldir, +int ext4_read_inline_dir(struct file *file, +			 struct dir_context *ctx,  			 int *has_inline_data)  { -	int error = 0;  	unsigned int offset, parent_ino; -	int i, stored; +	int i;  	struct ext4_dir_entry_2 *de;  	struct super_block *sb; -	struct inode *inode = file_inode(filp); +	struct inode *inode = file_inode(file);  	int ret, inline_size = 0;  	struct ext4_iloc iloc;  	void *dir_buf = NULL; @@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,  		goto out;  	sb = inode->i_sb; -	stored = 0;  	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); -	offset = filp->f_pos; +	offset = ctx->pos;  	/*  	 * dotdot_offset and dotdot_size is the real offset and @@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,  	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;  	extra_size = extra_offset + inline_size; -	while (!error && !stored && filp->f_pos < extra_size) { -revalidate: -		/* -		 * If the version has changed since the last call to -		 * readdir(2), then we might be pointing to an invalid -		 * dirent right now.  Scan from the start of the inline -		 * dir to make sure. -		 */ -		if (filp->f_version != inode->i_version) { -			for (i = 0; i < extra_size && i < offset;) { -				/* -				 * "." is with offset 0 and -				 * ".." is dotdot_offset. -				 */ -				if (!i) { -					i = dotdot_offset; -					continue; -				} else if (i == dotdot_offset) { -					i = dotdot_size; -					continue; -				} -				/* for other entry, the real offset in -				 * the buf has to be tuned accordingly. -				 */ -				de = (struct ext4_dir_entry_2 *) -					(dir_buf + i - extra_offset); -				/* It's too expensive to do a full -				 * dirent test each time round this -				 * loop, but we do have to test at -				 * least that it is non-zero.  A -				 * failure will be detected in the -				 * dirent test below. 
*/ -				if (ext4_rec_len_from_disk(de->rec_len, -					extra_size) < EXT4_DIR_REC_LEN(1)) -					break; -				i += ext4_rec_len_from_disk(de->rec_len, -							    extra_size); -			} -			offset = i; -			filp->f_pos = offset; -			filp->f_version = inode->i_version; -		} - -		while (!error && filp->f_pos < extra_size) { -			if (filp->f_pos == 0) { -				error = filldir(dirent, ".", 1, 0, inode->i_ino, -						DT_DIR); -				if (error) -					break; -				stored++; -				filp->f_pos = dotdot_offset; +	/* +	 * If the version has changed since the last call to +	 * readdir(2), then we might be pointing to an invalid +	 * dirent right now.  Scan from the start of the inline +	 * dir to make sure. +	 */ +	if (file->f_version != inode->i_version) { +		for (i = 0; i < extra_size && i < offset;) { +			/* +			 * "." is with offset 0 and +			 * ".." is dotdot_offset. +			 */ +			if (!i) { +				i = dotdot_offset;  				continue; -			} - -			if (filp->f_pos == dotdot_offset) { -				error = filldir(dirent, "..", 2, -						dotdot_offset, -						parent_ino, DT_DIR); -				if (error) -					break; -				stored++; - -				filp->f_pos = dotdot_size; +			} else if (i == dotdot_offset) { +				i = dotdot_size;  				continue;  			} - +			/* for other entry, the real offset in +			 * the buf has to be tuned accordingly. +			 */  			de = (struct ext4_dir_entry_2 *) -				(dir_buf + filp->f_pos - extra_offset); -			if (ext4_check_dir_entry(inode, filp, de, -						 iloc.bh, dir_buf, -						 extra_size, filp->f_pos)) { -				ret = stored; +				(dir_buf + i - extra_offset); +			/* It's too expensive to do a full +			 * dirent test each time round this +			 * loop, but we do have to test at +			 * least that it is non-zero.  A +			 * failure will be detected in the +			 * dirent test below. 
*/ +			if (ext4_rec_len_from_disk(de->rec_len, extra_size) +				< EXT4_DIR_REC_LEN(1)) +				break; +			i += ext4_rec_len_from_disk(de->rec_len, +						    extra_size); +		} +		offset = i; +		ctx->pos = offset; +		file->f_version = inode->i_version; +	} + +	while (ctx->pos < extra_size) { +		if (ctx->pos == 0) { +			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))  				goto out; -			} -			if (le32_to_cpu(de->inode)) { -				/* We might block in the next section -				 * if the data destination is -				 * currently swapped out.  So, use a -				 * version stamp to detect whether or -				 * not the directory has been modified -				 * during the copy operation. -				 */ -				u64 version = filp->f_version; +			ctx->pos = dotdot_offset; +			continue; +		} -				error = filldir(dirent, de->name, -						de->name_len, -						filp->f_pos, -						le32_to_cpu(de->inode), -						get_dtype(sb, de->file_type)); -				if (error) -					break; -				if (version != filp->f_version) -					goto revalidate; -				stored++; -			} -			filp->f_pos += ext4_rec_len_from_disk(de->rec_len, -							      extra_size); +		if (ctx->pos == dotdot_offset) { +			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) +				goto out; +			ctx->pos = dotdot_size; +			continue; +		} + +		de = (struct ext4_dir_entry_2 *) +			(dir_buf + ctx->pos - extra_offset); +		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, +					 extra_size, ctx->pos)) +			goto out; +		if (le32_to_cpu(de->inode)) { +			if (!dir_emit(ctx, de->name, de->name_len, +				      le32_to_cpu(de->inode), +				      get_dtype(sb, de->file_type))) +				goto out;  		} +		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);  	}  out:  	kfree(dir_buf); @@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,  	if (error)  		goto out; -	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; +	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;  	physical += (char *)ext4_raw_inode(&iloc) - 
iloc.bh->b_data;  	physical += offsetof(struct ext4_inode, i_block);  	length = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b89ecbd..dd32a2eacd0d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,  						   new_size);  } -static void ext4_invalidatepage(struct page *page, unsigned long offset); +static void ext4_invalidatepage(struct page *page, unsigned int offset, +				unsigned int length);  static int __ext4_journalled_writepage(struct page *page, unsigned int len);  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, -		struct inode *inode, struct page *page, loff_t from, -		loff_t length, int flags); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +				  int pextents);  /*   * Test whether an inode is a fast symlink. @@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)  			filemap_write_and_wait(&inode->i_data);  		}  		truncate_inode_pages(&inode->i_data, 0); -		ext4_ioend_shutdown(inode); + +		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));  		goto no_delete;  	} @@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)  	if (ext4_should_order_data(inode))  		ext4_begin_ordered_truncate(inode, 0);  	truncate_inode_pages(&inode->i_data, 0); -	ext4_ioend_shutdown(inode); +	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));  	if (is_bad_inode(inode))  		goto no_delete; @@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,  #define check_block_validity(inode, map)	\  	__check_block_validity((inode), __func__, __LINE__, (map)) -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. 
- */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, -				    unsigned int max_pages) -{ -	struct address_space *mapping = inode->i_mapping; -	pgoff_t	index; -	struct pagevec pvec; -	pgoff_t num = 0; -	int i, nr_pages, done = 0; - -	if (max_pages == 0) -		return 0; -	pagevec_init(&pvec, 0); -	while (!done) { -		index = idx; -		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, -					      PAGECACHE_TAG_DIRTY, -					      (pgoff_t)PAGEVEC_SIZE); -		if (nr_pages == 0) -			break; -		for (i = 0; i < nr_pages; i++) { -			struct page *page = pvec.pages[i]; -			struct buffer_head *bh, *head; - -			lock_page(page); -			if (unlikely(page->mapping != mapping) || -			    !PageDirty(page) || -			    PageWriteback(page) || -			    page->index != idx) { -				done = 1; -				unlock_page(page); -				break; -			} -			if (page_has_buffers(page)) { -				bh = head = page_buffers(page); -				do { -					if (!buffer_delay(bh) && -					    !buffer_unwritten(bh)) -						done = 1; -					bh = bh->b_this_page; -				} while (!done && (bh != head)); -			} -			unlock_page(page); -			if (done) -				break; -			idx++; -			num++; -			if (num >= max_pages) { -				done = 1; -				break; -			} -		} -		pagevec_release(&pvec); -	} -	return num; -} -  #ifdef ES_AGGRESSIVE_TEST  static void ext4_map_blocks_es_recheck(handle_t *handle,  				       struct inode *inode, @@ -524,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,  	if (es_map->m_lblk != map->m_lblk ||  	    es_map->m_flags != map->m_flags ||  	    es_map->m_pblk != map->m_pblk) { -		printk("ES cache assertation failed for inode: %lu " +		printk("ES cache assertion failed for inode: %lu "  		       "es_cached ex [%d/%d/%llu/%x] != "  		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",  		       inode->i_ino, es_map->m_lblk, es_map->m_len, @@ -575,6 +516,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  	/* Lookup extent status tree firstly */  	if (ext4_es_lookup_extent(inode, 
map->m_lblk, &es)) { +		ext4_es_lru_add(inode);  		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {  			map->m_pblk = ext4_es_pblock(&es) +  					map->m_lblk - es.es_lblk; @@ -613,14 +555,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		int ret;  		unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST -		if (retval != map->m_len) { -			printk("ES len assertation failed for inode: %lu " -			       "retval %d != map->m_len %d " -			       "in %s (lookup)\n", inode->i_ino, retval, -			       map->m_len, __func__); +		if (unlikely(retval != map->m_len)) { +			ext4_warning(inode->i_sb, +				     "ES len assertion failed for inode " +				     "%lu: retval %d != map->m_len %d", +				     inode->i_ino, retval, map->m_len); +			WARN_ON(1);  		} -#endif  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -714,14 +655,13 @@ found:  		int ret;  		unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST -		if (retval != map->m_len) { -			printk("ES len assertation failed for inode: %lu " -			       "retval %d != map->m_len %d " -			       "in %s (allocation)\n", inode->i_ino, retval, -			       map->m_len, __func__); +		if (unlikely(retval != map->m_len)) { +			ext4_warning(inode->i_sb, +				     "ES len assertion failed for inode " +				     "%lu: retval %d != map->m_len %d", +				     inode->i_ino, retval, map->m_len); +			WARN_ON(1);  		} -#endif  		/*  		 * If the extent has been zeroed out, we don't need to update @@ -1118,10 +1058,13 @@ static int ext4_write_end(struct file *file,  		}  	} -	if (ext4_has_inline_data(inode)) -		copied = ext4_write_inline_data_end(inode, pos, len, -						    copied, page); -	else +	if (ext4_has_inline_data(inode)) { +		ret = ext4_write_inline_data_end(inode, pos, len, +						 copied, page); +		if (ret < 0) +			goto errout; +		copied = ret; +	} else  		copied = block_write_end(file, mapping, pos,  					 len, copied, page, fsdata); @@ -1157,8 +1100,6 @@ static int 
ext4_write_end(struct file *file,  	if (i_size_changed)  		ext4_mark_inode_dirty(handle, inode); -	if (copied < 0) -		ret = copied;  	if (pos + len > inode->i_size && ext4_can_truncate(inode))  		/* if we have allocated more blocks and copied  		 * less. We will have blocks allocated outside @@ -1415,21 +1356,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)  }  static void ext4_da_page_release_reservation(struct page *page, -					     unsigned long offset) +					     unsigned int offset, +					     unsigned int length)  {  	int to_release = 0;  	struct buffer_head *head, *bh;  	unsigned int curr_off = 0;  	struct inode *inode = page->mapping->host;  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	unsigned int stop = offset + length;  	int num_clusters;  	ext4_fsblk_t lblk; +	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); +  	head = page_buffers(page);  	bh = head;  	do {  		unsigned int next_off = curr_off + bh->b_size; +		if (next_off > stop) +			break; +  		if ((offset <= curr_off) && (buffer_delay(bh))) {  			to_release++;  			clear_buffer_delay(bh); @@ -1460,140 +1408,43 @@ static void ext4_da_page_release_reservation(struct page *page,   * Delayed allocation stuff   */ -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. 
- * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, -			      struct ext4_map_blocks *map) -{ -	struct pagevec pvec; -	unsigned long index, end; -	int ret = 0, err, nr_pages, i; -	struct inode *inode = mpd->inode; -	struct address_space *mapping = inode->i_mapping; -	loff_t size = i_size_read(inode); -	unsigned int len, block_start; -	struct buffer_head *bh, *page_bufs = NULL; -	sector_t pblock = 0, cur_logical = 0; -	struct ext4_io_submit io_submit; +struct mpage_da_data { +	struct inode *inode; +	struct writeback_control *wbc; -	BUG_ON(mpd->next_page <= mpd->first_page); -	memset(&io_submit, 0, sizeof(io_submit)); +	pgoff_t first_page;	/* The first page to write */ +	pgoff_t next_page;	/* Current page to examine */ +	pgoff_t last_page;	/* Last page to examine */  	/* -	 * We need to start from the first_page to the next_page - 1 -	 * to make sure we also write the mapped dirty buffer_heads. -	 * If we look at mpd->b_blocknr we would only be looking -	 * at the currently mapped buffer_heads. +	 * Extent to map - this can be after first_page because that can be +	 * fully mapped. We somewhat abuse m_flags to store whether the extent +	 * is delalloc or unwritten.  	 
*/ -	index = mpd->first_page; -	end = mpd->next_page - 1; - -	pagevec_init(&pvec, 0); -	while (index <= end) { -		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); -		if (nr_pages == 0) -			break; -		for (i = 0; i < nr_pages; i++) { -			int skip_page = 0; -			struct page *page = pvec.pages[i]; - -			index = page->index; -			if (index > end) -				break; - -			if (index == size >> PAGE_CACHE_SHIFT) -				len = size & ~PAGE_CACHE_MASK; -			else -				len = PAGE_CACHE_SIZE; -			if (map) { -				cur_logical = index << (PAGE_CACHE_SHIFT - -							inode->i_blkbits); -				pblock = map->m_pblk + (cur_logical - -							map->m_lblk); -			} -			index++; - -			BUG_ON(!PageLocked(page)); -			BUG_ON(PageWriteback(page)); - -			bh = page_bufs = page_buffers(page); -			block_start = 0; -			do { -				if (map && (cur_logical >= map->m_lblk) && -				    (cur_logical <= (map->m_lblk + -						     (map->m_len - 1)))) { -					if (buffer_delay(bh)) { -						clear_buffer_delay(bh); -						bh->b_blocknr = pblock; -					} -					if (buffer_unwritten(bh) || -					    buffer_mapped(bh)) -						BUG_ON(bh->b_blocknr != pblock); -					if (map->m_flags & EXT4_MAP_UNINIT) -						set_buffer_uninit(bh); -					clear_buffer_unwritten(bh); -				} - -				/* -				 * skip page if block allocation undone and -				 * block is dirty -				 */ -				if (ext4_bh_delay_or_unwritten(NULL, bh)) -					skip_page = 1; -				bh = bh->b_this_page; -				block_start += bh->b_size; -				cur_logical++; -				pblock++; -			} while (bh != page_bufs); - -			if (skip_page) { -				unlock_page(page); -				continue; -			} - -			clear_page_dirty_for_io(page); -			err = ext4_bio_write_page(&io_submit, page, len, -						  mpd->wbc); -			if (!err) -				mpd->pages_written++; -			/* -			 * In error case, we have to continue because -			 * remaining pages are still locked -			 */ -			if (ret == 0) -				ret = err; -		} -		pagevec_release(&pvec); -	} -	ext4_io_submit(&io_submit); -	return ret; -} +	struct ext4_map_blocks map; +	
struct ext4_io_submit io_submit;	/* IO submission data */ +}; -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +static void mpage_release_unused_pages(struct mpage_da_data *mpd, +				       bool invalidate)  {  	int nr_pages, i;  	pgoff_t index, end;  	struct pagevec pvec;  	struct inode *inode = mpd->inode;  	struct address_space *mapping = inode->i_mapping; -	ext4_lblk_t start, last; + +	/* This is necessary when next_page == 0. */ +	if (mpd->first_page >= mpd->next_page) +		return;  	index = mpd->first_page;  	end   = mpd->next_page - 1; - -	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); -	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); -	ext4_es_remove_extent(inode, start, last - start + 1); +	if (invalidate) { +		ext4_lblk_t start, last; +		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); +		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); +		ext4_es_remove_extent(inode, start, last - start + 1); +	}  	pagevec_init(&pvec, 0);  	while (index <= end) { @@ -1606,14 +1457,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)  				break;  			BUG_ON(!PageLocked(page));  			BUG_ON(PageWriteback(page)); -			block_invalidatepage(page, 0); -			ClearPageUptodate(page); +			if (invalidate) { +				block_invalidatepage(page, 0, PAGE_CACHE_SIZE); +				ClearPageUptodate(page); +			}  			unlock_page(page);  		}  		index = pvec.pages[nr_pages - 1]->index + 1;  		pagevec_release(&pvec);  	} -	return;  }  static void ext4_print_free_blocks(struct inode *inode) @@ -1642,215 +1494,6 @@ static void ext4_print_free_blocks(struct inode *inode)  	return;  } -/* - * mpage_da_map_and_submit - go through given space, map them - *       if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. 
- * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ -	int err, blks, get_blocks_flags; -	struct ext4_map_blocks map, *mapp = NULL; -	sector_t next = mpd->b_blocknr; -	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; -	loff_t disksize = EXT4_I(mpd->inode)->i_disksize; -	handle_t *handle = NULL; - -	/* -	 * If the blocks are mapped already, or we couldn't accumulate -	 * any blocks, then proceed immediately to the submission stage. -	 */ -	if ((mpd->b_size == 0) || -	    ((mpd->b_state  & (1 << BH_Mapped)) && -	     !(mpd->b_state & (1 << BH_Delay)) && -	     !(mpd->b_state & (1 << BH_Unwritten)))) -		goto submit_io; - -	handle = ext4_journal_current_handle(); -	BUG_ON(!handle); - -	/* -	 * Call ext4_map_blocks() to allocate any delayed allocation -	 * blocks, or to convert an uninitialized extent to be -	 * initialized (in the case where we have written into -	 * one or more preallocated blocks). -	 * -	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to -	 * indicate that we are on the delayed allocation path.  This -	 * affects functions in many different parts of the allocation -	 * call path.  This flag exists primarily because we don't -	 * want to change *many* call functions, so ext4_map_blocks() -	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the -	 * inode's allocation semaphore is taken. -	 * -	 * If the blocks in questions were delalloc blocks, set -	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting -	 * variables are updated after the blocks have been allocated. -	 */ -	map.m_lblk = next; -	map.m_len = max_blocks; -	/* -	 * We're in delalloc path and it is possible that we're going to -	 * need more metadata blocks than previously reserved. However -	 * we must not fail because we're in writeback and there is -	 * nothing we can do about it so it might result in data loss. -	 * So use reserved blocks to allocate metadata if possible. 
-	 */ -	get_blocks_flags = EXT4_GET_BLOCKS_CREATE | -			   EXT4_GET_BLOCKS_METADATA_NOFAIL; -	if (ext4_should_dioread_nolock(mpd->inode)) -		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; -	if (mpd->b_state & (1 << BH_Delay)) -		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - -	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); -	if (blks < 0) { -		struct super_block *sb = mpd->inode->i_sb; - -		err = blks; -		/* -		 * If get block returns EAGAIN or ENOSPC and there -		 * appears to be free blocks we will just let -		 * mpage_da_submit_io() unlock all of the pages. -		 */ -		if (err == -EAGAIN) -			goto submit_io; - -		if (err == -ENOSPC && ext4_count_free_clusters(sb)) { -			mpd->retval = err; -			goto submit_io; -		} - -		/* -		 * get block failure will cause us to loop in -		 * writepages, because a_ops->writepage won't be able -		 * to make progress. The page will be redirtied by -		 * writepage and writepages will again try to write -		 * the same. -		 */ -		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { -			ext4_msg(sb, KERN_CRIT, -				 "delayed block allocation failed for inode %lu " -				 "at logical offset %llu with max blocks %zd " -				 "with error %d", mpd->inode->i_ino, -				 (unsigned long long) next, -				 mpd->b_size >> mpd->inode->i_blkbits, err); -			ext4_msg(sb, KERN_CRIT, -				"This should not happen!! Data will be lost"); -			if (err == -ENOSPC) -				ext4_print_free_blocks(mpd->inode); -		} -		/* invalidate all the pages */ -		ext4_da_block_invalidatepages(mpd); - -		/* Mark this page range as having been completed */ -		mpd->io_done = 1; -		return; -	} -	BUG_ON(blks == 0); - -	mapp = &map; -	if (map.m_flags & EXT4_MAP_NEW) { -		struct block_device *bdev = mpd->inode->i_sb->s_bdev; -		int i; - -		for (i = 0; i < map.m_len; i++) -			unmap_underlying_metadata(bdev, map.m_pblk + i); -	} - -	/* -	 * Update on-disk size along with block allocation. 
-	 */ -	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; -	if (disksize > i_size_read(mpd->inode)) -		disksize = i_size_read(mpd->inode); -	if (disksize > EXT4_I(mpd->inode)->i_disksize) { -		ext4_update_i_disksize(mpd->inode, disksize); -		err = ext4_mark_inode_dirty(handle, mpd->inode); -		if (err) -			ext4_error(mpd->inode->i_sb, -				   "Failed to mark inode %lu dirty", -				   mpd->inode->i_ino); -	} - -submit_io: -	mpage_da_submit_io(mpd, mapp); -	mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ -		(1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @b_state - b_state of the buffer head added - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, -				   unsigned long b_state) -{ -	sector_t next; -	int blkbits = mpd->inode->i_blkbits; -	int nrblocks = mpd->b_size >> blkbits; - -	/* -	 * XXX Don't go larger than mballoc is willing to allocate -	 * This is a stopgap solution.  We eventually need to fold -	 * mpage_da_submit_io() into this function and then call -	 * ext4_map_blocks() multiple times in a loop -	 */ -	if (nrblocks >= (8*1024*1024 >> blkbits)) -		goto flush_it; - -	/* check if the reserved journal credits might overflow */ -	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { -		if (nrblocks >= EXT4_MAX_TRANS_DATA) { -			/* -			 * With non-extent format we are limited by the journal -			 * credit available.  Total credit needed to insert -			 * nrblocks contiguous blocks is dependent on the -			 * nrblocks.  So limit nrblocks. 
-			 */ -			goto flush_it; -		} -	} -	/* -	 * First block in the extent -	 */ -	if (mpd->b_size == 0) { -		mpd->b_blocknr = logical; -		mpd->b_size = 1 << blkbits; -		mpd->b_state = b_state & BH_FLAGS; -		return; -	} - -	next = mpd->b_blocknr + nrblocks; -	/* -	 * Can we merge the block to our big extent? -	 */ -	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { -		mpd->b_size += 1 << blkbits; -		return; -	} - -flush_it: -	/* -	 * We couldn't merge the block to our extent, so we -	 * need to flush current  extent and start new one -	 */ -	mpage_da_map_and_submit(mpd); -	return; -} -  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)  {  	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); @@ -1885,7 +1528,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  	/* Lookup extent status tree firstly */  	if (ext4_es_lookup_extent(inode, iblock, &es)) { - +		ext4_es_lru_add(inode);  		if (ext4_es_is_hole(&es)) {  			retval = 0;  			down_read((&EXT4_I(inode)->i_data_sem)); @@ -1992,14 +1635,13 @@ add_delayed:  		int ret;  		unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST -		if (retval != map->m_len) { -			printk("ES len assertation failed for inode: %lu " -			       "retval %d != map->m_len %d " -			       "in %s (lookup)\n", inode->i_ino, retval, -			       map->m_len, __func__); +		if (unlikely(retval != map->m_len)) { +			ext4_warning(inode->i_sb, +				     "ES len assertion failed for inode " +				     "%lu: retval %d != map->m_len %d", +				     inode->i_ino, retval, map->m_len); +			WARN_ON(1);  		} -#endif  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -2156,7 +1798,7 @@ out:   * lock so we have to do some magic.   *   * This function can get called via... 
- *   - ext4_da_writepages after taking page lock (have journal handle) + *   - ext4_writepages after taking page lock (have journal handle)   *   - journal_submit_inode_data_buffers (no journal handle)   *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)   *   - grab_page_cache when doing write_begin (have journal handle) @@ -2234,76 +1876,405 @@ static int ext4_writepage(struct page *page,  		 */  		return __ext4_journalled_writepage(page, len); -	memset(&io_submit, 0, sizeof(io_submit)); +	ext4_io_submit_init(&io_submit, wbc); +	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); +	if (!io_submit.io_end) { +		redirty_page_for_writepage(wbc, page); +		unlock_page(page); +		return -ENOMEM; +	}  	ret = ext4_bio_write_page(&io_submit, page, len, wbc);  	ext4_io_submit(&io_submit); +	/* Drop io_end reference we got from init */ +	ext4_put_io_end_defer(io_submit.io_end);  	return ret;  } +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) +  /* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks upto full group size.   */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 -static int ext4_da_writepages_trans_blocks(struct inode *inode) +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @b_state - b_state of the buffer head added + * + * the function is used to collect contig. 
blocks in same state + */ +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, +				  unsigned long b_state) +{ +	struct ext4_map_blocks *map = &mpd->map; + +	/* Don't go larger than mballoc is willing to allocate */ +	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) +		return 0; + +	/* First block in the extent? */ +	if (map->m_len == 0) { +		map->m_lblk = lblk; +		map->m_len = 1; +		map->m_flags = b_state & BH_FLAGS; +		return 1; +	} + +	/* Can we merge the block to our big extent? */ +	if (lblk == map->m_lblk + map->m_len && +	    (b_state & BH_FLAGS) == map->m_flags) { +		map->m_len++; +		return 1; +	} +	return 0; +} + +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, +				    struct buffer_head *head, +				    struct buffer_head *bh, +				    ext4_lblk_t lblk) +{ +	struct inode *inode = mpd->inode; +	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) +							>> inode->i_blkbits; + +	do { +		BUG_ON(buffer_locked(bh)); + +		if (!buffer_dirty(bh) || !buffer_mapped(bh) || +		    (!buffer_delay(bh) && !buffer_unwritten(bh)) || +		    lblk >= blocks) { +			/* Found extent to map? 
*/ +			if (mpd->map.m_len) +				return false; +			if (lblk >= blocks) +				return true; +			continue; +		} +		if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) +			return false; +	} while (lblk++, (bh = bh->b_this_page) != head); +	return true; +} + +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ +	int len; +	loff_t size = i_size_read(mpd->inode); +	int err; + +	BUG_ON(page->index != mpd->first_page); +	if (page->index == size >> PAGE_CACHE_SHIFT) +		len = size & ~PAGE_CACHE_MASK; +	else +		len = PAGE_CACHE_SIZE; +	clear_page_dirty_for_io(page); +	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); +	if (!err) +		mpd->wbc->nr_to_write--; +	mpd->first_page++; + +	return err; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + *		       submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to uninitialized extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. 
+ */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) +{ +	struct pagevec pvec; +	int nr_pages, i; +	struct inode *inode = mpd->inode; +	struct buffer_head *head, *bh; +	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; +	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) +							>> inode->i_blkbits; +	pgoff_t start, end; +	ext4_lblk_t lblk; +	sector_t pblock; +	int err; + +	start = mpd->map.m_lblk >> bpp_bits; +	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; +	lblk = start << bpp_bits; +	pblock = mpd->map.m_pblk; + +	pagevec_init(&pvec, 0); +	while (start <= end) { +		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, +					  PAGEVEC_SIZE); +		if (nr_pages == 0) +			break; +		for (i = 0; i < nr_pages; i++) { +			struct page *page = pvec.pages[i]; + +			if (page->index > end) +				break; +			/* Upto 'end' pages must be contiguous */ +			BUG_ON(page->index != start); +			bh = head = page_buffers(page); +			do { +				if (lblk < mpd->map.m_lblk) +					continue; +				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { +					/* +					 * Buffer after end of mapped extent. +					 * Find next buffer in the page to map. +					 */ +					mpd->map.m_len = 0; +					mpd->map.m_flags = 0; +					add_page_bufs_to_extent(mpd, head, bh, +								lblk); +					pagevec_release(&pvec); +					return 0; +				} +				if (buffer_delay(bh)) { +					clear_buffer_delay(bh); +					bh->b_blocknr = pblock++; +				} +				clear_buffer_unwritten(bh); +			} while (++lblk < blocks && +				 (bh = bh->b_this_page) != head); + +			/* +			 * FIXME: This is going to break if dioread_nolock +			 * supports blocksize < pagesize as we will try to +			 * convert potentially unmapped parts of inode. +			 */ +			mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; +			/* Page fully mapped - let IO run! 
*/ +			err = mpage_submit_page(mpd, page); +			if (err < 0) { +				pagevec_release(&pvec); +				return err; +			} +			start++; +		} +		pagevec_release(&pvec); +	} +	/* Extent fully mapped and matches with page boundary. We are done. */ +	mpd->map.m_len = 0; +	mpd->map.m_flags = 0; +	return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)  { -	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; +	struct inode *inode = mpd->inode; +	struct ext4_map_blocks *map = &mpd->map; +	int get_blocks_flags; +	int err; +	trace_ext4_da_write_pages_extent(inode, map);  	/* -	 * With non-extent format the journal credit needed to -	 * insert nrblocks contiguous block is dependent on -	 * number of contiguous block. So we will limit -	 * number of contiguous block to a sane value +	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or +	 * to convert an uninitialized extent to be initialized (in the case +	 * where we have written into one or more preallocated blocks).  It is +	 * possible that we're going to need more metadata blocks than +	 * previously reserved. However we must not fail because we're in +	 * writeback and there is nothing we can do about it so it might result +	 * in data loss.  So use reserved blocks to allocate metadata if +	 * possible. +	 * +	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks +	 * in question are delalloc blocks.  This affects functions in many +	 * different parts of the allocation call path.  This flag exists +	 * primarily because we don't want to change *many* call functions, so +	 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag +	 * once the inode's allocation semaphore is taken.  	 
*/ -	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && -	    (max_blocks > EXT4_MAX_TRANS_DATA)) -		max_blocks = EXT4_MAX_TRANS_DATA; +	get_blocks_flags = EXT4_GET_BLOCKS_CREATE | +			   EXT4_GET_BLOCKS_METADATA_NOFAIL; +	if (ext4_should_dioread_nolock(inode)) +		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; +	if (map->m_flags & (1 << BH_Delay)) +		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + +	err = ext4_map_blocks(handle, inode, map, get_blocks_flags); +	if (err < 0) +		return err; +	if (map->m_flags & EXT4_MAP_UNINIT) { +		if (!mpd->io_submit.io_end->handle && +		    ext4_handle_valid(handle)) { +			mpd->io_submit.io_end->handle = handle->h_rsv_handle; +			handle->h_rsv_handle = NULL; +		} +		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); +	} -	return ext4_chunk_trans_blocks(inode, max_blocks); +	BUG_ON(map->m_len == 0); +	if (map->m_flags & EXT4_MAP_NEW) { +		struct block_device *bdev = inode->i_sb->s_bdev; +		int i; + +		for (i = 0; i < map->m_len; i++) +			unmap_underlying_metadata(bdev, map->m_pblk + i); +	} +	return 0;  }  /* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + *				 mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. 
On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO.   */ -static int write_cache_pages_da(handle_t *handle, -				struct address_space *mapping, -				struct writeback_control *wbc, -				struct mpage_da_data *mpd, -				pgoff_t *done_index) +static int mpage_map_and_submit_extent(handle_t *handle, +				       struct mpage_da_data *mpd, +				       bool *give_up_on_write)  { -	struct buffer_head	*bh, *head; -	struct inode		*inode = mapping->host; -	struct pagevec		pvec; -	unsigned int		nr_pages; -	sector_t		logical; -	pgoff_t			index, end; -	long			nr_to_write = wbc->nr_to_write; -	int			i, tag, ret = 0; +	struct inode *inode = mpd->inode; +	struct ext4_map_blocks *map = &mpd->map; +	int err; +	loff_t disksize; -	memset(mpd, 0, sizeof(struct mpage_da_data)); -	mpd->wbc = wbc; -	mpd->inode = inode; -	pagevec_init(&pvec, 0); -	index = wbc->range_start >> PAGE_CACHE_SHIFT; -	end = wbc->range_end >> PAGE_CACHE_SHIFT; +	mpd->io_submit.io_end->offset = +				((loff_t)map->m_lblk) << inode->i_blkbits; +	do { +		err = mpage_map_one_extent(handle, mpd); +		if (err < 0) { +			struct super_block *sb = inode->i_sb; -	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) +			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) +				goto invalidate_dirty_pages; +			/* +			 * Let the uper layers retry transient errors. +			 * In the case of ENOSPC, if ext4_count_free_blocks() +			 * is non-zero, a commit should free up blocks. 
+			 */ +			if ((err == -ENOMEM) || +			    (err == -ENOSPC && ext4_count_free_clusters(sb))) +				return err; +			ext4_msg(sb, KERN_CRIT, +				 "Delayed block allocation failed for " +				 "inode %lu at logical offset %llu with" +				 " max blocks %u with error %d", +				 inode->i_ino, +				 (unsigned long long)map->m_lblk, +				 (unsigned)map->m_len, -err); +			ext4_msg(sb, KERN_CRIT, +				 "This should not happen!! Data will " +				 "be lost\n"); +			if (err == -ENOSPC) +				ext4_print_free_blocks(inode); +		invalidate_dirty_pages: +			*give_up_on_write = true; +			return err; +		} +		/* +		 * Update buffer state, submit mapped pages, and get us new +		 * extent to map +		 */ +		err = mpage_map_and_submit_buffers(mpd); +		if (err < 0) +			return err; +	} while (map->m_len); + +	/* Update on-disk size after IO is submitted */ +	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; +	if (disksize > i_size_read(inode)) +		disksize = i_size_read(inode); +	if (disksize > EXT4_I(inode)->i_disksize) { +		int err2; + +		ext4_update_i_disksize(inode, disksize); +		err2 = ext4_mark_inode_dirty(handle, inode); +		if (err2) +			ext4_error(inode->i_sb, +				   "Failed to mark inode %lu dirty", +				   inode->i_ino); +		if (!err) +			err = err2; +	} +	return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. 
+ */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ +	int bpp = ext4_journal_blocks_per_page(inode); + +	return ext4_meta_trans_blocks(inode, +				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * 				 and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. 
+ */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ +	struct address_space *mapping = mpd->inode->i_mapping; +	struct pagevec pvec; +	unsigned int nr_pages; +	pgoff_t index = mpd->first_page; +	pgoff_t end = mpd->last_page; +	int tag; +	int i, err = 0; +	int blkbits = mpd->inode->i_blkbits; +	ext4_lblk_t lblk; +	struct buffer_head *head; + +	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)  		tag = PAGECACHE_TAG_TOWRITE;  	else  		tag = PAGECACHE_TAG_DIRTY; -	*done_index = index; +	pagevec_init(&pvec, 0); +	mpd->map.m_len = 0; +	mpd->next_page = index;  	while (index <= end) {  		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,  			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);  		if (nr_pages == 0) -			return 0; +			goto out;  		for (i = 0; i < nr_pages; i++) {  			struct page *page = pvec.pages[i]; @@ -2318,31 +2289,21 @@ static int write_cache_pages_da(handle_t *handle,  			if (page->index > end)  				goto out; -			*done_index = page->index + 1; - -			/* -			 * If we can't merge this page, and we have -			 * accumulated an contiguous region, write it -			 */ -			if ((mpd->next_page != page->index) && -			    (mpd->next_page != mpd->first_page)) { -				mpage_da_map_and_submit(mpd); -				goto ret_extent_tail; -			} +			/* If we can't merge this page, we are done. 
*/ +			if (mpd->map.m_len > 0 && mpd->next_page != page->index) +				goto out;  			lock_page(page); -  			/* -			 * If the page is no longer dirty, or its -			 * mapping no longer corresponds to inode we -			 * are writing (which means it has been -			 * truncated or invalidated), or the page is -			 * already under writeback and we are not -			 * doing a data integrity writeback, skip the page +			 * If the page is no longer dirty, or its mapping no +			 * longer corresponds to inode we are writing (which +			 * means it has been truncated or invalidated), or the +			 * page is already under writeback and we are not doing +			 * a data integrity writeback, skip the page  			 */  			if (!PageDirty(page) ||  			    (PageWriteback(page) && -			     (wbc->sync_mode == WB_SYNC_NONE)) || +			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||  			    unlikely(page->mapping != mapping)) {  				unlock_page(page);  				continue; @@ -2351,106 +2312,70 @@ static int write_cache_pages_da(handle_t *handle,  			wait_on_page_writeback(page);  			BUG_ON(PageWriteback(page)); -			/* -			 * If we have inline data and arrive here, it means that -			 * we will soon create the block for the 1st page, so -			 * we'd better clear the inline data here. -			 */ -			if (ext4_has_inline_data(inode)) { -				BUG_ON(ext4_test_inode_state(inode, -						EXT4_STATE_MAY_INLINE_DATA)); -				ext4_destroy_inline_data(handle, inode); -			} - -			if (mpd->next_page != page->index) +			if (mpd->map.m_len == 0)  				mpd->first_page = page->index;  			mpd->next_page = page->index + 1; -			logical = (sector_t) page->index << -				(PAGE_CACHE_SHIFT - inode->i_blkbits); -  			/* Add all dirty buffers to mpd */ +			lblk = ((ext4_lblk_t)page->index) << +				(PAGE_CACHE_SHIFT - blkbits);  			head = page_buffers(page); -			bh = head; -			do { -				BUG_ON(buffer_locked(bh)); -				/* -				 * We need to try to allocate unmapped blocks -				 * in the same page.  
Otherwise we won't make -				 * progress with the page in ext4_writepage -				 */ -				if (ext4_bh_delay_or_unwritten(NULL, bh)) { -					mpage_add_bh_to_extent(mpd, logical, -							       bh->b_state); -					if (mpd->io_done) -						goto ret_extent_tail; -				} else if (buffer_dirty(bh) && -					   buffer_mapped(bh)) { -					/* -					 * mapped dirty buffer. We need to -					 * update the b_state because we look -					 * at b_state in mpage_da_map_blocks. -					 * We don't update b_size because if we -					 * find an unmapped buffer_head later -					 * we need to use the b_state flag of -					 * that buffer_head. -					 */ -					if (mpd->b_size == 0) -						mpd->b_state = -							bh->b_state & BH_FLAGS; -				} -				logical++; -			} while ((bh = bh->b_this_page) != head); - -			if (nr_to_write > 0) { -				nr_to_write--; -				if (nr_to_write == 0 && -				    wbc->sync_mode == WB_SYNC_NONE) -					/* -					 * We stop writing back only if we are -					 * not doing integrity sync. In case of -					 * integrity sync we have to keep going -					 * because someone may be concurrently -					 * dirtying pages, and we might have -					 * synced a lot of newly appeared dirty -					 * pages, but have not synced all of the -					 * old dirty pages. -					 */ +			if (!add_page_bufs_to_extent(mpd, head, head, lblk)) +				goto out; +			/* So far everything mapped? Submit the page for IO. */ +			if (mpd->map.m_len == 0) { +				err = mpage_submit_page(mpd, page); +				if (err < 0)  					goto out;  			} + +			/* +			 * Accumulated enough dirty pages? This doesn't apply +			 * to WB_SYNC_ALL mode. For integrity sync we have to +			 * keep going because someone may be concurrently +			 * dirtying pages, and we might have synced a lot of +			 * newly appeared dirty pages, but have not synced all +			 * of the old dirty pages. 
+			 */ +			if (mpd->wbc->sync_mode == WB_SYNC_NONE && +			    mpd->next_page - mpd->first_page >= +							mpd->wbc->nr_to_write) +				goto out;  		}  		pagevec_release(&pvec);  		cond_resched();  	}  	return 0; -ret_extent_tail: -	ret = MPAGE_DA_EXTENT_TAIL;  out:  	pagevec_release(&pvec); -	cond_resched(); -	return ret; +	return err;  } +static int __writepage(struct page *page, struct writeback_control *wbc, +		       void *data) +{ +	struct address_space *mapping = data; +	int ret = ext4_writepage(page, wbc); +	mapping_set_error(mapping, ret); +	return ret; +} -static int ext4_da_writepages(struct address_space *mapping, -			      struct writeback_control *wbc) +static int ext4_writepages(struct address_space *mapping, +			   struct writeback_control *wbc)  { -	pgoff_t	index; +	pgoff_t	writeback_index = 0; +	long nr_to_write = wbc->nr_to_write;  	int range_whole = 0; +	int cycled = 1;  	handle_t *handle = NULL;  	struct mpage_da_data mpd;  	struct inode *inode = mapping->host; -	int pages_written = 0; -	unsigned int max_pages; -	int range_cyclic, cycled = 1, io_done = 0; -	int needed_blocks, ret = 0; -	long desired_nr_to_write, nr_to_writebump = 0; -	loff_t range_start = wbc->range_start; +	int needed_blocks, rsv_blocks = 0, ret = 0;  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); -	pgoff_t done_index = 0; -	pgoff_t end; +	bool done;  	struct blk_plug plug; +	bool give_up_on_write = false; -	trace_ext4_da_writepages(inode, wbc); +	trace_ext4_writepages(inode, wbc);  	/*  	 * No pages to write? 
This is mainly a kludge to avoid starting @@ -2460,164 +2385,165 @@ static int ext4_da_writepages(struct address_space *mapping,  	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))  		return 0; +	if (ext4_should_journal_data(inode)) { +		struct blk_plug plug; +		int ret; + +		blk_start_plug(&plug); +		ret = write_cache_pages(mapping, wbc, __writepage, mapping); +		blk_finish_plug(&plug); +		return ret; +	} +  	/*  	 * If the filesystem has aborted, it is read-only, so return  	 * right away instead of dumping stack traces later on that  	 * will obscure the real source of the problem.  We test  	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because  	 * the latter could be true if the filesystem is mounted -	 * read-only, and in that case, ext4_da_writepages should +	 * read-only, and in that case, ext4_writepages should  	 * *never* be called, so if that ever happens, we would want  	 * the stack trace.  	 */  	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))  		return -EROFS; +	if (ext4_should_dioread_nolock(inode)) { +		/* +		 * We may need to convert upto one extent per block in +		 * the page and we may dirty the inode. +		 */ +		rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); +	} + +	/* +	 * If we have inline data and arrive here, it means that +	 * we will soon create the block for the 1st page, so +	 * we'd better clear the inline data here. +	 */ +	if (ext4_has_inline_data(inode)) { +		/* Just inode will be modified... 
*/ +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			goto out_writepages; +		} +		BUG_ON(ext4_test_inode_state(inode, +				EXT4_STATE_MAY_INLINE_DATA)); +		ext4_destroy_inline_data(handle, inode); +		ext4_journal_stop(handle); +	} +  	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)  		range_whole = 1; -	range_cyclic = wbc->range_cyclic;  	if (wbc->range_cyclic) { -		index = mapping->writeback_index; -		if (index) +		writeback_index = mapping->writeback_index; +		if (writeback_index)  			cycled = 0; -		wbc->range_start = index << PAGE_CACHE_SHIFT; -		wbc->range_end  = LLONG_MAX; -		wbc->range_cyclic = 0; -		end = -1; +		mpd.first_page = writeback_index; +		mpd.last_page = -1;  	} else { -		index = wbc->range_start >> PAGE_CACHE_SHIFT; -		end = wbc->range_end >> PAGE_CACHE_SHIFT; -	} - -	/* -	 * This works around two forms of stupidity.  The first is in -	 * the writeback code, which caps the maximum number of pages -	 * written to be 1024 pages.  This is wrong on multiple -	 * levels; different architectues have a different page size, -	 * which changes the maximum amount of data which gets -	 * written.  Secondly, 4 megabytes is way too small.  XFS -	 * forces this value to be 16 megabytes by multiplying -	 * nr_to_write parameter by four, and then relies on its -	 * allocator to allocate larger extents to make them -	 * contiguous.  Unfortunately this brings us to the second -	 * stupidity, which is that ext4's mballoc code only allocates -	 * at most 2048 blocks.  So we force contiguous writes up to -	 * the number of dirty blocks in the inode, or -	 * sbi->max_writeback_mb_bump whichever is smaller. 
-	 */ -	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); -	if (!range_cyclic && range_whole) { -		if (wbc->nr_to_write == LONG_MAX) -			desired_nr_to_write = wbc->nr_to_write; -		else -			desired_nr_to_write = wbc->nr_to_write * 8; -	} else -		desired_nr_to_write = ext4_num_dirty_pages(inode, index, -							   max_pages); -	if (desired_nr_to_write > max_pages) -		desired_nr_to_write = max_pages; - -	if (wbc->nr_to_write < desired_nr_to_write) { -		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; -		wbc->nr_to_write = desired_nr_to_write; +		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; +		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;  	} +	mpd.inode = inode; +	mpd.wbc = wbc; +	ext4_io_submit_init(&mpd.io_submit, wbc);  retry:  	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) -		tag_pages_for_writeback(mapping, index, end); - +		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); +	done = false;  	blk_start_plug(&plug); -	while (!ret && wbc->nr_to_write > 0) { +	while (!done && mpd.first_page <= mpd.last_page) { +		/* For each extent of pages we use new io_end */ +		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); +		if (!mpd.io_submit.io_end) { +			ret = -ENOMEM; +			break; +		}  		/* -		 * we  insert one extent at a time. So we need -		 * credit needed for single extent allocation. -		 * journalled mode is currently not supported -		 * by delalloc +		 * We have two constraints: We find one extent to map and we +		 * must always write out whole page (makes a difference when +		 * blocksize < pagesize) so that we don't block on IO when we +		 * try to write out the rest of the page. Journalled mode is +		 * not supported by delalloc.  		 
*/  		BUG_ON(ext4_should_journal_data(inode));  		needed_blocks = ext4_da_writepages_trans_blocks(inode); -		/* start a new transaction*/ -		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, -					    needed_blocks); +		/* start a new transaction */ +		handle = ext4_journal_start_with_reserve(inode, +				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);  		if (IS_ERR(handle)) {  			ret = PTR_ERR(handle);  			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "  			       "%ld pages, ino %lu; err %d", __func__,  				wbc->nr_to_write, inode->i_ino, ret); -			blk_finish_plug(&plug); -			goto out_writepages; +			/* Release allocated io_end */ +			ext4_put_io_end(mpd.io_submit.io_end); +			break;  		} -		/* -		 * Now call write_cache_pages_da() to find the next -		 * contiguous region of logical blocks that need -		 * blocks to be allocated by ext4 and submit them. -		 */ -		ret = write_cache_pages_da(handle, mapping, -					   wbc, &mpd, &done_index); -		/* -		 * If we have a contiguous extent of pages and we -		 * haven't done the I/O yet, map the blocks and submit -		 * them for I/O. -		 */ -		if (!mpd.io_done && mpd.next_page != mpd.first_page) { -			mpage_da_map_and_submit(&mpd); -			ret = MPAGE_DA_EXTENT_TAIL; +		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); +		ret = mpage_prepare_extent_to_map(&mpd); +		if (!ret) { +			if (mpd.map.m_len) +				ret = mpage_map_and_submit_extent(handle, &mpd, +					&give_up_on_write); +			else { +				/* +				 * We scanned the whole range (or exhausted +				 * nr_to_write), submitted what was mapped and +				 * didn't find anything needing mapping. We are +				 * done. 
+				 */ +				done = true; +			}  		} -		trace_ext4_da_write_pages(inode, &mpd); -		wbc->nr_to_write -= mpd.pages_written; -  		ext4_journal_stop(handle); +		/* Submit prepared bio */ +		ext4_io_submit(&mpd.io_submit); +		/* Unlock pages we didn't use */ +		mpage_release_unused_pages(&mpd, give_up_on_write); +		/* Drop our io_end reference we got from init */ +		ext4_put_io_end(mpd.io_submit.io_end); -		if ((mpd.retval == -ENOSPC) && sbi->s_journal) { -			/* commit the transaction which would +		if (ret == -ENOSPC && sbi->s_journal) { +			/* +			 * Commit the transaction which would  			 * free blocks released in the transaction  			 * and try again  			 */  			jbd2_journal_force_commit_nested(sbi->s_journal);  			ret = 0; -		} else if (ret == MPAGE_DA_EXTENT_TAIL) { -			/* -			 * Got one extent now try with rest of the pages. -			 * If mpd.retval is set -EIO, journal is aborted. -			 * So we don't need to write any more. -			 */ -			pages_written += mpd.pages_written; -			ret = mpd.retval; -			io_done = 1; -		} else if (wbc->nr_to_write) -			/* -			 * There is no more writeout needed -			 * or we requested for a noblocking writeout -			 * and we found the device congested -			 */ +			continue; +		} +		/* Fatal error - ENOMEM, EIO... 
*/ +		if (ret)  			break;  	}  	blk_finish_plug(&plug); -	if (!io_done && !cycled) { +	if (!ret && !cycled) {  		cycled = 1; -		index = 0; -		wbc->range_start = index << PAGE_CACHE_SHIFT; -		wbc->range_end  = mapping->writeback_index - 1; +		mpd.last_page = writeback_index - 1; +		mpd.first_page = 0;  		goto retry;  	}  	/* Update index */ -	wbc->range_cyclic = range_cyclic;  	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))  		/* -		 * set the writeback_index so that range_cyclic +		 * Set the writeback_index so that range_cyclic  		 * mode will write it back later  		 */ -		mapping->writeback_index = done_index; +		mapping->writeback_index = mpd.first_page;  out_writepages: -	wbc->nr_to_write -= nr_to_writebump; -	wbc->range_start = range_start; -	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); +	trace_ext4_writepages_result(inode, wbc, ret, +				     nr_to_write - wbc->nr_to_write);  	return ret;  } @@ -2829,7 +2755,8 @@ static int ext4_da_write_end(struct file *file,  	return ret ? ret : copied;  } -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, +				   unsigned int length)  {  	/*  	 * Drop reserved blocks @@ -2838,10 +2765,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)  	if (!page_has_buffers(page))  		goto out; -	ext4_da_page_release_reservation(page, offset); +	ext4_da_page_release_reservation(page, offset, length);  out: -	ext4_invalidatepage(page, offset); +	ext4_invalidatepage(page, offset, length);  	return;  } @@ -2864,7 +2791,7 @@ int ext4_alloc_da_blocks(struct inode *inode)  	 * laptop_mode, not even desirable).  
However, to do otherwise  	 * would require replicating code paths in:  	 * -	 * ext4_da_writepages() -> +	 * ext4_writepages() ->  	 *    write_cache_pages() ---> (via passed in callback function)  	 *        __mpage_da_writepage() -->  	 *           mpage_add_bh_to_extent() @@ -2989,37 +2916,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,  	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);  } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned int offset, +				unsigned int length)  { -	trace_ext4_invalidatepage(page, offset); +	trace_ext4_invalidatepage(page, offset, length);  	/* No journalling happens on data buffers when this function is used */  	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); -	block_invalidatepage(page, offset); +	block_invalidatepage(page, offset, length);  }  static int __ext4_journalled_invalidatepage(struct page *page, -					    unsigned long offset) +					    unsigned int offset, +					    unsigned int length)  {  	journal_t *journal = EXT4_JOURNAL(page->mapping->host); -	trace_ext4_journalled_invalidatepage(page, offset); +	trace_ext4_journalled_invalidatepage(page, offset, length);  	/*  	 * If it's a full truncate we just forget about the pending dirtying  	 */ -	if (offset == 0) +	if (offset == 0 && length == PAGE_CACHE_SIZE)  		ClearPageChecked(page); -	return jbd2_journal_invalidatepage(journal, page, offset); +	return jbd2_journal_invalidatepage(journal, page, offset, length);  }  /* Wrapper for aops... 
*/  static void ext4_journalled_invalidatepage(struct page *page, -					   unsigned long offset) +					   unsigned int offset, +					   unsigned int length)  { -	WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); +	WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);  }  static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3067,9 +2997,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  	struct inode *inode = file_inode(iocb->ki_filp);          ext4_io_end_t *io_end = iocb->private; -	/* if not async direct IO or dio with 0 bytes write, just return */ -	if (!io_end || !size) -		goto out; +	/* if not async direct IO just return */ +	if (!io_end) { +		inode_dio_done(inode); +		if (is_async) +			aio_complete(iocb, ret, 0); +		return; +	}  	ext_debug("ext4_end_io_dio(): io_end 0x%p "  		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3077,25 +3011,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  		  size);  	iocb->private = NULL; - -	/* if not aio dio with unwritten extents, just free io and return */ -	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { -		ext4_free_io_end(io_end); -out: -		inode_dio_done(inode); -		if (is_async) -			aio_complete(iocb, ret, 0); -		return; -	} -  	io_end->offset = offset;  	io_end->size = size;  	if (is_async) {  		io_end->iocb = iocb;  		io_end->result = ret;  	} - -	ext4_add_complete_io(io_end); +	ext4_put_io_end_defer(io_end);  }  /* @@ -3129,6 +3051,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	get_block_t *get_block_func = NULL;  	int dio_flags = 0;  	loff_t final_size = offset + count; +	ext4_io_end_t *io_end = NULL;  	/* Use the old path for reads and writes beyond i_size. */  	if (rw != WRITE || final_size > inode->i_size) @@ -3136,11 +3059,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	BUG_ON(iocb->private == NULL); +	/* +	 * Make all waiters for direct IO properly wait also for extent +	 * conversion. 
This also disallows race between truncate() and +	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. +	 */ +	if (rw == WRITE) +		atomic_inc(&inode->i_dio_count); +  	/* If we do a overwrite dio, i_mutex locking can be released */  	overwrite = *((int *)iocb->private);  	if (overwrite) { -		atomic_inc(&inode->i_dio_count);  		down_read(&EXT4_I(inode)->i_data_sem);  		mutex_unlock(&inode->i_mutex);  	} @@ -3167,13 +3097,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	iocb->private = NULL;  	ext4_inode_aio_set(inode, NULL);  	if (!is_sync_kiocb(iocb)) { -		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); +		io_end = ext4_init_io_end(inode, GFP_NOFS);  		if (!io_end) {  			ret = -ENOMEM;  			goto retake_lock;  		}  		io_end->flag |= EXT4_IO_END_DIRECT; -		iocb->private = io_end; +		/* +		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio() +		 */ +		iocb->private = ext4_get_io_end(io_end);  		/*  		 * we save the io structure for current async direct  		 * IO, so that later ext4_map_blocks() could flag the @@ -3197,33 +3130,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  				   NULL,  				   dio_flags); -	if (iocb->private) -		ext4_inode_aio_set(inode, NULL);  	/* -	 * The io_end structure takes a reference to the inode, that -	 * structure needs to be destroyed and the reference to the -	 * inode need to be dropped, when IO is complete, even with 0 -	 * byte write, or failed. -	 * -	 * In the successful AIO DIO case, the io_end structure will -	 * be destroyed and the reference to the inode will be dropped -	 * after the end_io call back function is called. -	 * -	 * In the case there is 0 byte write, or error case, since VFS -	 * direct IO won't invoke the end_io call back function, we -	 * need to free the end_io structure here. +	 * Put our reference to io_end. This can free the io_end structure e.g. +	 * in sync IO case or in case of error. 
It can even perform extent +	 * conversion if all bios we submitted finished before we got here. +	 * Note that in that case iocb->private can be already set to NULL +	 * here.  	 */ -	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { -		ext4_free_io_end(iocb->private); -		iocb->private = NULL; -	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, +	if (io_end) { +		ext4_inode_aio_set(inode, NULL); +		ext4_put_io_end(io_end); +		/* +		 * When no IO was submitted ext4_end_io_dio() was not +		 * called so we have to put iocb's reference. +		 */ +		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { +			WARN_ON(iocb->private != io_end); +			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); +			WARN_ON(io_end->iocb); +			/* +			 * Generic code already did inode_dio_done() so we +			 * have to clear EXT4_IO_END_DIRECT to not do it for +			 * the second time. +			 */ +			io_end->flag = 0; +			ext4_put_io_end(io_end); +			iocb->private = NULL; +		} +	} +	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,  						EXT4_STATE_DIO_UNWRITTEN)) {  		int err;  		/*  		 * for non AIO case, since the IO is already  		 * completed, we could do the conversion right here  		 */ -		err = ext4_convert_unwritten_extents(inode, +		err = ext4_convert_unwritten_extents(NULL, inode,  						     offset, ret);  		if (err < 0)  			ret = err; @@ -3231,9 +3173,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	}  retake_lock: +	if (rw == WRITE) +		inode_dio_done(inode);  	/* take i_mutex locking again if we do a ovewrite dio */  	if (overwrite) { -		inode_dio_done(inode);  		up_read(&EXT4_I(inode)->i_data_sem);  		mutex_lock(&inode->i_mutex);  	} @@ -3292,6 +3235,7 @@ static const struct address_space_operations ext4_aops = {  	.readpage		= ext4_readpage,  	.readpages		= ext4_readpages,  	.writepage		= ext4_writepage, +	.writepages		= ext4_writepages,  	.write_begin		= ext4_write_begin,  	.write_end		= ext4_write_end,  	.bmap			= ext4_bmap, @@ -3307,6 
+3251,7 @@ static const struct address_space_operations ext4_journalled_aops = {  	.readpage		= ext4_readpage,  	.readpages		= ext4_readpages,  	.writepage		= ext4_writepage, +	.writepages		= ext4_writepages,  	.write_begin		= ext4_write_begin,  	.write_end		= ext4_journalled_write_end,  	.set_page_dirty		= ext4_journalled_set_page_dirty, @@ -3322,7 +3267,7 @@ static const struct address_space_operations ext4_da_aops = {  	.readpage		= ext4_readpage,  	.readpages		= ext4_readpages,  	.writepage		= ext4_writepage, -	.writepages		= ext4_da_writepages, +	.writepages		= ext4_writepages,  	.write_begin		= ext4_da_write_begin,  	.write_end		= ext4_da_write_end,  	.bmap			= ext4_bmap, @@ -3355,89 +3300,56 @@ void ext4_set_aops(struct inode *inode)  		inode->i_mapping->a_ops = &ext4_aops;  } -  /* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown.   
*/ -int ext4_discard_partial_page_buffers(handle_t *handle, -		struct address_space *mapping, loff_t from, -		loff_t length, int flags) +int ext4_block_truncate_page(handle_t *handle, +		struct address_space *mapping, loff_t from)  { +	unsigned offset = from & (PAGE_CACHE_SIZE-1); +	unsigned length; +	unsigned blocksize;  	struct inode *inode = mapping->host; -	struct page *page; -	int err = 0; -	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, -				   mapping_gfp_mask(mapping) & ~__GFP_FS); -	if (!page) -		return -ENOMEM; - -	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, -		from, length, flags); +	blocksize = inode->i_sb->s_blocksize; +	length = blocksize - (offset & (blocksize - 1)); -	unlock_page(page); -	page_cache_release(page); -	return err; +	return ext4_block_zero_page_range(handle, mapping, from, length);  }  /* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped.  Unblock aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been  locked.  The - * The range to be discarded must be contained with in the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'.  This function is - * appropriate for updating a page and it buffer heads to be unmapped and - * zeroed for blocks that have been either released, or are going to be - * released. 
- * - * handle: The journal handle - * inode:  The files inode - * page:   A locked page that contains the offset "from" - * from:   The starting byte offset (from the beginning of the file) - *         to begin discarding - * len:    The length of bytes to discard - * flags:  Optional flags that may be used: - * - *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - *         Only zero the regions of the page whose buffer heads - *         have already been unmapped.  This flag is appropriate - *         for updating the contents of a page whose blocks may - *         have already been released, and we only want to zero - *         out the regions that correspond to those released blocks. - * - * Returns zero on success or negative on failure. + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'.  The range to be zero'd must + * be contained with in one block.  If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from'   */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, -		struct inode *inode, struct page *page, loff_t from, -		loff_t length, int flags) +int ext4_block_zero_page_range(handle_t *handle, +		struct address_space *mapping, loff_t from, loff_t length)  {  	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; -	unsigned int offset = from & (PAGE_CACHE_SIZE-1); -	unsigned int blocksize, max, pos; +	unsigned offset = from & (PAGE_CACHE_SIZE-1); +	unsigned blocksize, max, pos;  	ext4_lblk_t iblock; +	struct inode *inode = mapping->host;  	struct buffer_head *bh; +	struct page *page;  	int err = 0; -	blocksize = inode->i_sb->s_blocksize; -	max = PAGE_CACHE_SIZE - offset; +	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, +				   mapping_gfp_mask(mapping) & ~__GFP_FS); +	if (!page) +		return -ENOMEM; -	if (index != page->index) -		return -EINVAL; +	blocksize = inode->i_sb->s_blocksize; +	max = blocksize - 
(offset & (blocksize - 1));  	/*  	 * correct length if it does not fall between -	 * 'from' and the end of the page +	 * 'from' and the end of the block  	 */  	if (length > max || length < 0)  		length = max; @@ -3455,106 +3367,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,  		iblock++;  		pos += blocksize;  	} - -	pos = offset; -	while (pos < offset + length) { -		unsigned int end_of_block, range_to_discard; - -		err = 0; - -		/* The length of space left to zero and unmap */ -		range_to_discard = offset + length - pos; - -		/* The length of space until the end of the block */ -		end_of_block = blocksize - (pos & (blocksize-1)); - -		/* -		 * Do not unmap or zero past end of block -		 * for this buffer head -		 */ -		if (range_to_discard > end_of_block) -			range_to_discard = end_of_block; - - -		/* -		 * Skip this buffer head if we are only zeroing unampped -		 * regions of the page -		 */ -		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && -			buffer_mapped(bh)) -				goto next; - -		/* If the range is block aligned, unmap */ -		if (range_to_discard == blocksize) { -			clear_buffer_dirty(bh); -			bh->b_bdev = NULL; -			clear_buffer_mapped(bh); -			clear_buffer_req(bh); -			clear_buffer_new(bh); -			clear_buffer_delay(bh); -			clear_buffer_unwritten(bh); -			clear_buffer_uptodate(bh); -			zero_user(page, pos, range_to_discard); -			BUFFER_TRACE(bh, "Buffer discarded"); -			goto next; -		} - -		/* -		 * If this block is not completely contained in the range -		 * to be discarded, then it is not going to be released. Because -		 * we need to keep this block, we need to make sure this part -		 * of the page is uptodate before we modify it by writeing -		 * partial zeros on it. -		 */ +	if (buffer_freed(bh)) { +		BUFFER_TRACE(bh, "freed: skip"); +		goto unlock; +	} +	if (!buffer_mapped(bh)) { +		BUFFER_TRACE(bh, "unmapped"); +		ext4_get_block(inode, iblock, bh, 0); +		/* unmapped? 
It's a hole - nothing to do */  		if (!buffer_mapped(bh)) { -			/* -			 * Buffer head must be mapped before we can read -			 * from the block -			 */ -			BUFFER_TRACE(bh, "unmapped"); -			ext4_get_block(inode, iblock, bh, 0); -			/* unmapped? It's a hole - nothing to do */ -			if (!buffer_mapped(bh)) { -				BUFFER_TRACE(bh, "still unmapped"); -				goto next; -			} +			BUFFER_TRACE(bh, "still unmapped"); +			goto unlock;  		} +	} -		/* Ok, it's mapped. Make sure it's up-to-date */ -		if (PageUptodate(page)) -			set_buffer_uptodate(bh); +	/* Ok, it's mapped. Make sure it's up-to-date */ +	if (PageUptodate(page)) +		set_buffer_uptodate(bh); -		if (!buffer_uptodate(bh)) { -			err = -EIO; -			ll_rw_block(READ, 1, &bh); -			wait_on_buffer(bh); -			/* Uhhuh. Read error. Complain and punt.*/ -			if (!buffer_uptodate(bh)) -				goto next; -		} +	if (!buffer_uptodate(bh)) { +		err = -EIO; +		ll_rw_block(READ, 1, &bh); +		wait_on_buffer(bh); +		/* Uhhuh. Read error. Complain and punt. */ +		if (!buffer_uptodate(bh)) +			goto unlock; +	} +	if (ext4_should_journal_data(inode)) { +		BUFFER_TRACE(bh, "get write access"); +		err = ext4_journal_get_write_access(handle, bh); +		if (err) +			goto unlock; +	} +	zero_user(page, offset, length); +	BUFFER_TRACE(bh, "zeroed end of block"); -		if (ext4_should_journal_data(inode)) { -			BUFFER_TRACE(bh, "get write access"); -			err = ext4_journal_get_write_access(handle, bh); -			if (err) -				goto next; -		} +	if (ext4_should_journal_data(inode)) { +		err = ext4_handle_dirty_metadata(handle, inode, bh); +	} else { +		err = 0; +		mark_buffer_dirty(bh); +		if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) +			err = ext4_jbd2_file_inode(handle, inode); +	} -		zero_user(page, pos, range_to_discard); +unlock: +	unlock_page(page); +	page_cache_release(page); +	return err; +} -		err = 0; -		if (ext4_should_journal_data(inode)) { -			err = ext4_handle_dirty_metadata(handle, inode, bh); -		} else -			mark_buffer_dirty(bh); +int 
ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, +			     loff_t lstart, loff_t length) +{ +	struct super_block *sb = inode->i_sb; +	struct address_space *mapping = inode->i_mapping; +	unsigned partial_start, partial_end; +	ext4_fsblk_t start, end; +	loff_t byte_end = (lstart + length - 1); +	int err = 0; -		BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: -		bh = bh->b_this_page; -		iblock++; -		pos += range_to_discard; -	} +	partial_start = lstart & (sb->s_blocksize - 1); +	partial_end = byte_end & (sb->s_blocksize - 1); + +	start = lstart >> sb->s_blocksize_bits; +	end = byte_end >> sb->s_blocksize_bits; +	/* Handle partial zero within the single block */ +	if (start == end && +	    (partial_start || (partial_end != sb->s_blocksize - 1))) { +		err = ext4_block_zero_page_range(handle, mapping, +						 lstart, length); +		return err; +	} +	/* Handle partial zero out on the start of the range */ +	if (partial_start) { +		err = ext4_block_zero_page_range(handle, mapping, +						 lstart, sb->s_blocksize); +		if (err) +			return err; +	} +	/* Handle partial zero out on the end of the range */ +	if (partial_end != sb->s_blocksize - 1) +		err = ext4_block_zero_page_range(handle, mapping, +						 byte_end - partial_end, +						 partial_end + 1);  	return err;  } @@ -3580,14 +3477,12 @@ int ext4_can_truncate(struct inode *inode)   * Returns: 0 on success or negative on failure   */ -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  { -	struct inode *inode = file_inode(file);  	struct super_block *sb = inode->i_sb;  	ext4_lblk_t first_block, stop_block;  	struct address_space *mapping = inode->i_mapping; -	loff_t first_page, last_page, page_len; -	loff_t first_page_offset, last_page_offset; +	loff_t first_block_offset, last_block_offset;  	handle_t *handle;  	unsigned int credits;  	int ret = 0; @@ -3638,23 +3533,16 @@ int ext4_punch_hole(struct file *file, loff_t 
offset, loff_t length)  		   offset;  	} -	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -	last_page = (offset + length) >> PAGE_CACHE_SHIFT; - -	first_page_offset = first_page << PAGE_CACHE_SHIFT; -	last_page_offset = last_page << PAGE_CACHE_SHIFT; +	first_block_offset = round_up(offset, sb->s_blocksize); +	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; -	/* Now release the pages */ -	if (last_page_offset > first_page_offset) { -		truncate_pagecache_range(inode, first_page_offset, -					 last_page_offset - 1); -	} +	/* Now release the pages and zero block aligned part of pages*/ +	if (last_block_offset > first_block_offset) +		truncate_pagecache_range(inode, first_block_offset, +					 last_block_offset);  	/* Wait all existing dio workers, newcomers will block on i_mutex */  	ext4_inode_block_unlocked_dio(inode); -	ret = ext4_flush_unwritten_io(inode); -	if (ret) -		goto out_dio;  	inode_dio_wait(inode);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3668,66 +3556,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)  		goto out_dio;  	} -	/* -	 * Now we need to zero out the non-page-aligned data in the -	 * pages at the start and tail of the hole, and unmap the -	 * buffer heads for the block aligned regions of the page that -	 * were completely zeroed. 
-	 */ -	if (first_page > last_page) { -		/* -		 * If the file space being truncated is contained -		 * within a page just zero out and unmap the middle of -		 * that page -		 */ -		ret = ext4_discard_partial_page_buffers(handle, -			mapping, offset, length, 0); - -		if (ret) -			goto out_stop; -	} else { -		/* -		 * zero out and unmap the partial page that contains -		 * the start of the hole -		 */ -		page_len = first_page_offset - offset; -		if (page_len > 0) { -			ret = ext4_discard_partial_page_buffers(handle, mapping, -						offset, page_len, 0); -			if (ret) -				goto out_stop; -		} - -		/* -		 * zero out and unmap the partial page that contains -		 * the end of the hole -		 */ -		page_len = offset + length - last_page_offset; -		if (page_len > 0) { -			ret = ext4_discard_partial_page_buffers(handle, mapping, -					last_page_offset, page_len, 0); -			if (ret) -				goto out_stop; -		} -	} - -	/* -	 * If i_size is contained in the last page, we need to -	 * unmap and zero the partial page after i_size -	 */ -	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && -	   inode->i_size % PAGE_CACHE_SIZE != 0) { -		page_len = PAGE_CACHE_SIZE - -			(inode->i_size & (PAGE_CACHE_SIZE - 1)); - -		if (page_len > 0) { -			ret = ext4_discard_partial_page_buffers(handle, -					mapping, inode->i_size, page_len, 0); - -			if (ret) -				goto out_stop; -		} -	} +	ret = ext4_zero_partial_blocks(handle, inode, offset, +				       length); +	if (ret) +		goto out_stop;  	first_block = (offset + sb->s_blocksize - 1) >>  		EXT4_BLOCK_SIZE_BITS(sb); @@ -3803,7 +3635,6 @@ void ext4_truncate(struct inode *inode)  	unsigned int credits;  	handle_t *handle;  	struct address_space *mapping = inode->i_mapping; -	loff_t page_len;  	/*  	 * There is a possibility that we're either freeing the inode @@ -3830,12 +3661,6 @@ void ext4_truncate(struct inode *inode)  			return;  	} -	/* -	 * finish any pending end_io work so we won't run the risk of -	 * converting any truncated blocks to 
initialized later -	 */ -	ext4_flush_unwritten_io(inode); -  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))  		credits = ext4_writepage_trans_blocks(inode);  	else @@ -3847,14 +3672,8 @@ void ext4_truncate(struct inode *inode)  		return;  	} -	if (inode->i_size % PAGE_CACHE_SIZE != 0) { -		page_len = PAGE_CACHE_SIZE - -			(inode->i_size & (PAGE_CACHE_SIZE - 1)); - -		if (ext4_discard_partial_page_buffers(handle, -				mapping, inode->i_size, page_len, 0)) -			goto out_stop; -	} +	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) +		ext4_block_truncate_page(handle, mapping, inode->i_size);  	/*  	 * We add the inode to the orphan list, so that if this @@ -4623,7 +4442,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)  				      inode->i_size >> PAGE_CACHE_SHIFT);  		if (!page)  			return; -		ret = __ext4_journalled_invalidatepage(page, offset); +		ret = __ext4_journalled_invalidatepage(page, offset, +						PAGE_CACHE_SIZE - offset);  		unlock_page(page);  		page_cache_release(page);  		if (ret != -EBUSY) @@ -4805,7 +4625,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,  		 struct kstat *stat)  {  	struct inode *inode; -	unsigned long delalloc_blocks; +	unsigned long long delalloc_blocks;  	inode = dentry->d_inode;  	generic_fillattr(inode, stat); @@ -4823,15 +4643,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,  	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),  				EXT4_I(inode)->i_reserved_data_blocks); -	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; +	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);  	return 0;  } -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, +				   int pextents)  {  	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -		return ext4_ind_trans_blocks(inode, nrblocks, chunk); -	return ext4_ext_index_trans_blocks(inode, nrblocks, 
chunk); +		return ext4_ind_trans_blocks(inode, lblocks); +	return ext4_ext_index_trans_blocks(inode, pextents);  }  /* @@ -4845,7 +4666,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)   *   * Also account for superblock, inode, quota and xattr blocks   */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +				  int pextents)  {  	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);  	int gdpblocks; @@ -4853,14 +4675,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)  	int ret = 0;  	/* -	 * How many index blocks need to touch to modify nrblocks? -	 * The "Chunk" flag indicating whether the nrblocks is -	 * physically contiguous on disk -	 * -	 * For Direct IO and fallocate, they calls get_block to allocate -	 * one single extent at a time, so they could set the "Chunk" flag +	 * How many index blocks need to touch to map @lblocks logical blocks +	 * to @pextents physical extents?  	 
*/ -	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); +	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);  	ret = idxblocks; @@ -4868,12 +4686,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)  	 * Now let's see how many group bitmaps and group descriptors need  	 * to account  	 */ -	groups = idxblocks; -	if (chunk) -		groups += 1; -	else -		groups += nrblocks; - +	groups = idxblocks + pextents;  	gdpblocks = groups;  	if (groups > ngroups)  		groups = ngroups; @@ -4904,7 +4717,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)  	int bpp = ext4_journal_blocks_per_page(inode);  	int ret; -	ret = ext4_meta_trans_blocks(inode, bpp, 0); +	ret = ext4_meta_trans_blocks(inode, bpp, bpp);  	/* Account for data blocks for journalled mode */  	if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9491ac0590f7..c0427e2f6648 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -77,8 +77,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)  	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));  	memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));  	memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); -	memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); -	memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); +	ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); +	ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); +	ext4_es_lru_del(inode1); +	ext4_es_lru_del(inode2);  	isize = i_size_read(inode1);  	i_size_write(inode1, i_size_read(inode2)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index def84082a9a9..4bbbf13bd743 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2105,6 +2105,7 @@ repeat:  		group = ac->ac_g_ex.fe_group;  		for (i = 0; i < ngroups; group++, i++) { +			cond_resched();  			/*  			 * Artificially restricted ngroups for non-extent  			 * files makes group > 
ngroups possible on first loop. @@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,  repeat:  		/* allocate space in core */  		*errp = ext4_mb_regular_allocator(ac); -		if (*errp) { -			ext4_discard_allocated_blocks(ac); -			goto errout; -		} +		if (*errp) +			goto discard_and_exit;  		/* as we've just preallocated more space than -		 * user requested orinally, we store allocated +		 * user requested originally, we store allocated  		 * space in a special descriptor */  		if (ac->ac_status == AC_STATUS_FOUND && -				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) -			ext4_mb_new_preallocation(ac); +		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) +			*errp = ext4_mb_new_preallocation(ac); +		if (*errp) { +		discard_and_exit: +			ext4_discard_allocated_blocks(ac); +			goto errout; +		}  	}  	if (likely(ac->ac_status == AC_STATUS_FOUND)) {  		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,  		BUG_ON(bh && (count > 1));  		for (i = 0; i < count; i++) { +			cond_resched();  			if (!bh)  				tbh = sb_find_get_block(inode->i_sb,  							block + i); -			if (unlikely(!tbh)) +			if (!tbh)  				continue;  			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,  				    inode, tbh, block + i); @@ -4735,11 +4740,16 @@ do_more:  		 * blocks being freed are metadata. these blocks shouldn't  		 * be used until this transaction is committed  		 */ +	retry:  		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);  		if (!new_entry) { -			ext4_mb_unload_buddy(&e4b); -			err = -ENOMEM; -			goto error_return; +			/* +			 * We use a retry loop because +			 * ext4_free_blocks() is not allowed to fail. 
+			 */ +			cond_resched(); +			congestion_wait(BLK_RW_ASYNC, HZ/50); +			goto retry;  		}  		new_entry->efd_start_cluster = bit;  		new_entry->efd_group = block_group; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3dcbf364022f..e86dddbd8296 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,  	struct page *pagep[2] = {NULL, NULL};  	handle_t *handle;  	ext4_lblk_t orig_blk_offset; -	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;  	unsigned long blocksize = orig_inode->i_sb->s_blocksize;  	unsigned int w_flags = 0;  	unsigned int tmp_data_size, data_size, replaced_size; @@ -940,8 +939,6 @@ again:  	orig_blk_offset = orig_page_offset * blocks_per_page +  		data_offset_in_page; -	offs = (long long)orig_blk_offset << orig_inode->i_blkbits; -  	/* Calculate data_size */  	if ((orig_blk_offset + block_len_in_page - 1) ==  	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6653fc35ecb7..35f55a0dbc4b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,  				bh->b_data, bh->b_size,  				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))  					 + ((char *)de - bh->b_data))) { -			/* On error, skip the f_pos to the next block. 
*/ -			dir_file->f_pos = (dir_file->f_pos | -					(dir->i_sb->s_blocksize - 1)) + 1; -			brelse(bh); -			return count; +			/* silently ignore the rest of the block */ +			break;  		}  		ext4fs_dirhash(de->name, de->name_len, hinfo);  		if ((hinfo->hash < start_hash) || @@ -2299,6 +2296,45 @@ retry:  	return err;  } +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ +	handle_t *handle; +	struct inode *inode; +	int err, retries = 0; + +	dquot_initialize(dir); + +retry: +	inode = ext4_new_inode_start_handle(dir, mode, +					    NULL, 0, NULL, +					    EXT4_HT_DIR, +			EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + +			  4 + EXT4_XATTR_TRANS_BLOCKS); +	handle = ext4_journal_current_handle(); +	err = PTR_ERR(inode); +	if (!IS_ERR(inode)) { +		inode->i_op = &ext4_file_inode_operations; +		inode->i_fop = &ext4_file_operations; +		ext4_set_aops(inode); +		d_tmpfile(dentry, inode); +		err = ext4_orphan_add(handle, inode); +		if (err) +			goto err_drop_inode; +		mark_inode_dirty(inode); +		unlock_new_inode(inode); +	} +	if (handle) +		ext4_journal_stop(handle); +	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) +		goto retry; +	return err; +err_drop_inode: +	ext4_journal_stop(handle); +	unlock_new_inode(inode); +	iput(inode); +	return err; +} +  struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,  			  struct ext4_dir_entry_2 *de,  			  int blocksize, int csum_size, @@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,  retry:  	handle = ext4_journal_start(dir, EXT4_HT_DIR,  		(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -		 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); +		 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);  	if (IS_ERR(handle))  		return PTR_ERR(handle); @@ -2920,6 +2956,11 @@ retry:  	err = ext4_add_entry(handle, dentry, inode);  	if (!err) {  		ext4_mark_inode_dirty(handle, inode); +		/* this can happen only for tmpfile being +		 * linked the first time +		 */ +		if (inode->i_nlink == 1) +			
ext4_orphan_del(handle, inode);  		d_instantiate(dentry, inode);  	} else {  		drop_nlink(inode); @@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {  	.mkdir		= ext4_mkdir,  	.rmdir		= ext4_rmdir,  	.mknod		= ext4_mknod, +	.tmpfile	= ext4_tmpfile,  	.rename		= ext4_rename,  	.setattr	= ext4_setattr,  	.setxattr	= generic_setxattr, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4acf1f78881b..6625d210fb45 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -25,6 +25,7 @@  #include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/mm.h> +#include <linux/ratelimit.h>  #include "ext4_jbd2.h"  #include "xattr.h" @@ -46,46 +47,121 @@ void ext4_exit_pageio(void)  }  /* - * This function is called by ext4_evict_inode() to make sure there is - * no more pending I/O completion work left to do. + * Print an buffer I/O error compatible with the fs/buffer.c.  This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message.  We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics...   */ -void ext4_ioend_shutdown(struct inode *inode) +static void buffer_io_error(struct buffer_head *bh) +{ +	char b[BDEVNAME_SIZE]; +	printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", +			bdevname(bh->b_bdev, b), +			(unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio)  { -	wait_queue_head_t *wq = ext4_ioend_wq(inode); +	int i; +	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); -	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); -	/* -	 * We need to make sure the work structure is finished being -	 * used before we let the inode get destroyed. 
-	 */ -	if (work_pending(&EXT4_I(inode)->i_unwritten_work)) -		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); +	for (i = 0; i < bio->bi_vcnt; i++) { +		struct bio_vec *bvec = &bio->bi_io_vec[i]; +		struct page *page = bvec->bv_page; +		struct buffer_head *bh, *head; +		unsigned bio_start = bvec->bv_offset; +		unsigned bio_end = bio_start + bvec->bv_len; +		unsigned under_io = 0; +		unsigned long flags; + +		if (!page) +			continue; + +		if (error) { +			SetPageError(page); +			set_bit(AS_EIO, &page->mapping->flags); +		} +		bh = head = page_buffers(page); +		/* +		 * We check all buffers in the page under BH_Uptodate_Lock +		 * to avoid races with other end io clearing async_write flags +		 */ +		local_irq_save(flags); +		bit_spin_lock(BH_Uptodate_Lock, &head->b_state); +		do { +			if (bh_offset(bh) < bio_start || +			    bh_offset(bh) + bh->b_size > bio_end) { +				if (buffer_async_write(bh)) +					under_io++; +				continue; +			} +			clear_buffer_async_write(bh); +			if (error) +				buffer_io_error(bh); +		} while ((bh = bh->b_this_page) != head); +		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); +		local_irq_restore(flags); +		if (!under_io) +			end_page_writeback(page); +	}  } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_release_io_end(ext4_io_end_t *io_end)  { -	BUG_ON(!io); -	BUG_ON(!list_empty(&io->list)); -	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); +	struct bio *bio, *next_bio; + +	BUG_ON(!list_empty(&io_end->list)); +	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); +	WARN_ON(io_end->handle); -	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) -		wake_up_all(ext4_ioend_wq(io->inode)); -	kmem_cache_free(io_end_cachep, io); +	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) +		wake_up_all(ext4_ioend_wq(io_end->inode)); + +	for (bio = io_end->bio; bio; bio = next_bio) { +		next_bio = bio->bi_private; +		ext4_finish_bio(bio); +		bio_put(bio); +	} +	if (io_end->flag & EXT4_IO_END_DIRECT) +		
inode_dio_done(io_end->inode); +	if (io_end->iocb) +		aio_complete(io_end->iocb, io_end->result, 0); +	kmem_cache_free(io_end_cachep, io_end);  } -/* check a range of space and convert unwritten extents to written. */ +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ +	struct inode *inode = io_end->inode; + +	io_end->flag &= ~EXT4_IO_END_UNWRITTEN; +	/* Wake up anyone waiting on unwritten extent conversion */ +	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) +		wake_up_all(ext4_ioend_wq(inode)); +} + +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */  static int ext4_end_io(ext4_io_end_t *io)  {  	struct inode *inode = io->inode;  	loff_t offset = io->offset;  	ssize_t size = io->size; +	handle_t *handle = io->handle;  	int ret = 0;  	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"  		   "list->prev 0x%p\n",  		   io, inode->i_ino, io->list.next, io->list.prev); -	ret = ext4_convert_unwritten_extents(inode, offset, size); +	io->handle = NULL;	/* Following call will use up the handle */ +	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);  	if (ret < 0) {  		ext4_msg(inode->i_sb, KERN_EMERG,  			 "failed to convert unwritten extents to written " @@ -93,30 +169,22 @@ static int ext4_end_io(ext4_io_end_t *io)  			 "(inode %lu, offset %llu, size %zd, error %d)",  			 inode->i_ino, offset, size, ret);  	} -	/* Wake up anyone waiting on unwritten extent conversion */ -	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) -		wake_up_all(ext4_ioend_wq(inode)); -	if (io->flag & EXT4_IO_END_DIRECT) -		inode_dio_done(inode); -	
if (io->iocb) -		aio_complete(io->iocb, io->result, 0); +	ext4_clear_io_unwritten_flag(io); +	ext4_release_io_end(io);  	return ret;  } -static void dump_completed_IO(struct inode *inode) +static void dump_completed_IO(struct inode *inode, struct list_head *head)  {  #ifdef	EXT4FS_DEBUG  	struct list_head *cur, *before, *after;  	ext4_io_end_t *io, *io0, *io1; -	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { -		ext4_debug("inode %lu completed_io list is empty\n", -			   inode->i_ino); +	if (list_empty(head))  		return; -	} -	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); -	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { +	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); +	list_for_each_entry(io, head, list) {  		cur = &io->list;  		before = cur->prev;  		io0 = container_of(before, ext4_io_end_t, list); @@ -130,23 +198,30 @@ static void dump_completed_IO(struct inode *inode)  }  /* Add the io_end to per-inode completed end_io list. 
*/ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end)  {  	struct ext4_inode_info *ei = EXT4_I(io_end->inode);  	struct workqueue_struct *wq;  	unsigned long flags;  	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); -	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; -  	spin_lock_irqsave(&ei->i_completed_io_lock, flags); -	if (list_empty(&ei->i_completed_io_list)) -		queue_work(wq, &ei->i_unwritten_work); -	list_add_tail(&io_end->list, &ei->i_completed_io_list); +	if (io_end->handle) { +		wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; +		if (list_empty(&ei->i_rsv_conversion_list)) +			queue_work(wq, &ei->i_rsv_conversion_work); +		list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); +	} else { +		wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; +		if (list_empty(&ei->i_unrsv_conversion_list)) +			queue_work(wq, &ei->i_unrsv_conversion_work); +		list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); +	}  	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  } -static int ext4_do_flush_completed_IO(struct inode *inode) +static int ext4_do_flush_completed_IO(struct inode *inode, +				      struct list_head *head)  {  	ext4_io_end_t *io;  	struct list_head unwritten; @@ -155,8 +230,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)  	int err, ret = 0;  	spin_lock_irqsave(&ei->i_completed_io_lock, flags); -	dump_completed_IO(inode); -	list_replace_init(&ei->i_completed_io_list, &unwritten); +	dump_completed_IO(inode, head); +	list_replace_init(head, &unwritten);  	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  	while (!list_empty(&unwritten)) { @@ -167,30 +242,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)  		err = ext4_end_io(io);  		if (unlikely(!ret && err))  			ret = err; -		io->flag &= ~EXT4_IO_END_UNWRITTEN; -		ext4_free_io_end(io);  	}  	return ret;  }  /* - * work on completed aio dio IO, to convert unwritten extents to extents + * 
work on completed IO, to convert unwritten extents to extents   */ -void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_rsv_work(struct work_struct *work)  {  	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, -						  i_unwritten_work); -	ext4_do_flush_completed_IO(&ei->vfs_inode); +						  i_rsv_conversion_work); +	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);  } -int ext4_flush_unwritten_io(struct inode *inode) +void ext4_end_io_unrsv_work(struct work_struct *work)  { -	int ret; -	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && -		     !(inode->i_state & I_FREEING)); -	ret = ext4_do_flush_completed_IO(inode); -	ext4_unwritten_wait(inode); -	return ret; +	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, +						  i_unrsv_conversion_work); +	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);  }  ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -200,83 +270,59 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)  		atomic_inc(&EXT4_I(inode)->i_ioend_count);  		io->inode = inode;  		INIT_LIST_HEAD(&io->list); +		atomic_set(&io->count, 1);  	}  	return io;  } -/* - * Print an buffer I/O error compatible with the fs/buffer.c.  This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message.  We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... 
- */ -static void buffer_io_error(struct buffer_head *bh) +void ext4_put_io_end_defer(ext4_io_end_t *io_end)  { -	char b[BDEVNAME_SIZE]; -	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", -			bdevname(bh->b_bdev, b), -			(unsigned long long)bh->b_blocknr); +	if (atomic_dec_and_test(&io_end->count)) { +		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { +			ext4_release_io_end(io_end); +			return; +		} +		ext4_add_complete_io(io_end); +	} +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ +	int err = 0; + +	if (atomic_dec_and_test(&io_end->count)) { +		if (io_end->flag & EXT4_IO_END_UNWRITTEN) { +			err = ext4_convert_unwritten_extents(io_end->handle, +						io_end->inode, io_end->offset, +						io_end->size); +			io_end->handle = NULL; +			ext4_clear_io_unwritten_flag(io_end); +		} +		ext4_release_io_end(io_end); +	} +	return err;  } +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ +	atomic_inc(&io_end->count); +	return io_end; +} + +/* BIO completion function for page writeback */  static void ext4_end_bio(struct bio *bio, int error)  {  	ext4_io_end_t *io_end = bio->bi_private; -	struct inode *inode; -	int i; -	int blocksize;  	sector_t bi_sector = bio->bi_sector;  	BUG_ON(!io_end); -	inode = io_end->inode; -	blocksize = 1 << inode->i_blkbits; -	bio->bi_private = NULL;  	bio->bi_end_io = NULL;  	if (test_bit(BIO_UPTODATE, &bio->bi_flags))  		error = 0; -	for (i = 0; i < bio->bi_vcnt; i++) { -		struct bio_vec *bvec = &bio->bi_io_vec[i]; -		struct page *page = bvec->bv_page; -		struct buffer_head *bh, *head; -		unsigned bio_start = bvec->bv_offset; -		unsigned bio_end = bio_start + bvec->bv_len; -		unsigned under_io = 0; -		unsigned long flags; - -		if (!page) -			continue; - -		if (error) { -			SetPageError(page); -			set_bit(AS_EIO, &page->mapping->flags); -		} -		bh = head = page_buffers(page); -		/* -		 * We check all buffers in the page under BH_Uptodate_Lock -		 * to avoid races with other end io clearing 
async_write flags -		 */ -		local_irq_save(flags); -		bit_spin_lock(BH_Uptodate_Lock, &head->b_state); -		do { -			if (bh_offset(bh) < bio_start || -			    bh_offset(bh) + blocksize > bio_end) { -				if (buffer_async_write(bh)) -					under_io++; -				continue; -			} -			clear_buffer_async_write(bh); -			if (error) -				buffer_io_error(bh); -		} while ((bh = bh->b_this_page) != head); -		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); -		local_irq_restore(flags); -		if (!under_io) -			end_page_writeback(page); -	} -	bio_put(bio);  	if (error) { -		io_end->flag |= EXT4_IO_END_ERROR; +		struct inode *inode = io_end->inode; +  		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "  			     "(offset %llu size %ld starting block %llu)",  			     inode->i_ino, @@ -286,12 +332,23 @@ static void ext4_end_bio(struct bio *bio, int error)  			     bi_sector >> (inode->i_blkbits - 9));  	} -	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { -		ext4_free_io_end(io_end); -		return; +	if (io_end->flag & EXT4_IO_END_UNWRITTEN) { +		/* +		 * Link bio into list hanging from io_end. We have to do it +		 * atomically as bio completions can be racing against each +		 * other. +		 */ +		bio->bi_private = xchg(&io_end->bio, bio); +		ext4_put_io_end_defer(io_end); +	} else { +		/* +		 * Drop io_end reference early. Inode can get freed once +		 * we finish the bio. +		 */ +		ext4_put_io_end_defer(io_end); +		ext4_finish_bio(bio); +		bio_put(bio);  	} - -	ext4_add_complete_io(io_end);  }  void ext4_io_submit(struct ext4_io_submit *io) @@ -305,43 +362,38 @@ void ext4_io_submit(struct ext4_io_submit *io)  		bio_put(io->io_bio);  	}  	io->io_bio = NULL; -	io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, +			 struct writeback_control *wbc) +{ +	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  
WRITE_SYNC : WRITE); +	io->io_bio = NULL;  	io->io_end = NULL;  } -static int io_submit_init(struct ext4_io_submit *io, -			  struct inode *inode, -			  struct writeback_control *wbc, -			  struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, +			      struct buffer_head *bh)  { -	ext4_io_end_t *io_end; -	struct page *page = bh->b_page;  	int nvecs = bio_get_nr_vecs(bh->b_bdev);  	struct bio *bio; -	io_end = ext4_init_io_end(inode, GFP_NOFS); -	if (!io_end) -		return -ENOMEM;  	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); +	if (!bio) +		return -ENOMEM;  	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);  	bio->bi_bdev = bh->b_bdev; -	bio->bi_private = io->io_end = io_end;  	bio->bi_end_io = ext4_end_bio; - -	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - +	bio->bi_private = ext4_get_io_end(io->io_end);  	io->io_bio = bio; -	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);  	io->io_next_block = bh->b_blocknr;  	return 0;  }  static int io_submit_add_bh(struct ext4_io_submit *io,  			    struct inode *inode, -			    struct writeback_control *wbc,  			    struct buffer_head *bh)  { -	ext4_io_end_t *io_end;  	int ret;  	if (io->io_bio && bh->b_blocknr != io->io_next_block) { @@ -349,18 +401,14 @@ submit_and_retry:  		ext4_io_submit(io);  	}  	if (io->io_bio == NULL) { -		ret = io_submit_init(io, inode, wbc, bh); +		ret = io_submit_init_bio(io, bh);  		if (ret)  			return ret;  	} -	io_end = io->io_end; -	if (test_clear_buffer_uninit(bh)) -		ext4_set_io_unwritten_flag(inode, io_end); -	io->io_end->size += bh->b_size; -	io->io_next_block++;  	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));  	if (ret != bh->b_size)  		goto submit_and_retry; +	io->io_next_block++;  	return 0;  } @@ -432,7 +480,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	do {  		if (!buffer_async_write(bh))  			continue; -		ret = io_submit_add_bh(io, inode, wbc, bh); +		ret = 
io_submit_add_bh(io, inode, bh);  		if (ret) {  			/*  			 * We only get here on ENOMEM.  Not much else diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b27c96d01965..c5adbb318a90 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,  	ext4_fsblk_t end = start + input->blocks_count;  	ext4_group_t group = input->group;  	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; -	unsigned overhead = ext4_group_overhead_blocks(sb, group); -	ext4_fsblk_t metaend = start + overhead; +	unsigned overhead; +	ext4_fsblk_t metaend;  	struct buffer_head *bh = NULL;  	ext4_grpblk_t free_blocks_count, offset;  	int err = -EINVAL; +	if (group != sbi->s_groups_count) { +		ext4_warning(sb, "Cannot add at group %u (only %u groups)", +			     input->group, sbi->s_groups_count); +		return -EINVAL; +	} + +	overhead = ext4_group_overhead_blocks(sb, group); +	metaend = start + overhead;  	input->free_blocks_count = free_blocks_count =  		input->blocks_count - 2 - overhead - sbi->s_itb_per_group; @@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,  		       free_blocks_count, input->reserved_blocks);  	ext4_get_group_no_and_offset(sb, start, NULL, &offset); -	if (group != sbi->s_groups_count) -		ext4_warning(sb, "Cannot add at group %u (only %u groups)", -			     input->group, sbi->s_groups_count); -	else if (offset != 0) +	if (offset != 0)  			ext4_warning(sb, "Last group not full");  	else if (input->reserved_blocks > input->blocks_count / 5)  		ext4_warning(sb, "Reserved blocks too high (%u)", @@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)  	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?  		
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;  	struct inode *inode = NULL; -	int gdb_off, gdb_num; +	int gdb_off;  	int err;  	__u16 bg_flags = 0; -	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);  	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);  	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -1656,12 +1660,10 @@ errout:  		err = err2;  	if (!err) { -		ext4_fsblk_t first_block; -		first_block = ext4_group_first_block_no(sb, 0);  		if (test_opt(sb, DEBUG))  			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "  			       "blocks\n", ext4_blocks_count(es)); -		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, +		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,  			       (char *)es, sizeof(struct ext4_super_block), 0);  	}  	return err; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94cc84db7c9a..b59373b625e9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,  static void ext4_clear_journal_err(struct super_block *sb,  				   struct ext4_super_block *es);  static int ext4_sync_fs(struct super_block *sb, int wait); +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);  static int ext4_remount(struct super_block *sb, int *flags, char *data);  static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);  static int ext4_unfreeze(struct super_block *sb); @@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)  	}  	if (test_opt(sb, ERRORS_RO)) {  		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); +		/* +		 * Make sure updated value of ->s_mount_flags will be visible +		 * before ->s_flags update +		 */ +		smp_wmb();  		sb->s_flags |= MS_RDONLY;  	}  	if (test_opt(sb, ERRORS_PANIC)) @@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,  	ext4_handle_error(sb);  } -void ext4_error_inode(struct inode *inode, const char *function, -		      unsigned int line, ext4_fsblk_t block, 
-		      const char *fmt, ...) +void __ext4_error_inode(struct inode *inode, const char *function, +			unsigned int line, ext4_fsblk_t block, +			const char *fmt, ...)  {  	va_list args;  	struct va_format vaf; @@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,  	ext4_handle_error(inode->i_sb);  } -void ext4_error_file(struct file *file, const char *function, -		     unsigned int line, ext4_fsblk_t block, -		     const char *fmt, ...) +void __ext4_error_file(struct file *file, const char *function, +		       unsigned int line, ext4_fsblk_t block, +		       const char *fmt, ...)  {  	va_list args;  	struct va_format vaf; @@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,  	if ((sb->s_flags & MS_RDONLY) == 0) {  		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); -		sb->s_flags |= MS_RDONLY;  		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; +		/* +		 * Make sure updated value of ->s_mount_flags will be visible +		 * before ->s_flags update +		 */ +		smp_wmb(); +		sb->s_flags |= MS_RDONLY;  		if (EXT4_SB(sb)->s_journal)  			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);  		save_error_info(sb, function, line); @@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,  		panic("EXT4-fs panic from previous error\n");  } -void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +void __ext4_msg(struct super_block *sb, +		const char *prefix, const char *fmt, ...)  
{  	struct va_format vaf;  	va_list args; @@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)  	ext4_unregister_li_request(sb);  	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); -	flush_workqueue(sbi->dio_unwritten_wq); -	destroy_workqueue(sbi->dio_unwritten_wq); +	flush_workqueue(sbi->unrsv_conversion_wq); +	flush_workqueue(sbi->rsv_conversion_wq); +	destroy_workqueue(sbi->unrsv_conversion_wq); +	destroy_workqueue(sbi->rsv_conversion_wq);  	if (sbi->s_journal) {  		err = jbd2_journal_destroy(sbi->s_journal); @@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)  			ext4_abort(sb, "Couldn't clean up the journal");  	} -	ext4_es_unregister_shrinker(sb); +	ext4_es_unregister_shrinker(sbi);  	del_timer(&sbi->s_err_report);  	ext4_release_system_zone(sb);  	ext4_mb_release(sb); @@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  	rwlock_init(&ei->i_es_lock);  	INIT_LIST_HEAD(&ei->i_es_lru);  	ei->i_es_lru_nr = 0; +	ei->i_touch_when = 0;  	ei->i_reserved_data_blocks = 0;  	ei->i_reserved_meta_blocks = 0;  	ei->i_allocated_meta_blocks = 0; @@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  	ei->i_reserved_quota = 0;  #endif  	ei->jinode = NULL; -	INIT_LIST_HEAD(&ei->i_completed_io_list); +	INIT_LIST_HEAD(&ei->i_rsv_conversion_list); +	INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);  	spin_lock_init(&ei->i_completed_io_lock);  	ei->i_sync_tid = 0;  	ei->i_datasync_tid = 0;  	atomic_set(&ei->i_ioend_count, 0);  	atomic_set(&ei->i_unwritten, 0); -	INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); +	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); +	INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);  	return &ei->vfs_inode;  } @@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {  	.dirty_inode	= ext4_dirty_inode,  	.drop_inode	= ext4_drop_inode,  	.evict_inode	= ext4_evict_inode, +	.sync_fs	= 
ext4_sync_fs_nojournal,  	.put_super	= ext4_put_super,  	.statfs		= ext4_statfs,  	.remount_fs	= ext4_remount, @@ -1341,7 +1359,7 @@ static const struct mount_opts {  	{Opt_delalloc, EXT4_MOUNT_DELALLOC,  	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},  	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, -	 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, +	 MOPT_EXT4_ONLY | MOPT_CLEAR},  	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,  	 MOPT_EXT4_ONLY | MOPT_SET},  	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1684,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,  	if (sbi->s_qf_names[GRPQUOTA])  		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - -	if (test_opt(sb, USRQUOTA)) -		seq_puts(seq, ",usrquota"); - -	if (test_opt(sb, GRPQUOTA)) -		seq_puts(seq, ",grpquota");  #endif  } @@ -1908,7 +1920,6 @@ static int ext4_fill_flex_info(struct super_block *sb)  	struct ext4_sb_info *sbi = EXT4_SB(sb);  	struct ext4_group_desc *gdp = NULL;  	ext4_group_t flex_group; -	unsigned int groups_per_flex = 0;  	int i, err;  	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; @@ -1916,7 +1927,6 @@ static int ext4_fill_flex_info(struct super_block *sb)  		sbi->s_log_groups_per_flex = 0;  		return 1;  	} -	groups_per_flex = 1U << sbi->s_log_groups_per_flex;  	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);  	if (err) @@ -2164,19 +2174,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,  		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);  		dquot_initialize(inode);  		if (inode->i_nlink) { -			ext4_msg(sb, KERN_DEBUG, -				"%s: truncating inode %lu to %lld bytes", -				__func__, inode->i_ino, inode->i_size); +			if (test_opt(sb, DEBUG)) +				ext4_msg(sb, KERN_DEBUG, +					"%s: truncating inode %lu to %lld bytes", +					__func__, inode->i_ino, inode->i_size);  			jbd_debug(2, "truncating inode %lu to %lld bytes\n",  				  inode->i_ino, inode->i_size);  			mutex_lock(&inode->i_mutex); +			
truncate_inode_pages(inode->i_mapping, inode->i_size);  			ext4_truncate(inode);  			mutex_unlock(&inode->i_mutex);  			nr_truncates++;  		} else { -			ext4_msg(sb, KERN_DEBUG, -				"%s: deleting unreferenced inode %lu", -				__func__, inode->i_ino); +			if (test_opt(sb, DEBUG)) +				ext4_msg(sb, KERN_DEBUG, +					"%s: deleting unreferenced inode %lu", +					__func__, inode->i_ino);  			jbd_debug(2, "deleting unreferenced inode %lu\n",  				  inode->i_ino);  			nr_orphans++; @@ -2377,7 +2390,10 @@ struct ext4_attr {  	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);  	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,  			 const char *, size_t); -	int offset; +	union { +		int offset; +		int deprecated_val; +	} u;  };  static int parse_strtoull(const char *buf, @@ -2446,7 +2462,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,  static ssize_t sbi_ui_show(struct ext4_attr *a,  			   struct ext4_sb_info *sbi, char *buf)  { -	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); +	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);  	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);  } @@ -2455,7 +2471,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,  			    struct ext4_sb_info *sbi,  			    const char *buf, size_t count)  { -	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); +	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);  	unsigned long t;  	int ret; @@ -2504,12 +2520,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,  	return count;  } +static ssize_t sbi_deprecated_show(struct ext4_attr *a, +				   struct ext4_sb_info *sbi, char *buf) +{ +	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} +  #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \  static struct ext4_attr ext4_attr_##_name = {			\  	.attr = {.name = __stringify(_name), .mode = _mode },	\  	.show	= _show,					\  	.store	= _store,					\ -	.offset = 
offsetof(struct ext4_sb_info, _elname),	\ +	.u = {							\ +		.offset = offsetof(struct ext4_sb_info, _elname),\ +	},							\  }  #define EXT4_ATTR(name, mode, show, store) \  static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) @@ -2520,6 +2544,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)  #define EXT4_RW_ATTR_SBI_UI(name, elname)	\  	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)  #define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val)	\ +static struct ext4_attr ext4_attr_##_name = {			\ +	.attr = {.name = __stringify(_name), .mode = 0444 },	\ +	.show	= sbi_deprecated_show,				\ +	.u = {							\ +		.deprecated_val = _val,				\ +	},							\ +}  EXT4_RO_ATTR(delayed_allocation_blocks);  EXT4_RO_ATTR(session_write_kbytes); @@ -2534,7 +2566,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); @@ -3451,7 +3483,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		}  		if (test_opt(sb, DIOREAD_NOLOCK)) {  			ext4_msg(sb, KERN_ERR, "can't mount with " -				 "both data=journal and delalloc"); +				 "both data=journal and dioread_nolock");  			goto failed_mount;  		}  		if (test_opt(sb, DELALLOC)) @@ -3586,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));  	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); -	/* Do we have standard group size of blocksize * 8 blocks ? 
*/ -	if (sbi->s_blocks_per_group == blocksize << 3) -		set_opt2(sb, STD_GROUP_SIZE); -  	for (i = 0; i < 4; i++)  		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);  	sbi->s_def_hash_version = es->s_def_hash_version; @@ -3659,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		goto failed_mount;  	} +	/* Do we have standard group size of clustersize * 8 blocks ? */ +	if (sbi->s_blocks_per_group == clustersize << 3) +		set_opt2(sb, STD_GROUP_SIZE); +  	/*  	 * Test whether we have more sectors than will fit in sector_t,  	 * and whether the max offset is addressable by the page cache. @@ -3763,7 +3795,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	sbi->s_err_report.data = (unsigned long) sb;  	/* Register extent status tree shrinker */ -	ext4_es_register_shrinker(sb); +	ext4_es_register_shrinker(sbi);  	err = percpu_counter_init(&sbi->s_freeclusters_counter,  			ext4_count_free_clusters(sb)); @@ -3787,7 +3819,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	}  	sbi->s_stripe = ext4_get_stripe_size(sbi); -	sbi->s_max_writeback_mb_bump = 128;  	sbi->s_extent_max_zeroout_kb = 32;  	/* @@ -3915,12 +3946,20 @@ no_journal:  	 * The maximum number of concurrent works can be high and  	 * concurrency isn't really necessary.  Limit it to 1.  	 
*/ -	EXT4_SB(sb)->dio_unwritten_wq = -		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); -	if (!EXT4_SB(sb)->dio_unwritten_wq) { -		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); +	EXT4_SB(sb)->rsv_conversion_wq = +		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +	if (!EXT4_SB(sb)->rsv_conversion_wq) { +		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");  		ret = -ENOMEM; -		goto failed_mount_wq; +		goto failed_mount4; +	} + +	EXT4_SB(sb)->unrsv_conversion_wq = +		alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +	if (!EXT4_SB(sb)->unrsv_conversion_wq) { +		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); +		ret = -ENOMEM; +		goto failed_mount4;  	}  	/* @@ -4074,14 +4113,17 @@ failed_mount4a:  	sb->s_root = NULL;  failed_mount4:  	ext4_msg(sb, KERN_ERR, "mount failed"); -	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +	if (EXT4_SB(sb)->rsv_conversion_wq) +		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +	if (EXT4_SB(sb)->unrsv_conversion_wq) +		destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);  failed_mount_wq:  	if (sbi->s_journal) {  		jbd2_journal_destroy(sbi->s_journal);  		sbi->s_journal = NULL;  	}  failed_mount3: -	ext4_es_unregister_shrinker(sb); +	ext4_es_unregister_shrinker(sbi);  	del_timer(&sbi->s_err_report);  	if (sbi->s_flex_groups)  		ext4_kvfree(sbi->s_flex_groups); @@ -4517,19 +4559,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)  {  	int ret = 0;  	tid_t target; +	bool needs_barrier = false;  	struct ext4_sb_info *sbi = EXT4_SB(sb);  	trace_ext4_sync_fs(sb, wait); -	flush_workqueue(sbi->dio_unwritten_wq); +	flush_workqueue(sbi->rsv_conversion_wq); +	flush_workqueue(sbi->unrsv_conversion_wq);  	/*  	 * Writeback quota in non-journalled quota case - journalled quota has  	 * no dirty dquots  	 */  	dquot_writeback_dquots(sb, -1); +	/* +	 * Data writeback is possible w/o journal transaction, so barrier must +	 * being 
sent at the end of the function. But we can skip it if +	 * transaction_commit will do it for us. +	 */ +	target = jbd2_get_latest_transaction(sbi->s_journal); +	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && +	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) +		needs_barrier = true; +  	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {  		if (wait) -			jbd2_log_wait_commit(sbi->s_journal, target); +			ret = jbd2_log_wait_commit(sbi->s_journal, target);  	} +	if (needs_barrier) { +		int err; +		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); +		if (!ret) +			ret = err; +	} + +	return ret; +} + +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) +{ +	int ret = 0; + +	trace_ext4_sync_fs(sb, wait); +	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +	flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); +	dquot_writeback_dquots(sb, -1); +	if (wait && test_opt(sb, BARRIER)) +		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); +  	return ret;  } @@ -4652,6 +4727,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  		goto restore_opts;  	} +	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { +		if (test_opt2(sb, EXPLICIT_DELALLOC)) { +			ext4_msg(sb, KERN_ERR, "can't mount with " +				 "both data=journal and delalloc"); +			err = -EINVAL; +			goto restore_opts; +		} +		if (test_opt(sb, DIOREAD_NOLOCK)) { +			ext4_msg(sb, KERN_ERR, "can't mount with " +				 "both data=journal and dioread_nolock"); +			err = -EINVAL; +			goto restore_opts; +		} +	} +  	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)  		ext4_abort(sb, "Abort forced by user"); @@ -5406,6 +5496,7 @@ static void __exit ext4_exit_fs(void)  	kset_unregister(ext4_kset);  	ext4_exit_system_zone();  	ext4_exit_pageio(); +	ext4_exit_es();  }  MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 
