 fs/dax.c              | 252
 fs/ext2/Kconfig       |   1
 fs/ext2/ext2.h        |   1
 fs/ext2/file.c        |  76
 fs/ext2/inode.c       | 100
 fs/internal.h         |  11
 fs/iomap.c            |   5
 fs/xfs/xfs_aops.c     |  31
 fs/xfs/xfs_aops.h     |   1
 fs/xfs/xfs_file.c     |  79
 fs/xfs/xfs_iomap.c    |  22
 include/linux/dax.h   |   6
 include/linux/iomap.h |   1
 13 files changed, 464 insertions(+), 122 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-			struct buffer_head *bh, void **entryp,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
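
[Editor's note: the sector arithmetic in iomap_dax_actor() above is the part that is easiest to misread, so here is a small standalone sketch of it. Every value (page size, extent start, file position, remaining length) is a made-up assumption for illustration; only the two formulas are taken from the patch.]

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	int main(void)
	{
		uint64_t iomap_offset = 0;	/* file offset where the extent starts */
		uint64_t iomap_blkno = 8;	/* 512-byte sector backing that offset */
		uint64_t pos = 12288;		/* current file position, 3 pages in */
		uint64_t length = 6000;		/* bytes left to copy in this extent */

		uint64_t offset = pos & (PAGE_SIZE - 1);
		uint64_t sector = iomap_blkno + (((pos & PAGE_MASK) - iomap_offset) >> 9);
		uint64_t size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;

		/*
		 * Prints sector=32 size=8192: three pages equal 24 sectors past
		 * blkno 8, and the 6000 remaining bytes round up to two pages.
		 */
		printf("sector=%llu size=%llu\n",
		       (unsigned long long)sector, (unsigned long long)size);
		return 0;
	}
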
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to be equal to the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;		/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
 config EXT2_FS
 	tristate "Second extended fs support"
+	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..423cc01c9d41 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	inode_lock_shared(inode);
+	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
  *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_fault(vma, vmf, ext2_get_block);
+	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_write_iter(iocb, from);
+#endif
+	return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
+	.read_iter	= ext2_file_read_iter,
+	.write_iter	= ext2_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index d5c7d09919f3..c7dbb4661119 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
 			   sector_t iblock, unsigned long maxblocks,
-			   struct buffer_head *bh_result,
+			   u32 *bno, bool *new, bool *boundary,
 			   int create)
 {
 	int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result); /* What's this do? */
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
-			clear_buffer_new(bh_result);
 			goto got_it;
 		}
 	}
@@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else
-		set_buffer_new(bh_result);
+	} else {
+		*new = true;
+	}
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	*bno = le32_to_cpu(chain[depth-1].key);
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		*boundary = true;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -765,19 +765,82 @@ cleanup:
 	return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
 {
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int ret = ext2_get_blocks(inode, iblock, max_blocks,
-			      bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+			create);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh_result, inode->i_sb, bno);
+	bh_result->b_size = (ret << inode->i_blkbits);
+	if (new)
+		set_buffer_new(bh_result);
+	if (boundary)
+		set_buffer_boundary(bh_result);
+	return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, first_block, max_blocks,
+			&bno, &new, &boundary, flags & IOMAP_WRITE);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
 	}
-	return ret;
 
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
 }
 
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+		ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_MAPPED &&
+	    written < length &&
+	    (flags & IOMAP_WRITE))
+		ext2_write_failed(inode->i_mapping, offset + length);
+	return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+	.iomap_begin		= ext2_iomap_begin,
+	.iomap_end		= ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
@@ -863,11 +926,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
-		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..859178692ce4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
diff --git a/fs/iomap.c b/fs/iomap.c
index ec411a6b9edc..013d1d36fbbf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 		struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
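
[Editor's note: with iomap_apply() now visible outside fs/iomap.c via fs/internal.h, other filesystem-internal code can walk a file range one extent at a time. The toy helper below is a sketch of that contract and is not part of the patch; it assumes nothing beyond the iomap_actor_t typedef and the iomap fields shown above.]

	/* Tally how many bytes of [pos, pos + length) are backed by an extent. */
	static loff_t
	count_mapped_actor(struct inode *inode, loff_t pos, loff_t length,
			void *data, struct iomap *iomap)
	{
		loff_t *mapped = data;

		/* iomap_apply() has already clamped length to this one mapping */
		if (iomap->type == IOMAP_MAPPED)
			*mapped += length;
		return length;	/* consumed it all; the caller's loop advances pos */
	}

	static loff_t
	count_mapped(struct inode *inode, loff_t pos, loff_t length,
			struct iomap_ops *ops)
	{
		loff_t mapped = 0, ret;

		/* same loop shape as iomap_dax_rw() above */
		while (length > 0) {
			ret = iomap_apply(inode, pos, length, 0, ops, &mapped,
					count_mapped_actor);
			if (ret <= 0)
				break;
			pos += ret;
			length -= ret;
		}
		return mapped;
	}
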
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..4a28fa91e3b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
  * Update on-disk file size now that data has been written to disk.
  */
 STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
 	struct xfs_inode	*ip,
 	struct xfs_trans	*tp,
 	xfs_off_t		offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp);
 }
 
+int
+xfs_setfilesize(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	size_t			size)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	return __xfs_setfilesize(ip, tp, offset, size);
+}
+
 STATIC int
 xfs_setfilesize_ioend(
 	struct xfs_ioend	*ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
 		return error;
 	}
 
-	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
 /*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	uintptr_t		flags = (uintptr_t)private;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
 	if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
 	} else if (flags & XFS_DIO_FLAG_APPEND) {
-		struct xfs_trans *tp;
-
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-				&tp);
-		if (!error)
-			error = xfs_setfilesize(ip, tp, offset, size);
+		error = xfs_setfilesize(ip, offset, size);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..1950e3bca2ac 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,6 +62,7 @@ int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
+int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b927ea9abe33..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -333,10 +333,7 @@ xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct iov_iter		data = *to;
+	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
 	size_t			count = iov_iter_count(to);
 	ssize_t			ret = 0;
 
@@ -346,11 +343,7 @@ xfs_file_dax_read(
 		return 0; /* skip atime */
 
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
+	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
@@ -712,70 +705,32 @@ xfs_file_dax_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	struct iov_iter		data;
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret, error = 0;
+	size_t			count;
+	loff_t			pos;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-		unaligned_io = 1;
-		iolock = XFS_IOLOCK_EXCL;
-	} else if (mapping->nrpages) {
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
 	xfs_rw_ilock(ip, iolock);
-
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. dax_do_io() should really do this invalidation internally as
-	 * it will know if we've allocated over a hole for this specific IO and
-	 * if so it needs to update the mapping tree and invalidate existing
-	 * PTEs over the newly allocated range. Remove this invalidation when
-	 * dax_do_io() is fixed up.
-	 */
-	if (mapping->nrpages) {
-		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
 
-		ret = invalidate_inode_pages2_range(mapping,
-						    iocb->ki_pos >> PAGE_SHIFT,
-						    end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
+	trace_xfs_file_dax_write(ip, count, pos);
 
-	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
+	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		error = xfs_setfilesize(ip, pos, ret);
 	}
 
-	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-	data = *from;
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-			xfs_end_io_direct_write, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
 out:
 	xfs_rw_iunlock(ip, iolock);
-	return ret;
+	return error ? error : ret;
 }
 
 STATIC ssize_t
@@ -1514,7 +1469,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1548,7 +1503,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f96c8ffce5f4..c08253e11545 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -934,11 +934,13 @@ error_on_bmapi_transaction:
 	return error;
 }
 
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+		struct xfs_bmbt_irec *imap, int nimaps)
 {
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
-		imap->br_startblock == DELAYSTARTBLOCK;
+		imap->br_startblock == DELAYSTARTBLOCK ||
+		(IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 
 static int
@@ -954,16 +956,18 @@ xfs_file_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			nimaps = 1, error = 0;
+	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if ((flags & IOMAP_WRITE) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & IOMAP_WRITE) &&
+	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 	}
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -974,11 +978,11 @@ xfs_file_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ENTIRE);
 	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return error;
 	}
 
-	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
@@ -994,17 +998,19 @@ xfs_file_iomap_begin(
 		 * xfs_iomap_write_direct() expects the shared lock. It
 		 * is unlocked on return.
 		 */
-		xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);
 		error = xfs_iomap_write_direct(ip, offset, length, &imap,
 				nimaps);
 		if (error)
 			return error;
 
+		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
 		ASSERT(nimaps);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9c6dc7704043..add6c4bc568f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -6,13 +6,19 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+struct iomap_ops;
+
 /* We use lowest available exceptional entry bit for locking */
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops);
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c74226a738a3..e63e288dee83 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -23,6 +23,7 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED	0x01	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED	0x02	/* block shared with another file */
+#define IOMAP_F_NEW	0x04	/* blocks have been newly allocated */
 
 /*
  * Magic value for blkno:
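
[Editor's note: taken together, the consumer pattern this series establishes looks like the sketch below for a hypothetical "examplefs". It simply mirrors the ext2 and XFS hunks above; every examplefs_* identifier is an assumption for illustration, not code from the patch.]

	/* extent lookup/allocation callbacks, as in ext2_iomap_begin/end above */
	static int examplefs_iomap_begin(struct inode *inode, loff_t offset,
			loff_t length, unsigned flags, struct iomap *iomap);
	static int examplefs_iomap_end(struct inode *inode, loff_t offset,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap);

	static struct iomap_ops examplefs_iomap_ops = {
		.iomap_begin	= examplefs_iomap_begin,
		.iomap_end	= examplefs_iomap_end,
	};

	static ssize_t examplefs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;
		ssize_t ret;

		/* iomap_dax_rw() leaves read/write exclusion to the caller */
		inode_lock_shared(inode);
		ret = iomap_dax_rw(iocb, to, &examplefs_iomap_ops);
		inode_unlock_shared(inode);
		return ret;
	}

	static int examplefs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		/*
		 * Caller-side locking against truncate / punch hole is assumed
		 * here, as the iomap_dax_fault() kernel-doc above requires.
		 */
		return iomap_dax_fault(vma, vmf, &examplefs_iomap_ops);
	}
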
