From 06bd3c36a733ac27962fea7d6f47168841376824 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 24 Apr 2016 00:56:03 -0400
Subject: ext4: fix data exposure after a crash

Huang has reported that in his powerfail testing he is seeing stale
block contents in some of recently allocated blocks although he mounts
ext4 in data=ordered mode. After some investigation I have found out
that indeed when delayed allocation is used, we don't add inode to
transaction's list of inodes needing flushing before commit. Originally
we were doing that but commit f3b59291a69d removed the logic with a
flawed argument that it is not needed.

The problem is that although for delayed allocated blocks we write their
contents immediately after allocating them, there is no guarantee that
the IO scheduler or device doesn't reorder things and thus transaction
allocating blocks and attaching them to inode can reach stable storage
before actual block contents. Actually whenever we attach freshly
allocated blocks to inode using a written extent, we should add inode to
transaction's ordered inode list to make sure we properly wait for block
contents to be written before committing the transaction. So that is
what we do in this patch. This also handles other cases where stale data
exposure was possible - like filling hole via mmap in
data=ordered,nodelalloc mode.

The only exception to the above rule are extending direct IO writes where
blkdev_direct_IO() waits for IO to complete before increasing i_size and
thus stale data exposure is not possible. For now we don't complicate
the code with optimizing this special case since the overhead is pretty
low. In case this is observed to be a performance problem we can always
handle it using a special flag to ext4_map_blocks().

CC: stable@vger.kernel.org
Fixes: f3b59291a69d0b734be1fc8be489fef2dd846d3d
Reported-by: "HUANG Weller (CM/ESW12-CN)" <Weller.Huang@cn.bosch.com>
Tested-by: "HUANG Weller (CM/ESW12-CN)" <Weller.Huang@cn.bosch.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 981a1fc30eaa..250c2df04a92 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -684,6 +684,21 @@ out_sem:
 		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
+
+		/*
+		 * Inodes with freshly allocated blocks where contents will be
+		 * visible after transaction commit must be on transaction's
+		 * ordered data list.
+		 */
+		if (map->m_flags & EXT4_MAP_NEW &&
+		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
+		    !IS_NOQUOTA(inode) &&
+		    ext4_should_order_data(inode)) {
+			ret = ext4_jbd2_file_inode(handle, inode);
+			if (ret)
+				return ret;
+		}
 	}
 	return retval;
 }
@@ -1289,15 +1304,6 @@ static int ext4_write_end(struct file *file,
 	int i_size_changed = 0;
 
 	trace_ext4_write_end(inode, pos, len, copied);
-	if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
-		ret = ext4_jbd2_file_inode(handle, inode);
-		if (ret) {
-			unlock_page(page);
-			put_page(page);
-			goto errout;
-		}
-	}
-
 	if (ext4_has_inline_data(inode)) {
 		ret = ext4_write_inline_data_end(inode, pos, len,
 						 copied, page);
-- 
cgit v1.2.3-70-g09d2


From 3957ef53a5033bd519b19cf375061be1929bdb5f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 24 Apr 2016 00:56:05 -0400
Subject: ext4: remove EXT4_STATE_ORDERED_MODE

This flag is just duplicating what ext4_should_order_data() tells you
and is used in a single place. Furthermore it doesn't reflect changes to
inode data journalling flag so it may be possibly misleading. Just
remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 1 -
 fs/ext4/inode.c | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 349afebe21ee..f75e9ebd4ca2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1549,7 +1549,6 @@ enum {
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
-	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 };
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 250c2df04a92..8ba46ad06aed 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3540,10 +3540,7 @@ void ext4_set_aops(struct inode *inode)
 {
 	switch (ext4_inode_journal_mode(inode)) {
 	case EXT4_INODE_ORDERED_DATA_MODE:
-		ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
-		break;
 	case EXT4_INODE_WRITEBACK_DATA_MODE:
-		ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
 		break;
 	case EXT4_INODE_JOURNAL_DATA_MODE:
 		inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3636,7 +3633,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 	} else {
 		err = 0;
 		mark_buffer_dirty(bh);
-		if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+		if (ext4_should_order_data(inode))
 			err = ext4_jbd2_file_inode(handle, inode);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From ee0876bc69ee8d24d524fb2e9e41e3682aaebb11 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 24 Apr 2016 00:56:08 -0400
Subject: ext4: do not ask jbd2 to write data for delalloc buffers

Currently we ask jbd2 to write all dirty allocated buffers before
committing a transaction when doing writeback of delay allocated blocks.
However this is unnecessary since we move all pages to writeback state
before dropping a transaction handle and then submit all the necessary
IO. We still need the transaction commit to wait for all the outstanding
writeback before flushing disk caches during transaction commit to avoid
data exposure issues though. Use the new jbd2 capability and ask it to
only wait for outstanding writeback during transaction commit when
writing back data in ext4_writepages().

Tested-by: "HUANG Weller (CM/ESW12-CN)" <Weller.Huang@cn.bosch.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        |  3 +++
 fs/ext4/ext4_jbd2.h   | 12 +++++++++++-
 fs/ext4/inode.c       | 10 +++++++---
 fs/ext4/move_extent.c |  2 +-
 4 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f75e9ebd4ca2..cb00b1119ec9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -581,6 +581,9 @@ enum {
 #define EXT4_GET_BLOCKS_ZERO			0x0200
 #define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
 					EXT4_GET_BLOCKS_ZERO)
+	/* Caller will submit data before dropping transaction handle. This
+	 * allows jbd2 to avoid submitting data before commit. */
+#define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f1c940b38b30..09c1ef38cbe6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -359,7 +359,8 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 	return 0;
 }
 
-static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+static inline int ext4_jbd2_inode_add_write(handle_t *handle,
+					    struct inode *inode)
 {
 	if (ext4_handle_valid(handle))
 		return jbd2_journal_inode_add_write(handle,
@@ -367,6 +368,15 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 	return 0;
 }
 
+static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
+					   struct inode *inode)
+{
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_inode_add_wait(handle,
+						   EXT4_I(inode)->jinode);
+	return 0;
+}
+
 static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 						 struct inode *inode,
 						 int datasync)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8ba46ad06aed..17bfa42ac971 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -695,7 +695,10 @@ out_sem:
 		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
 		    !IS_NOQUOTA(inode) &&
 		    ext4_should_order_data(inode)) {
-			ret = ext4_jbd2_file_inode(handle, inode);
+			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+				ret = ext4_jbd2_inode_add_wait(handle, inode);
+			else
+				ret = ext4_jbd2_inode_add_write(handle, inode);
 			if (ret)
 				return ret;
 		}
@@ -2319,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	 * the data was copied into the page cache.
 	 */
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
+			   EXT4_GET_BLOCKS_METADATA_NOFAIL |
+			   EXT4_GET_BLOCKS_IO_SUBMIT;
 	dioread_nolock = ext4_should_dioread_nolock(inode);
 	if (dioread_nolock)
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -3634,7 +3638,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 		err = 0;
 		mark_buffer_dirty(bh);
 		if (ext4_should_order_data(inode))
-			err = ext4_jbd2_file_inode(handle, inode);
+			err = ext4_jbd2_inode_add_write(handle, inode);
 	}
 
 unlock:
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 325cef48b39a..a920c5d29fac 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -400,7 +400,7 @@ data_copy:
 
 	/* Even in case of data=writeback it is reasonable to pin
 	 * inode to transaction, to prevent unexpected data loss */
-	*err = ext4_jbd2_file_inode(handle, orig_inode);
+	*err = ext4_jbd2_inode_add_write(handle, orig_inode);
 
 unlock_pages:
 	unlock_page(pagep[0]);
-- 
cgit v1.2.3-70-g09d2


From 4c54659269ecb799133758330e7ea2a6fa4c65ca Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Mon, 25 Apr 2016 23:21:00 -0400
Subject: ext4: handle unwritten or delalloc buffers before enabling data
 journaling

We already allocate delalloc blocks before changing the inode mode into
"per-file data journal" mode to prevent delalloc blocks from remaining
not allocated, but another issue concerned with "BH_Unwritten" status
still exists. For example, by fallocate(), several buffers' status
change into "BH_Unwritten", but these buffers cannot be processed by
ext4_alloc_da_blocks(). So, they still remain in unwritten status after
per-file data journaling is enabled and they cannot be changed into
written status any more and, if they are journaled and eventually
checkpointed, these unwritten buffer will cause a kernel panic by the
below BUG_ON() function of submit_bh_wbc() when they are submitted
during checkpointing.

static int submit_bh_wbc(int rw, struct buffer_head *bh,...
{
        ...
        BUG_ON(buffer_unwritten(bh));

Moreover, when "dioread_nolock" option is enabled, the status of a
buffer is changed into "BH_Unwritten" after write_begin() completes and
the "BH_Unwritten" status will be cleared after I/O is done. Therefore,
if a buffer's status is changed into unwrutten but the buffer's I/O is
not submitted and completed, it can cause the same problem after
enabling per-file data journaling. You can easily generate this bug by
executing the following command.

./kvm-xfstests -C 10000 -m nodelalloc,dioread_nolock generic/269

To resolve these problems and define a boundary between the previous
mode and per-file data journaling mode, we need to flush and wait all
the I/O of buffers of a file before enabling per-file data journaling
of the file.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17bfa42ac971..779ef4c11bc1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5452,22 +5452,29 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		return 0;
 	if (is_journal_aborted(journal))
 		return -EROFS;
-	/* We have to allocate physical blocks for delalloc blocks
-	 * before flushing journal. otherwise delalloc blocks can not
-	 * be allocated any more. even more truncate on delalloc blocks
-	 * could trigger BUG by flushing delalloc blocks in journal.
-	 * There is no delalloc block in non-journal data mode.
-	 */
-	if (val && test_opt(inode->i_sb, DELALLOC)) {
-		err = ext4_alloc_da_blocks(inode);
-		if (err < 0)
-			return err;
-	}
 
 	/* Wait for all existing dio workers */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
+	/*
+	 * Before flushing the journal and switching inode's aops, we have
+	 * to flush all dirty data the inode has. There can be outstanding
+	 * delayed allocations, there can be unwritten extents created by
+	 * fallocate or buffered writes in dioread_nolock mode covered by
+	 * dirty data which can be converted only after flushing the dirty
+	 * data (and journalled aops don't know how to handle these cases).
+	 */
+	if (val) {
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+		err = filemap_write_and_wait(inode->i_mapping);
+		if (err < 0) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			ext4_inode_resume_unlocked_dio(inode);
+			return err;
+		}
+	}
+
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -5492,6 +5499,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
+	if (val)
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 
 	/* Finally we can mark the inode as dirty. */
-- 
cgit v1.2.3-70-g09d2


From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Mon, 25 Apr 2016 23:22:35 -0400
Subject: ext4: fix races between changing inode journal mode and
 ext4_writepages

In ext4, there is a race condition between changing inode journal mode
and ext4_writepages(). While ext4_writepages() is executed on a
non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan
Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h                |  4 ++++
 fs/ext4/inode.c               | 15 ++++++++++++---
 fs/ext4/super.c               |  4 ++++
 kernel/locking/percpu-rwsem.c |  1 +
 4 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cb00b1119ec9..ba5aecc07fbc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -1508,6 +1509,9 @@ struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* Barrier between changing inodes' journal flags and writepages ops. */
+	struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 779ef4c11bc1..4d8ebbe00456 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;
 
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						   wbc);
+	if (dax_mapping(mapping)) {
+		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+						  wbc);
+		goto out_writepages;
+	}
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2786,6 +2789,7 @@ retry:
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
 	return ret;
 }
 
@@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	journal_t *journal;
 	handle_t *handle;
 	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 	/*
 	 * We have to be very careful here: changing a data block's
@@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		}
 	}
 
+	percpu_down_write(&sbi->s_journal_flag_rwsem);
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
+			percpu_up_write(&sbi->s_journal_flag_rwsem);
 			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
@@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
+	percpu_up_write(&sbi->s_journal_flag_rwsem);
+
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712dbe12..20c5d52253b4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ no_journal:
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
+	if (!err)
+		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount6;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..bec0b647f9cc 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely
-- 
cgit v1.2.3-70-g09d2


From 7cb476f834d0dc2092e04eeafe9cbe509f6380eb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 13 May 2016 00:38:16 -0400
Subject: ext4: handle transient ENOSPC properly for DAX

ext4_dax_get_blocks() was accidentally omitted fixing get blocks
handlers to properly handle transient ENOSPC errors. Fix it now to use
ext4_get_blocks_trans() helper which takes care of these errors.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 75 +++++++++++++++------------------------------------------
 1 file changed, 20 insertions(+), 55 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d8ebbe00456..32825dee81d4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3232,72 +3232,37 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 			    struct buffer_head *bh_result, int create)
 {
-	int ret, err;
-	int credits;
-	struct ext4_map_blocks map;
-	handle_t *handle = NULL;
-	int flags = 0;
+	int ret;
 
 	ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	map.m_lblk = iblock;
-	map.m_len = bh_result->b_size >> inode->i_blkbits;
-	credits = ext4_chunk_trans_blocks(inode, map.m_len);
-	if (create) {
-		flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			return ret;
-		}
-	}
+	if (!create)
+		return _ext4_get_block(inode, iblock, bh_result, 0);
 
-	ret = ext4_map_blocks(handle, inode, &map, flags);
-	if (create) {
-		err = ext4_journal_stop(handle);
-		if (ret >= 0 && err < 0)
-			ret = err;
-	}
-	if (ret <= 0)
-		goto out;
-	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-		int err2;
+	ret = ext4_get_block_trans(inode, iblock, bh_result,
+				   EXT4_GET_BLOCKS_PRE_IO |
+				   EXT4_GET_BLOCKS_CREATE_ZERO);
+	if (ret < 0)
+		return ret;
 
+	if (buffer_unwritten(bh_result)) {
 		/*
 		 * We are protected by i_mmap_sem so we know block cannot go
 		 * away from under us even though we dropped i_data_sem.
 		 * Convert extent to written and write zeros there.
-		 *
-		 * Note: We may get here even when create == 0.
 		 */
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-
-		err = ext4_map_blocks(handle, inode, &map,
-		      EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
-		if (err < 0)
-			ret = err;
-		err2 = ext4_journal_stop(handle);
-		if (err2 < 0 && ret > 0)
-			ret = err2;
-	}
-out:
-	WARN_ON_ONCE(ret == 0 && create);
-	if (ret > 0) {
-		map_bh(bh_result, inode->i_sb, map.m_pblk);
-		/*
-		 * At least for now we have to clear BH_New so that DAX code
-		 * doesn't attempt to zero blocks again in a racy way.
-		 */
-		map.m_flags &= ~EXT4_MAP_NEW;
-		ext4_update_bh_state(bh_result, map.m_flags);
-		bh_result->b_size = map.m_len << inode->i_blkbits;
-		ret = 0;
+		ret = ext4_get_block_trans(inode, iblock, bh_result,
+					   EXT4_GET_BLOCKS_CONVERT |
+					   EXT4_GET_BLOCKS_CREATE_ZERO);
+		if (ret < 0)
+			return ret;
 	}
-	return ret;
+	/*
+	 * At least for now we have to clear BH_New so that DAX code
+	 * doesn't attempt to zero blocks again in a racy way.
+	 */
+	clear_buffer_new(bh_result);
+	return 0;
 }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 914f82a32d026884743fb3de9f6f0a5908a9d5dd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 13 May 2016 00:44:16 -0400
Subject: ext4: refactor direct IO code

Currently ext4 direct IO handling is split between ext4_ext_direct_IO()
and ext4_ind_direct_IO(). However the extent based function calls into
the indirect based one for some cases and for example it is not able to
handle file extending. Previously it was not also properly handling
retries in case of ENOSPC errors. With DAX things would get even more
contrieved so just refactor the direct IO code and instead of indirect /
extent split do the split to read vs writes.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h     |   2 -
 fs/ext4/indirect.c | 127 ---------------------------------------------------
 fs/ext4/inode.c    | 131 ++++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 114 insertions(+), 146 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ba5aecc07fbc..89e1bcb21341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 				struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de5..bc15c2c17633 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -648,133 +648,6 @@ out:
 	return err;
 }
 
-/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			   loff_t offset)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	handle_t *handle;
-	ssize_t ret;
-	int orphan = 0;
-	size_t count = iov_iter_count(iter);
-	int retries = 0;
-
-	if (iov_iter_rw(iter) == WRITE) {
-		loff_t final_size = offset + count;
-
-		if (final_size > inode->i_size) {
-			/* Credits for sb + inode write */
-			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				goto out;
-			}
-			ret = ext4_orphan_add(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
-				goto out;
-			}
-			orphan = 1;
-			ei->i_disksize = inode->i_size;
-			ext4_journal_stop(handle);
-		}
-	}
-
-retry:
-	if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
-		/*
-		 * Nolock dioread optimization may be dynamically disabled
-		 * via ext4_inode_block_unlocked_dio(). Check inode's state
-		 * while holding extra i_dio_count ref.
-		 */
-		inode_dio_begin(inode);
-		smp_mb();
-		if (unlikely(ext4_test_inode_state(inode,
-						    EXT4_STATE_DIOREAD_LOCK))) {
-			inode_dio_end(inode);
-			goto locked;
-		}
-		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
-					ext4_dio_get_block, NULL, 0);
-		else
-			ret = __blockdev_direct_IO(iocb, inode,
-						   inode->i_sb->s_bdev, iter,
-						   offset, ext4_dio_get_block,
-						   NULL, NULL, 0);
-		inode_dio_end(inode);
-	} else {
-locked:
-		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
-					ext4_dio_get_block, NULL, DIO_LOCKING);
-		else
-			ret = blockdev_direct_IO(iocb, inode, iter, offset,
-						 ext4_dio_get_block);
-
-		if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-			loff_t isize = i_size_read(inode);
-			loff_t end = offset + count;
-
-			if (end > isize)
-				ext4_truncate_failed_write(inode);
-		}
-	}
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			/* This is really bad luck. We've written the data
-			 * but cannot extend i_size. Bail out and pretend
-			 * the write failed... */
-			ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
-
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size) {
-				ei->i_disksize = end;
-				i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace.  So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate a new block at @lblocks for non extent file based file
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 32825dee81d4..4879e93c91d3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 }
 
 /*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
  * preallocated extents, and those write extend the file, no need to
  * fall back to buffered IO.
  *
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * if the machine crashes during the write.
  *
  */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
+				    loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	ssize_t ret;
 	size_t count = iov_iter_count(iter);
 	int overwrite = 0;
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
 	loff_t final_size = offset + count;
+	int orphan = 0;
+	handle_t *handle;
 
-	/* Use the old path for reads and writes beyond i_size. */
-	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-		return ext4_ind_direct_IO(iocb, iter, offset);
+	if (final_size > inode->i_size) {
+		/* Credits for sb + inode write */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+		orphan = 1;
+		ei->i_disksize = inode->i_size;
+		ext4_journal_stop(handle);
+	}
 
 	BUG_ON(iocb->private == NULL);
 
@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * conversion. This also disallows race between truncate() and
 	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
 	 */
-	if (iov_iter_rw(iter) == WRITE)
-		inode_dio_begin(inode);
+	inode_dio_begin(inode);
 
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		inode_unlock(inode);
 
 	/*
-	 * We could direct write to holes and fallocate.
+	 * For extent mapped files we could direct write to holes and fallocate.
 	 *
 	 * Allocated blocks to fill the hole are marked as unwritten to prevent
 	 * parallel buffered read to expose the stale data before DIO complete
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	iocb->private = NULL;
 	if (overwrite)
 		get_block_func = ext4_dio_get_block_overwrite;
-	else if (is_sync_kiocb(iocb)) {
+	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+		 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+		get_block_func = ext4_dio_get_block;
+		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+	} else if (is_sync_kiocb(iocb)) {
 		get_block_func = ext4_dio_get_block_unwritten_sync;
 		dio_flags = DIO_LOCKING;
 	} else {
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
-	if (IS_DAX(inode))
+	if (IS_DAX(inode)) {
+		dio_flags &= ~DIO_SKIP_HOLES;
 		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
 				ext4_end_io_dio, dio_flags);
-	else
+	} else
 		ret = __blockdev_direct_IO(iocb, inode,
 					   inode->i_sb->s_bdev, iter, offset,
 					   get_block_func,
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 	}
 
-	if (iov_iter_rw(iter) == WRITE)
-		inode_dio_end(inode);
+	inode_dio_end(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
 	if (overwrite)
 		inode_lock(inode);
 
+	if (ret < 0 && final_size > inode->i_size)
+		ext4_truncate_failed_write(inode);
+
+	/* Handle extending of i_size after direct IO write */
+	if (orphan) {
+		int err;
+
+		/* Credits for sb + inode write */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			/* This is really bad luck. We've written the data
+			 * but cannot extend i_size. Bail out and pretend
+			 * the write failed... */
+			ret = PTR_ERR(handle);
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+
+			goto out;
+		}
+		if (inode->i_nlink)
+			ext4_orphan_del(handle, inode);
+		if (ret > 0) {
+			loff_t end = offset + ret;
+			if (end > inode->i_size) {
+				ei->i_disksize = end;
+				i_size_write(inode, end);
+				/*
+				 * We're going to return a positive `ret'
+				 * here due to non-zero-length I/O, so there's
+				 * no way of reporting error returns from
+				 * ext4_mark_inode_dirty() to userspace.  So
+				 * ignore it.
+				 */
+				ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+		err = ext4_journal_stop(handle);
+		if (ret == 0)
+			ret = err;
+	}
+out:
+	return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
+				   loff_t offset)
+{
+	int unlocked = 0;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * Nolock dioread optimization may be dynamically disabled
+		 * via ext4_inode_block_unlocked_dio(). Check inode's state
+		 * while holding extra i_dio_count ref.
+		 */
+		inode_dio_begin(inode);
+		smp_mb();
+		if (unlikely(ext4_test_inode_state(inode,
+						    EXT4_STATE_DIOREAD_LOCK)))
+			inode_dio_end(inode);
+		else
+			unlocked = 1;
+	}
+	if (IS_DAX(inode)) {
+		ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
+				NULL, unlocked ? 0 : DIO_LOCKING);
+	} else {
+		ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+					   iter, offset, ext4_dio_get_block,
+					   NULL, NULL,
+					   unlocked ? 0 : DIO_LOCKING);
+	}
+	if (unlocked)
+		inode_dio_end(inode);
 	return ret;
 }
 
@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		return 0;
 
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ret = ext4_ext_direct_IO(iocb, iter, offset);
+	if (iov_iter_rw(iter) == READ)
+		ret = ext4_direct_IO_read(iocb, iter, offset);
 	else
-		ret = ext4_ind_direct_IO(iocb, iter, offset);
+		ret = ext4_direct_IO_write(iocb, iter, offset);
 	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 12735f881952c32b31bc4e433768f18489f79ec9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 13 May 2016 00:51:15 -0400
Subject: ext4: pre-zero allocated blocks for DAX IO

Currently ext4 treats DAX IO the same way as direct IO. I.e., it
allocates unwritten extents before IO is done and converts unwritten
extents afterwards. However this way DAX IO can race with page fault to
the same area:

ext4_ext_direct_IO()				dax_fault()
  dax_io()
    get_block() - allocates unwritten extent
    copy_from_iter_pmem()
						  get_block() - converts
						    unwritten block to
						    written and zeroes it
						    out
  ext4_convert_unwritten_extents()

So data written with DAX IO gets lost. Similarly dax_new_buf() called
from dax_io() can overwrite data that has been already written to the
block via mmap.

Fix the problem by using pre-zeroed blocks for DAX IO the same way as we
use them for DAX mmap. The downside of this solution is that every
allocating write writes each block twice (once zeros, once data). Fixing
the race with locking is possible as well however we would need to
lock-out faults for the whole range written to by DAX IO. And that is
not easy to do without locking-out faults for the whole file which seems
too aggressive.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 11 +++++++++--
 fs/ext4/file.c  |  4 ++--
 fs/ext4/inode.c | 43 +++++++++++++++++++++++++++++++++----------
 3 files changed, 44 insertions(+), 14 deletions(-)

(limited to 'fs/ext4/inode.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 89e1bcb21341..b84aa1ca480a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2527,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 			     struct buffer_head *bh_result, int create);
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-			    struct buffer_head *bh_result, int create);
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+		       struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
 int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -3334,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 	}
 }
 
+static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+{
+	int blksize = 1 << inode->i_blkbits;
+
+	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+}
+
 #endif	/* __KERNEL__ */
 
 #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3e850b988923..37e28082885a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
 	else
-		result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
+		result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
 
 	if (write) {
 		if (!IS_ERR(handle))
@@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		result = VM_FAULT_SIGBUS;
 	else
 		result = __dax_pmd_fault(vma, addr, pmd, flags,
-				ext4_dax_mmap_get_block, NULL);
+					 ext4_dax_get_block, NULL);
 
 	if (write) {
 		if (!IS_ERR(handle))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4879e93c91d3..f9ab1e8cc416 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3229,13 +3229,17 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 #ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-			    struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+		       struct buffer_head *bh_result, int create)
 {
 	int ret;
 
-	ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
-		   inode->i_ino, create);
+	ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
 	if (!create)
 		return _ext4_get_block(inode, iblock, bh_result, 0);
 
@@ -3247,9 +3251,9 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 
 	if (buffer_unwritten(bh_result)) {
 		/*
-		 * We are protected by i_mmap_sem so we know block cannot go
-		 * away from under us even though we dropped i_data_sem.
-		 * Convert extent to written and write zeros there.
+		 * We are protected by i_mmap_sem or i_mutex so we know block
+		 * cannot go away from under us even though we dropped
+		 * i_data_sem. Convert extent to written and write zeros there.
 		 */
 		ret = ext4_get_block_trans(inode, iblock, bh_result,
 					   EXT4_GET_BLOCKS_CONVERT |
@@ -3264,6 +3268,14 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 	clear_buffer_new(bh_result);
 	return 0;
 }
+#else
+/* Just define empty function, it will never get called. */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+		       struct buffer_head *bh_result, int create)
+{
+	BUG();
+	return 0;
+}
 #endif
 
 static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -3385,8 +3397,20 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
 	iocb->private = NULL;
 	if (overwrite)
 		get_block_func = ext4_dio_get_block_overwrite;
-	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+	else if (IS_DAX(inode)) {
+		/*
+		 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+		 * writes need zeroing either because they can race with page
+		 * faults or because they use partial blocks.
+		 */
+		if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+		    ext4_aligned_io(inode, offset, count))
+			get_block_func = ext4_dio_get_block;
+		else
+			get_block_func = ext4_dax_get_block;
+		dio_flags = DIO_LOCKING;
+	} else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+		   round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
 		get_block_func = ext4_dio_get_block;
 		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
 	} else if (is_sync_kiocb(iocb)) {
@@ -3400,7 +3424,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
 	if (IS_DAX(inode)) {
-		dio_flags &= ~DIO_SKIP_HOLES;
 		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
 				ext4_end_io_dio, dio_flags);
 	} else
-- 
cgit v1.2.3-70-g09d2