diff options
Diffstat (limited to 'fs/btrfs/inode.c')
 fs/btrfs/inode.c | 312 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 225 insertions(+), 87 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0117d867ecf8..487533c35ddb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@  #include <linux/sched/mm.h>  #include <linux/iomap.h>  #include <asm/unaligned.h> +#include <linux/fsverity.h>  #include "misc.h"  #include "ctree.h"  #include "disk-io.h" @@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,  			cur_size = min_t(unsigned long, compressed_size,  				       PAGE_SIZE); -			kaddr = kmap_atomic(cpage); +			kaddr = page_address(cpage);  			write_extent_buffer(leaf, kaddr, ptr, cur_size); -			kunmap_atomic(kaddr);  			i++;  			ptr += cur_size; @@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,   */  static inline bool inode_can_compress(struct btrfs_inode *inode)  { +	/* Subpage doesn't support compression yet */ +	if (inode->root->fs_info->sectorsize < PAGE_SIZE) +		return false;  	if (inode->flags & BTRFS_INODE_NODATACOW ||  	    inode->flags & BTRFS_INODE_NODATASUM)  		return false; @@ -629,7 +632,7 @@ again:  	 * inode has not been flagged as nocompress.  This flag can  	 * change at any time if we discover bad compression ratios.  	 */ -	if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { +	if (inode_need_compress(BTRFS_I(inode), start, end)) {  		WARN_ON(pages);  		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);  		if (!pages) { @@ -682,7 +685,11 @@ again:  		}  	}  cont: -	if (start == 0) { +	/* +	 * Check cow_file_range() for why we don't even try to create inline +	 * extent for subpage case. 
+	 */ +	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {  		/* lets try to make an inline extent */  		if (ret || total_in < actual_end) {  			/* we didn't compress the entire range, try @@ -973,7 +980,7 @@ retry:  			p->mapping = inode->vfs_inode.i_mapping;  			btrfs_writepage_endio_finish_ordered(inode, p, start, -							     end, 0); +							     end, false);  			p->mapping = NULL;  			extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  	inode_should_defrag(inode, start, end, num_bytes, SZ_64K); -	if (start == 0) { +	/* +	 * Due to the page size limit, for subpage we can only trigger the +	 * writeback for the dirty sectors of page, that means data writeback +	 * is doing more writeback than what we want. +	 * +	 * This is especially unexpected for some call sites like fallocate, +	 * where we only increase i_size after everything is done. +	 * This means we can trigger inline extent even if we didn't want to. +	 * So here we skip inline extent creation completely. 
+	 */ +	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {  		/* lets try to make an inline extent */  		ret = cow_file_range_inline(inode, start, end, 0,  					    BTRFS_COMPRESS_NONE, NULL); @@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)  	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>  		PAGE_SHIFT; -	/* atomic_sub_return implies a barrier */ -	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < -	    5 * SZ_1M) -		cond_wake_up_nomb(&fs_info->async_submit_wait); -  	/*  	 * ->inode could be NULL if async_chunk_start has failed to compress,  	 * in which case we don't have anything to submit, yet we need to @@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)  	 */  	if (async_chunk->inode)  		submit_compressed_extents(async_chunk); + +	/* atomic_sub_return implies a barrier */ +	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < +	    5 * SZ_1M) +		cond_wake_up_nomb(&fs_info->async_submit_wait);  }  static noinline void async_cow_free(struct btrfs_work *work) @@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page  		ret = cow_file_range_async(inode, wbc, locked_page, start, end,  					   page_started, nr_written);  	} +	ASSERT(ret <= 0);  	if (ret)  		btrfs_cleanup_ordered_extents(inode, locked_page, start,  					      end - start + 1); @@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,  	struct extent_map *split_mid = NULL;  	struct extent_map *split_post = NULL;  	int ret = 0; -	int modified;  	unsigned long flags;  	/* Sanity check */ @@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,  	ASSERT(em->len == len);  	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));  	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); +	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); +	
ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); +	ASSERT(!list_empty(&em->list));  	flags = em->flags;  	clear_bit(EXTENT_FLAG_PINNED, &em->flags); -	clear_bit(EXTENT_FLAG_LOGGING, &flags); -	modified = !list_empty(&em->list);  	/* First, replace the em with a new extent_map starting from * em->start */  	split_pre->start = em->start; @@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,  	split_pre->compress_type = em->compress_type;  	split_pre->generation = em->generation; -	replace_extent_mapping(em_tree, em, split_pre, modified); +	replace_extent_mapping(em_tree, em, split_pre, 1);  	/*  	 * Now we only have an extent_map at: @@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,  		split_mid->flags = flags;  		split_mid->compress_type = em->compress_type;  		split_mid->generation = em->generation; -		add_extent_mapping(em_tree, split_mid, modified); +		add_extent_mapping(em_tree, split_mid, 1);  	}  	if (post) { @@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,  		split_post->flags = flags;  		split_post->compress_type = em->compress_type;  		split_post->generation = em->generation; -		add_extent_mapping(em_tree, split_post, modified); +		add_extent_mapping(em_tree, split_post, 1);  	}  	/* Once for us */ @@ -2770,7 +2788,7 @@ out_page:   * to fix it up.  The async helper will wait for ordered extents, set   * the delalloc bit and make it safe to write the page.   
*/ -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page)  {  	struct inode *inode = page->mapping->host;  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,  					  struct page *page, u64 start, -					  u64 end, int uptodate) +					  u64 end, bool uptodate)  {  	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); @@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,  		return 0;  	} -	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) +	/* +	 * For subpage case, above PageChecked is not safe as it's not subpage +	 * compatible. +	 * But for now only cow fixup and compressed read utilize PageChecked +	 * flag, while in this context we can easily use io_bio->csum to +	 * determine if we really need to do csum verification. +	 * +	 * So for now, just exit if io_bio->csum is NULL, as it means it's +	 * compressed read, and its compressed data csum has already been +	 * verified. 
+	 */ +	if (io_bio->csum == NULL)  		return 0; -	if (!root->fs_info->csum_root) +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)  		return 0; -	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && -	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { -		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); +	if (!root->fs_info->csum_root)  		return 0; -	}  	ASSERT(page_offset(page) <= start &&  	       end <= page_offset(page) + PAGE_SIZE - 1);  	for (pg_off = offset_in_page(start);  	     pg_off < offset_in_page(end);  	     pg_off += sectorsize, bio_offset += sectorsize) { +		u64 file_offset = pg_off + page_offset(page);  		int ret; +		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && +		    test_range_bit(io_tree, file_offset, +				   file_offset + sectorsize - 1, +				   EXTENT_NODATASUM, 1, NULL)) { +			/* Skip the range without csum for data reloc inode */ +			clear_extent_bits(io_tree, file_offset, +					  file_offset + sectorsize - 1, +					  EXTENT_NODATASUM); +			continue; +		}  		ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,  				      page_offset(page) + pg_off);  		if (ret < 0) { @@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		/*  		 * If we have an inode with links, there are a couple of -		 * possibilities. Old kernels (before v3.12) used to create an +		 * possibilities: +		 * +		 * 1. We were halfway through creating fsverity metadata for the +		 * file. In that case, the orphan item represents incomplete +		 * fsverity metadata which must be cleaned up with +		 * btrfs_drop_verity_items and deleting the orphan item. + +		 * 2. Old kernels (before v3.12) used to create an  		 * orphan item for truncate indicating that there were possibly  		 * extent items past i_size that needed to be deleted. 
In v3.12,  		 * truncate was changed to update i_size in sync with the extent @@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		 * but either way, we can delete the orphan item.  		 */  		if (ret == -ENOENT || inode->i_nlink) { -			if (!ret) +			if (!ret) { +				ret = btrfs_drop_verity_items(BTRFS_I(inode));  				iput(inode); +				if (ret) +					goto out; +			}  			trans = btrfs_start_transaction(root, 1);  			if (IS_ERR(trans)) {  				ret = PTR_ERR(trans); @@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,  	rdev = btrfs_inode_rdev(leaf, inode_item);  	BTRFS_I(inode)->index_cnt = (u64)-1; -	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); +	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), +				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);  cache_index:  	/* @@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct inode *inode)  {  	struct btrfs_map_token token; +	u64 flags;  	btrfs_init_map_token(&token, leaf); @@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));  	btrfs_set_token_inode_transid(&token, item, trans->transid);  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); -	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); +	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, +					  BTRFS_I(inode)->ro_flags); +	btrfs_set_token_inode_flags(&token, item, flags);  	btrfs_set_token_inode_block_group(&token, item, 0);  } @@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,  	int ret;  	/* -	 * Still need to make sure the inode looks like it's been updated so -	 * that any holes get logged if we fsync. +	 * If NO_HOLES is enabled, we don't need to do anything. 
+	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() +	 * or btrfs_update_inode() will be called, which guarantee that the next +	 * fsync will know this inode was changed and needs to be logged.  	 */ -	if (btrfs_fs_incompat(fs_info, NO_HOLES)) { -		inode->last_trans = fs_info->generation; -		inode->last_sub_trans = root->log_transid; -		inode->last_log_commit = root->last_log_commit; +	if (btrfs_fs_incompat(fs_info, NO_HOLES))  		return 0; -	}  	/*  	 * 1 - for the one we're dropping @@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr  	if (btrfs_root_readonly(root))  		return -EROFS; -	err = setattr_prepare(&init_user_ns, dentry, attr); +	err = setattr_prepare(mnt_userns, dentry, attr);  	if (err)  		return err; @@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr  	}  	if (attr->ia_valid) { -		setattr_copy(&init_user_ns, inode, attr); +		setattr_copy(mnt_userns, inode, attr);  		inode_inc_iversion(inode);  		err = btrfs_dirty_inode(inode);  		if (!err && attr->ia_valid & ATTR_MODE) -			err = posix_acl_chmod(&init_user_ns, inode, -					      inode->i_mode); +			err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);  	}  	return err; @@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)  	trace_btrfs_inode_evict(inode);  	if (!root) { +		fsverity_cleanup_inode(inode);  		clear_inode(inode);  		return;  	} @@ -5604,6 +5654,7 @@ no_delete:  	 * to retry these periodically in the future.  	 
*/  	btrfs_remove_delayed_node(BTRFS_I(inode)); +	fsverity_cleanup_inode(inode);  	clear_inode(inode);  } @@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)  static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root, +				     struct user_namespace *mnt_userns,  				     struct inode *dir,  				     const char *name, int name_len,  				     u64 ref_objectid, u64 objectid, @@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	if (ret != 0)  		goto fail_unlock; -	inode_init_owner(&init_user_ns, inode, dir, mode); +	inode_init_owner(mnt_userns, inode, dir, mode);  	inode_set_bytes(inode, 0);  	inode->i_mtime = current_time(inode); @@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,  	if (err)  		goto out_unlock; -	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, -			mode, &index); +	inode = btrfs_new_inode(trans, root, mnt_userns, dir, +			dentry->d_name.name, dentry->d_name.len, +			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode);  		inode = NULL; @@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,  	if (err)  		goto out_unlock; -	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, -			mode, &index); +	inode = btrfs_new_inode(trans, root, mnt_userns, dir, +			dentry->d_name.name, dentry->d_name.len, +			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode);  		inode = NULL; @@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,  	if (err)  		goto out_fail; -	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), 
objectid, +	inode = btrfs_new_inode(trans, root, mnt_userns, dir, +			dentry->d_name.name, dentry->d_name.len, +			btrfs_ino(BTRFS_I(dir)), objectid,  			S_IFDIR | mode, &index);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode); @@ -8194,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,  	return dip;  } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, +static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,  		struct bio *dio_bio, loff_t file_offset)  { +	struct inode *inode = iter->inode;  	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	const bool raid56 = (btrfs_data_alloc_profile(fs_info) & @@ -8206,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,  	u64 start_sector;  	int async_submit = 0;  	u64 submit_len; -	int clone_offset = 0; -	int clone_len; +	u64 clone_offset = 0; +	u64 clone_len;  	u64 logical;  	int ret;  	blk_status_t status;  	struct btrfs_io_geometry geom; -	struct btrfs_dio_data *dio_data = iomap->private; +	struct btrfs_dio_data *dio_data = iter->iomap.private;  	struct extent_map *em = NULL;  	dip = btrfs_create_dio_private(dio_bio, inode, file_offset); @@ -8255,9 +8309,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,  			status = errno_to_blk_status(ret);  			goto out_err_em;  		} -		ASSERT(geom.len <= INT_MAX); -		clone_len = min_t(int, submit_len, geom.len); +		clone_len = min(submit_len, geom.len); +		ASSERT(clone_len <= UINT_MAX);  		/*  		 * This will never fail as it's passing GPF_NOFS and @@ -8401,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac)  	extent_readahead(rac);  } +/* + * For releasepage() and invalidatepage() we have a race window where + * end_page_writeback() is called but the subpage spinlock is not yet released. 
+ * If we continue to release/invalidate the page, we could cause use-after-free + * for subpage spinlock.  So this function is to spin and wait for subpage + * spinlock. + */ +static void wait_subpage_spinlock(struct page *page) +{ +	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +	struct btrfs_subpage *subpage; + +	if (fs_info->sectorsize == PAGE_SIZE) +		return; + +	ASSERT(PagePrivate(page) && page->private); +	subpage = (struct btrfs_subpage *)page->private; + +	/* +	 * This may look insane as we just acquire the spinlock and release it, +	 * without doing anything.  But we just want to make sure no one is +	 * still holding the subpage spinlock. +	 * And since the page is not dirty nor writeback, and we have page +	 * locked, the only possible way to hold a spinlock is from the endio +	 * function to clear page writeback. +	 * +	 * Here we just acquire the spinlock so that all existing callers +	 * should exit and we're safe to release/invalidate the page. +	 */ +	spin_lock_irq(&subpage->lock); +	spin_unlock_irq(&subpage->lock); +} +  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)  {  	int ret = try_release_extent_mapping(page, gfp_flags); -	if (ret == 1) + +	if (ret == 1) { +		wait_subpage_spinlock(page);  		clear_page_extent_mapped(page); +	}  	return ret;  } @@ -8469,6 +8559,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,  	 * do double ordered extent accounting on the same page.  	 
*/  	wait_on_page_writeback(page); +	wait_subpage_spinlock(page);  	/*  	 * For subpage case, we have call sites like @@ -8557,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,  		spin_unlock_irq(&inode->ordered_tree.lock);  		if (btrfs_dec_test_ordered_pending(inode, &ordered, -					cur, range_end + 1 - cur, 1)) { +						   cur, range_end + 1 - cur)) {  			btrfs_finish_ordered_io(ordered);  			/*  			 * The ordered extent has finished, now we're again @@ -8938,7 +9029,8 @@ out:   */  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  			     struct btrfs_root *new_root, -			     struct btrfs_root *parent_root) +			     struct btrfs_root *parent_root, +			     struct user_namespace *mnt_userns)  {  	struct inode *inode;  	int err; @@ -8949,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  	if (err < 0)  		return err; -	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, +	inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, +				ino, ino,  				S_IFDIR | (~current_umask() & S_IRWXUGO),  				&index);  	if (IS_ERR(inode)) @@ -8993,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->defrag_bytes = 0;  	ei->disk_i_size = 0;  	ei->flags = 0; +	ei->ro_flags = 0;  	ei->csum_bytes = 0;  	ei->index_cnt = (u64)-1;  	ei->dir_index = 0; @@ -9174,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,  	struct inode *inode = d_inode(path->dentry);  	u32 blocksize = inode->i_sb->s_blocksize;  	u32 bi_flags = BTRFS_I(inode)->flags; +	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;  	stat->result_mask |= STATX_BTIME;  	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; @@ -9186,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,  		stat->attributes |= STATX_ATTR_IMMUTABLE;  	if (bi_flags & BTRFS_INODE_NODUMP)  		stat->attributes |= STATX_ATTR_NODUMP; +	if (bi_ro_flags & BTRFS_INODE_RO_VERITY) +		stat->attributes |= 
STATX_ATTR_VERITY;  	stat->attributes_mask |= (STATX_ATTR_APPEND |  				  STATX_ATTR_COMPRESSED |  				  STATX_ATTR_IMMUTABLE |  				  STATX_ATTR_NODUMP); -	generic_fillattr(&init_user_ns, inode, stat); +	generic_fillattr(mnt_userns, inode, stat);  	stat->dev = BTRFS_I(inode)->root->anon_dev;  	spin_lock(&BTRFS_I(inode)->lock); @@ -9226,8 +9323,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,  	bool dest_log_pinned = false;  	bool need_abort = false; -	/* we only allow rename subvolume link between subvolumes */ -	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) +	/* +	 * For non-subvolumes allow exchange only within one subvolume, in the +	 * same inode namespace. Two subvolumes (represented as directory) can +	 * be exchanged as they're a logical link and have a fixed inode number. +	 */ +	if (root != dest && +	    (old_ino != BTRFS_FIRST_FREE_OBJECTID || +	     new_ino != BTRFS_FIRST_FREE_OBJECTID))  		return -EXDEV;  	/* close the race window with snapshot create/destroy ioctl */ @@ -9274,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,  		/* force full log commit if subvolume involved. */  		btrfs_set_log_full_commit(trans);  	} else { -		btrfs_pin_log_trans(root); -		root_log_pinned = true;  		ret = btrfs_insert_inode_ref(trans, dest,  					     new_dentry->d_name.name,  					     new_dentry->d_name.len, @@ -9292,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,  		/* force full log commit if subvolume involved. */  		btrfs_set_log_full_commit(trans);  	} else { -		btrfs_pin_log_trans(dest); -		dest_log_pinned = true;  		ret = btrfs_insert_inode_ref(trans, root,  					     old_dentry->d_name.name,  					     old_dentry->d_name.len, @@ -9324,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,  				BTRFS_I(new_inode), 1);  	} +	/* +	 * Now pin the logs of the roots. 
We do it to ensure that no other task +	 * can sync the logs while we are in progress with the rename, because +	 * that could result in an inconsistency in case any of the inodes that +	 * are part of this rename operation were logged before. +	 * +	 * We pin the logs even if at this precise moment none of the inodes was +	 * logged before. This is because right after we checked for that, some +	 * other task fsyncing some other inode not involved with this rename +	 * operation could log that one of our inodes exists. +	 * +	 * We don't need to pin the logs before the above calls to +	 * btrfs_insert_inode_ref(), since those don't ever need to change a log. +	 */ +	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { +		btrfs_pin_log_trans(root); +		root_log_pinned = true; +	} +	if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { +		btrfs_pin_log_trans(dest); +		dest_log_pinned = true; +	} +  	/* src is a subvolume */  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {  		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9405,8 +9527,7 @@ out_fail:  		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||  		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||  		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || -		    (new_inode && -		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) +		    btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))  			btrfs_set_log_full_commit(trans);  		if (root_log_pinned) { @@ -9430,6 +9551,7 @@ out_notrans:  static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root, +				     struct user_namespace *mnt_userns,  				     struct inode *dir,  				     struct dentry *dentry)  { @@ -9442,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,  	if (ret)  		return ret; -	inode = btrfs_new_inode(trans, root, dir, +	inode = btrfs_new_inode(trans, root, mnt_userns, dir,  				dentry->d_name.name,  				
dentry->d_name.len,  				btrfs_ino(BTRFS_I(dir)), @@ -9479,9 +9601,10 @@ out:  	return ret;  } -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -			   struct inode *new_dir, struct dentry *new_dentry, -			   unsigned int flags) +static int btrfs_rename(struct user_namespace *mnt_userns, +			struct inode *old_dir, struct dentry *old_dentry, +			struct inode *new_dir, struct dentry *new_dentry, +			unsigned int flags)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);  	struct btrfs_trans_handle *trans; @@ -9576,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  		/* force full log commit if subvolume involved. */  		btrfs_set_log_full_commit(trans);  	} else { -		btrfs_pin_log_trans(root); -		log_pinned = true;  		ret = btrfs_insert_inode_ref(trans, dest,  					     new_dentry->d_name.name,  					     new_dentry->d_name.len, @@ -9601,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {  		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);  	} else { +		/* +		 * Now pin the log. We do it to ensure that no other task can +		 * sync the log while we are in progress with the rename, as +		 * that could result in an inconsistency in case any of the +		 * inodes that are part of this rename operation were logged +		 * before. +		 * +		 * We pin the log even if at this precise moment none of the +		 * inodes was logged before. This is because right after we +		 * checked for that, some other task fsyncing some other inode +		 * not involved with this rename operation could log that one of +		 * our inodes exists. +		 * +		 * We don't need to pin the logs before the above call to +		 * btrfs_insert_inode_ref(), since that does not need to change +		 * a log. 
+		 */ +		btrfs_pin_log_trans(root); +		log_pinned = true;  		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),  					BTRFS_I(d_inode(old_dentry)),  					old_dentry->d_name.name, @@ -9654,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	}  	if (flags & RENAME_WHITEOUT) { -		ret = btrfs_whiteout_for_rename(trans, root, old_dir, -						old_dentry); +		ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, +						old_dir, old_dentry);  		if (ret) {  			btrfs_abort_transaction(trans, ret); @@ -9705,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di  		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,  					  new_dentry); -	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); +	return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, +			    new_dentry, flags);  }  struct btrfs_delalloc_work { @@ -9802,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,  			btrfs_queue_work(root->fs_info->flush_workers,  					 &work->work);  		} else { -			ret = sync_inode(inode, wbc); -			if (!ret && -			    test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -				     &BTRFS_I(inode)->runtime_flags)) -				ret = sync_inode(inode, wbc); +			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);  			btrfs_add_delayed_iput(inode);  			if (ret || wbc->nr_to_write <= 0)  				goto out; @@ -9941,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,  	if (err)  		goto out_unlock; -	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), -				objectid, S_IFLNK|S_IRWXUGO, &index); +	inode = btrfs_new_inode(trans, root, mnt_userns, dir, +				dentry->d_name.name, dentry->d_name.len, +				btrfs_ino(BTRFS_I(dir)), objectid, +				S_IFLNK | S_IRWXUGO, &index);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode);  		inode = NULL; @@ -10267,7 +10405,7 @@ static int 
btrfs_permission(struct user_namespace *mnt_userns,  		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)  			return -EACCES;  	} -	return generic_permission(&init_user_ns, inode, mask); +	return generic_permission(mnt_userns, inode, mask);  }  static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, @@ -10292,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,  	if (ret)  		goto out; -	inode = btrfs_new_inode(trans, root, dir, NULL, 0, +	inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,  			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);  	if (IS_ERR(inode)) {  		ret = PTR_ERR(inode);  | 
